From c40e45b75eb76d79a05c7fa85c1fa9b5c728a12f Mon Sep 17 00:00:00 2001
From: Sebastian Huber
Date: Fri, 7 Oct 2016 15:10:20 +0200
Subject: Update to FreeBSD head 2016-08-23

Git mirror commit 9fe7c416e6abb28b1398fd3e5687099846800cfd.
---
 freebsd/sys/netinet/accf_dns.c               |    6 +-
 freebsd/sys/netinet/accf_http.c              |   11 +-
 freebsd/sys/netinet/cc.h                     |  167 --
 freebsd/sys/netinet/cc/cc.c                  |   15 +-
 freebsd/sys/netinet/cc/cc.h                  |  178 ++
 freebsd/sys/netinet/cc/cc_newreno.c          |   18 +-
 freebsd/sys/netinet/icmp6.h                  |   91 +-
 freebsd/sys/netinet/icmp_var.h               |   21 +-
 freebsd/sys/netinet/if_atm.c                 |    6 +-
 freebsd/sys/netinet/if_atm.h                 |    2 +-
 freebsd/sys/netinet/if_ether.c               | 1122 ++++++---
 freebsd/sys/netinet/if_ether.h               |   32 +-
 freebsd/sys/netinet/igmp.c                   |  462 ++--
 freebsd/sys/netinet/igmp_var.h               |   88 +-
 freebsd/sys/netinet/in.c                     | 1782 ++++++-------
 freebsd/sys/netinet/in.h                     |  211 +-
 freebsd/sys/netinet/in_fib.c                 |  235 ++
 freebsd/sys/netinet/in_fib.h                 |   61 +
 freebsd/sys/netinet/in_gif.c                 |  421 +---
 freebsd/sys/netinet/in_kdtrace.h             |   72 +
 freebsd/sys/netinet/in_mcast.c               |  149 +-
 freebsd/sys/netinet/in_pcb.c                 |  386 ++-
 freebsd/sys/netinet/in_pcb.h                 |  137 +-
 freebsd/sys/netinet/in_proto.c               |   37 +-
 freebsd/sys/netinet/in_rmx.c                 |  377 +--
 freebsd/sys/netinet/in_rss.h                 |   57 +
 freebsd/sys/netinet/in_systm.h               |   16 +-
 freebsd/sys/netinet/in_var.h                 |  155 +-
 freebsd/sys/netinet/ip.h                     |   25 +-
 freebsd/sys/netinet/ip6.h                    |    6 -
 freebsd/sys/netinet/ip_carp.c                | 2691 +++++++++-----------
 freebsd/sys/netinet/ip_carp.h                |   62 +-
 freebsd/sys/netinet/ip_divert.c              |   59 +-
 freebsd/sys/netinet/ip_dummynet.h            |   32 +-
 freebsd/sys/netinet/ip_ecn.h                 |    4 -
 freebsd/sys/netinet/ip_encap.c               |   68 +-
 freebsd/sys/netinet/ip_encap.h               |    2 +-
 freebsd/sys/netinet/ip_fastfwd.c             |  181 +-
 freebsd/sys/netinet/ip_fw.h                  |  469 +++-
 freebsd/sys/netinet/ip_gre.c                 |  354 +--
 freebsd/sys/netinet/ip_icmp.c                |  248 +-
 freebsd/sys/netinet/ip_icmp.h                |   12 +-
 freebsd/sys/netinet/ip_id.c                  |  260 +-
 freebsd/sys/netinet/ip_input.c               | 1046 +++----
 freebsd/sys/netinet/ip_ipsec.h               |    4 +-
 freebsd/sys/netinet/ip_mroute.c              |  136 +-
 freebsd/sys/netinet/ip_mroute.h              |   29 +-
 freebsd/sys/netinet/ip_options.c             |  116 +-
 freebsd/sys/netinet/ip_options.h             |    5 +-
 freebsd/sys/netinet/ip_output.c              |  614 +++--
 freebsd/sys/netinet/ip_reass.c               |  660 +++++
 freebsd/sys/netinet/ip_var.h                 |  116 +-
 freebsd/sys/netinet/libalias/alias.c         |   32 +-
 freebsd/sys/netinet/libalias/alias_cuseeme.c |   36 +-
 freebsd/sys/netinet/libalias/alias_db.c      |   21 +-
 freebsd/sys/netinet/libalias/alias_dummy.c   |   42 +-
 freebsd/sys/netinet/libalias/alias_irc.c     |   24 +-
 freebsd/sys/netinet/libalias/alias_local.h   |    2 +-
 freebsd/sys/netinet/libalias/alias_mod.c     |  192 +-
 freebsd/sys/netinet/libalias/alias_mod.h     |  138 +-
 freebsd/sys/netinet/libalias/alias_nbt.c     |   54 +-
 freebsd/sys/netinet/libalias/alias_pptp.c    |   60 +-
 freebsd/sys/netinet/libalias/alias_sctp.h    |    1 -
 freebsd/sys/netinet/libalias/alias_skinny.c  |   20 +-
 freebsd/sys/netinet/libalias/alias_smedia.c  |   22 +-
 freebsd/sys/netinet/pim_var.h                |   35 +-
 freebsd/sys/netinet/raw_ip.c                 |  106 +-
 freebsd/sys/netinet/sctp.h                   |   36 +-
 freebsd/sys/netinet/sctp_asconf.c            |  221 +-
 freebsd/sys/netinet/sctp_auth.c              |   70 +-
 freebsd/sys/netinet/sctp_auth.h              |    3 +-
 freebsd/sys/netinet/sctp_bsd_addr.c          |   21 +-
 freebsd/sys/netinet/sctp_cc_functions.c      |  145 +-
 freebsd/sys/netinet/sctp_constants.h         |   83 +-
 freebsd/sys/netinet/sctp_dtrace_declare.h    |    1 -
 freebsd/sys/netinet/sctp_dtrace_define.h     |  177 +-
 freebsd/sys/netinet/sctp_header.h            |   86 +-
 freebsd/sys/netinet/sctp_indata.c            | 3491 ++++++++++++++------------
 freebsd/sys/netinet/sctp_indata.h            |   26 +-
 freebsd/sys/netinet/sctp_input.c             |  956 +++---
 freebsd/sys/netinet/sctp_input.h             |    4 +-
 freebsd/sys/netinet/sctp_lock_bsd.h          |    2 +-
 freebsd/sys/netinet/sctp_os_bsd.h            |   84 +-
 freebsd/sys/netinet/sctp_output.c            | 2332 +++++++++--------
 freebsd/sys/netinet/sctp_output.h            |   22 +-
 freebsd/sys/netinet/sctp_pcb.c               |  653 +++--
 freebsd/sys/netinet/sctp_pcb.h               |   33 +-
 freebsd/sys/netinet/sctp_peeloff.c           |    9 +-
 freebsd/sys/netinet/sctp_structs.h           |  144 +-
 freebsd/sys/netinet/sctp_sysctl.c            |  692 ++---
 freebsd/sys/netinet/sctp_sysctl.h            |   90 +-
 freebsd/sys/netinet/sctp_timer.c             |   72 +-
 freebsd/sys/netinet/sctp_timer.h             |    4 -
 freebsd/sys/netinet/sctp_uio.h               |   76 +-
 freebsd/sys/netinet/sctp_usrreq.c            | 2231 +++++++++++-----
 freebsd/sys/netinet/sctp_var.h               |   93 +-
 freebsd/sys/netinet/sctputil.c               | 1339 ++++++----
 freebsd/sys/netinet/sctputil.h               |   59 +-
 freebsd/sys/netinet/tcp.h                    |   16 +-
 freebsd/sys/netinet/tcp_debug.c              |    3 +-
 freebsd/sys/netinet/tcp_hostcache.c          |   81 +-
 freebsd/sys/netinet/tcp_hostcache.h          |    2 +-
 freebsd/sys/netinet/tcp_input.c              | 1134 +++++---
 freebsd/sys/netinet/tcp_lro.c                |  458 +++-
 freebsd/sys/netinet/tcp_lro.h                |   45 +-
 freebsd/sys/netinet/tcp_offload.c            |    3 +-
 freebsd/sys/netinet/tcp_output.c             |  445 +++-
 freebsd/sys/netinet/tcp_reass.c              |   80 +-
 freebsd/sys/netinet/tcp_sack.c               |   39 +-
 freebsd/sys/netinet/tcp_subr.c               | 1387 +++++++--
 freebsd/sys/netinet/tcp_syncache.c           |  995 +++++--
 freebsd/sys/netinet/tcp_syncache.h           |   47 +-
 freebsd/sys/netinet/tcp_timer.c              |  583 +++--
 freebsd/sys/netinet/tcp_timer.h              |   39 +-
 freebsd/sys/netinet/tcp_timewait.c           |  225 +-
 freebsd/sys/netinet/tcp_usrreq.c             |  568 +++--
 freebsd/sys/netinet/tcp_var.h                |  490 ++--
 freebsd/sys/netinet/udp_usrreq.c             |  584 ++++-
 freebsd/sys/netinet/udp_var.h                |  109 +-
 freebsd/sys/netinet/udplite.h                |   38 +
 120 files changed, 20169 insertions(+), 14756 deletions(-)
 delete mode 100644 freebsd/sys/netinet/cc.h
 create mode 100644 freebsd/sys/netinet/cc/cc.h
 create mode 100644 freebsd/sys/netinet/in_fib.c
 create mode 100644 freebsd/sys/netinet/in_fib.h
 create mode 100644 freebsd/sys/netinet/in_kdtrace.h
 create mode 100644 freebsd/sys/netinet/in_rss.h
 create mode 100644 freebsd/sys/netinet/ip_reass.c
 create mode 100644 freebsd/sys/netinet/udplite.h

diff --git a/freebsd/sys/netinet/accf_dns.c b/freebsd/sys/netinet/accf_dns.c
index 9858db4e..b6d2ff63 100644
--- a/freebsd/sys/netinet/accf_dns.c
+++ b/freebsd/sys/netinet/accf_dns.c
@@ -77,7 +77,7 @@ sohasdns(struct socket *so, void *arg, int waitflag)
 {
 	struct sockbuf *sb = &so->so_rcv;
 
 	/* If the socket is full, we're ready. */
-	if (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
+	if (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax)
 		goto ready;
 
 	/* Check to see if we have a request.
*/ @@ -117,14 +117,14 @@ skippacket(struct sockbuf *sb) { unsigned long packlen; struct packet q, *p = &q; - if (sb->sb_cc < 2) + if (sbavail(sb) < 2) return DNS_WAIT; q.m = sb->sb_mb; q.n = q.m->m_nextpkt; q.moff = 0; q.offset = 0; - q.len = sb->sb_cc; + q.len = sbavail(sb); GET16(p, packlen); if (packlen + 2 > q.len) diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c index 3af867b0..83093db3 100644 --- a/freebsd/sys/netinet/accf_http.c +++ b/freebsd/sys/netinet/accf_http.c @@ -94,7 +94,7 @@ sbfull(struct sockbuf *sb) "mbcnt(%ld) >= mbmax(%ld): %d", sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat, sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax); - return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax); + return (sbused(sb) >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax); } /* @@ -164,13 +164,14 @@ static int sohashttpget(struct socket *so, void *arg, int waitflag) { - if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && + !sbfull(&so->so_rcv)) { struct mbuf *m; char *cmp; int cmplen, cc; m = so->so_rcv.sb_mb; - cc = so->so_rcv.sb_cc - 1; + cc = sbavail(&so->so_rcv) - 1; if (cc < 1) return (SU_OK); switch (*mtod(m, char *)) { @@ -217,7 +218,7 @@ soparsehttpvers(struct socket *so, void *arg, int waitflag) goto fallout; m = so->so_rcv.sb_mb; - cc = so->so_rcv.sb_cc; + cc = sbavail(&so->so_rcv); inspaces = spaces = 0; for (m = so->so_rcv.sb_mb; m; m = n) { n = m->m_nextpkt; @@ -306,7 +307,7 @@ soishttpconnected(struct socket *so, void *arg, int waitflag) * have NCHRS left */ copied = 0; - ccleft = so->so_rcv.sb_cc; + ccleft = sbavail(&so->so_rcv); if (ccleft < NCHRS) goto readmore; a = b = c = '\0'; diff --git a/freebsd/sys/netinet/cc.h b/freebsd/sys/netinet/cc.h deleted file mode 100644 index 14b4a9de..00000000 --- a/freebsd/sys/netinet/cc.h +++ /dev/null @@ -1,167 +0,0 @@ -/*- - * Copyright (c) 2007-2008 - * Swinburne University of Technology, Melbourne, Australia. - * Copyright (c) 2009-2010 Lawrence Stewart - * Copyright (c) 2010 The FreeBSD Foundation - * All rights reserved. - * - * This software was developed at the Centre for Advanced Internet - * Architectures, Swinburne University of Technology, by Lawrence Stewart and - * James Healy, made possible in part by a grant from the Cisco University - * Research Program Fund at Community Foundation Silicon Valley. - * - * Portions of this software were developed at the Centre for Advanced - * Internet Architectures, Swinburne University of Technology, Melbourne, - * Australia by David Hayes under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
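The accf_dns.c and accf_http.c hunks above replace every direct read of sb->sb_cc with the sbavail()/sbused() sockbuf accessors. A minimal sketch of an accept filter written against the new API — the filter sohasdata and its readiness rule are invented for illustration; only sbavail()/sbused() and the SU_* return codes come from the code shown above:

	static int
	sohasdata(struct socket *so, void *arg, int waitflag)
	{
		struct sockbuf *sb = &so->so_rcv;

		/* Ready once any payload is buffered or the sockbuf is full. */
		if (sbavail(sb) > 0 || sbused(sb) >= sb->sb_hiwat)
			return (SU_ISCONNECTED);

		/* Otherwise keep the connection parked on the filter. */
		return (SU_OK);
	}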
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -/* - * This software was first released in 2007 by James Healy and Lawrence Stewart - * whilst working on the NewTCP research project at Swinburne University of - * Technology's Centre for Advanced Internet Architectures, Melbourne, - * Australia, which was made possible in part by a grant from the Cisco - * University Research Program Fund at Community Foundation Silicon Valley. - * More details are available at: - * http://caia.swin.edu.au/urp/newtcp/ - */ - -#ifndef _NETINET_CC_H_ -#define _NETINET_CC_H_ - -/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */ -#include - -/* Global CC vars. */ -extern STAILQ_HEAD(cc_head, cc_algo) cc_list; -extern const int tcprexmtthresh; -extern struct cc_algo newreno_cc_algo; - -/* Per-netstack bits. */ -VNET_DECLARE(struct cc_algo *, default_cc_ptr); -#define V_default_cc_ptr VNET(default_cc_ptr) - -/* Define the new net.inet.tcp.cc sysctl tree. */ -SYSCTL_DECL(_net_inet_tcp_cc); - -/* CC housekeeping functions. */ -int cc_register_algo(struct cc_algo *add_cc); -int cc_deregister_algo(struct cc_algo *remove_cc); - -/* - * Wrapper around transport structs that contain same-named congestion - * control variables. Allows algos to be shared amongst multiple CC aware - * transprots. - */ -struct cc_var { - void *cc_data; /* Per-connection private CC algorithm data. */ - int bytes_this_ack; /* # bytes acked by the current ACK. */ - tcp_seq curack; /* Most recent ACK. */ - uint32_t flags; /* Flags for cc_var (see below) */ - int type; /* Indicates which ptr is valid in ccvc. */ - union ccv_container { - struct tcpcb *tcp; - struct sctp_nets *sctp; - } ccvc; -}; - -/* cc_var flags. */ -#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ -#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ - -/* ACK types passed to the ack_received() hook. */ -#define CC_ACK 0x0001 /* Regular in sequence ACK. */ -#define CC_DUPACK 0x0002 /* Duplicate ACK. */ -#define CC_PARTIALACK 0x0004 /* Not yet. */ -#define CC_SACK 0x0008 /* Not yet. */ - -/* - * Congestion signal types passed to the cong_signal() hook. The highest order 8 - * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own - * congestion signal types. - */ -#define CC_ECN 0x00000001 /* ECN marked packet received. */ -#define CC_RTO 0x00000002 /* RTO fired. */ -#define CC_RTO_ERR 0x00000004 /* RTO fired in error. */ -#define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */ - -#define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ - -/* - * Structure to hold data and function pointers that together represent a - * congestion control algorithm. - */ -struct cc_algo { - char name[TCP_CA_NAME_MAX]; - - /* Init global module state on kldload. */ - int (*mod_init)(void); - - /* Cleanup global module state on kldunload. */ - int (*mod_destroy)(void); - - /* Init CC state for a new control block. 
*/ - int (*cb_init)(struct cc_var *ccv); - - /* Cleanup CC state for a terminating control block. */ - void (*cb_destroy)(struct cc_var *ccv); - - /* Init variables for a newly established connection. */ - void (*conn_init)(struct cc_var *ccv); - - /* Called on receipt of an ack. */ - void (*ack_received)(struct cc_var *ccv, uint16_t type); - - /* Called on detection of a congestion signal. */ - void (*cong_signal)(struct cc_var *ccv, uint32_t type); - - /* Called after exiting congestion recovery. */ - void (*post_recovery)(struct cc_var *ccv); - - /* Called when data transfer resumes after an idle period. */ - void (*after_idle)(struct cc_var *ccv); - - STAILQ_ENTRY (cc_algo) entries; -}; - -/* Macro to obtain the CC algo's struct ptr. */ -#define CC_ALGO(tp) ((tp)->cc_algo) - -/* Macro to obtain the CC algo's data ptr. */ -#define CC_DATA(tp) ((tp)->ccv->cc_data) - -/* Macro to obtain the system default CC algo's struct ptr. */ -#define CC_DEFAULT() V_default_cc_ptr - -extern struct rwlock cc_list_lock; -#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list") -#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock) -#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock) -#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock) -#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock) -#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock) -#define CC_LIST_LOCK_ASSERT() rw_assert(&cc_list_lock, RA_LOCKED) - -#endif /* _NETINET_CC_H_ */ diff --git a/freebsd/sys/netinet/cc/cc.c b/freebsd/sys/netinet/cc/cc.c index 4be9a63b..ab3e831e 100644 --- a/freebsd/sys/netinet/cc/cc.c +++ b/freebsd/sys/netinet/cc/cc.c @@ -65,13 +65,13 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include +#include -#include #include #include +#include #include +#include #include @@ -320,13 +320,14 @@ SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL, - "congestion control related settings"); + "Congestion control related settings"); -SYSCTL_VNET_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW, - NULL, 0, cc_default_algo, "A", "default congestion control algorithm"); +SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, + CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, cc_default_algo, "A", "Default congestion control algorithm"); #ifndef __rtems__ SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, cc_list_available, "A", - "list available congestion control algorithms"); + "List available congestion control algorithms"); #endif /* __rtems__ */ diff --git a/freebsd/sys/netinet/cc/cc.h b/freebsd/sys/netinet/cc/cc.h new file mode 100644 index 00000000..1da6f620 --- /dev/null +++ b/freebsd/sys/netinet/cc/cc.h @@ -0,0 +1,178 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 
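The cc.c hunk above also shows the sysctl migration applied throughout this import: the retired SYSCTL_VNET_* wrapper macros become the plain SYSCTL_* macros with CTLFLAG_VNET or'ed into the flags. A sketch of a new-style declaration; the knob example_knob and its backing variable are hypothetical:

	static VNET_DEFINE(int, example_knob) = 1;
	#define	V_example_knob	VNET(example_knob)

	SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, example_knob,
	    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(example_knob), 0,
	    "Hypothetical per-VNET tunable declared in the new style");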
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +#ifndef _NETINET_CC_CC_H_ +#define _NETINET_CC_CC_H_ + +#if !defined(_KERNEL) +#error "no user-servicable parts inside" +#endif + +/* Global CC vars. */ +extern STAILQ_HEAD(cc_head, cc_algo) cc_list; +extern const int tcprexmtthresh; +extern struct cc_algo newreno_cc_algo; + +/* Per-netstack bits. */ +VNET_DECLARE(struct cc_algo *, default_cc_ptr); +#define V_default_cc_ptr VNET(default_cc_ptr) + +/* Define the new net.inet.tcp.cc sysctl tree. */ +SYSCTL_DECL(_net_inet_tcp_cc); + +/* CC housekeeping functions. */ +int cc_register_algo(struct cc_algo *add_cc); +int cc_deregister_algo(struct cc_algo *remove_cc); + +/* + * Wrapper around transport structs that contain same-named congestion + * control variables. Allows algos to be shared amongst multiple CC aware + * transprots. + */ +struct cc_var { + void *cc_data; /* Per-connection private CC algorithm data. */ + int bytes_this_ack; /* # bytes acked by the current ACK. */ + tcp_seq curack; /* Most recent ACK. */ + uint32_t flags; /* Flags for cc_var (see below) */ + int type; /* Indicates which ptr is valid in ccvc. */ + union ccv_container { + struct tcpcb *tcp; + struct sctp_nets *sctp; + } ccvc; +}; + +/* cc_var flags. */ +#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ +#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ +#define CCF_DELACK 0x0004 /* Is this ack delayed? */ +#define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */ +#define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */ +#define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */ + +/* ACK types passed to the ack_received() hook. */ +#define CC_ACK 0x0001 /* Regular in sequence ACK. */ +#define CC_DUPACK 0x0002 /* Duplicate ACK. 
*/ +#define CC_PARTIALACK 0x0004 /* Not yet. */ +#define CC_SACK 0x0008 /* Not yet. */ + +/* + * Congestion signal types passed to the cong_signal() hook. The highest order 8 + * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own + * congestion signal types. + */ +#define CC_ECN 0x00000001 /* ECN marked packet received. */ +#define CC_RTO 0x00000002 /* RTO fired. */ +#define CC_RTO_ERR 0x00000004 /* RTO fired in error. */ +#define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */ + +#define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ + +/* + * Structure to hold data and function pointers that together represent a + * congestion control algorithm. + */ +struct cc_algo { + char name[TCP_CA_NAME_MAX]; + + /* Init global module state on kldload. */ + int (*mod_init)(void); + + /* Cleanup global module state on kldunload. */ + int (*mod_destroy)(void); + + /* Init CC state for a new control block. */ + int (*cb_init)(struct cc_var *ccv); + + /* Cleanup CC state for a terminating control block. */ + void (*cb_destroy)(struct cc_var *ccv); + + /* Init variables for a newly established connection. */ + void (*conn_init)(struct cc_var *ccv); + + /* Called on receipt of an ack. */ + void (*ack_received)(struct cc_var *ccv, uint16_t type); + + /* Called on detection of a congestion signal. */ + void (*cong_signal)(struct cc_var *ccv, uint32_t type); + + /* Called after exiting congestion recovery. */ + void (*post_recovery)(struct cc_var *ccv); + + /* Called when data transfer resumes after an idle period. */ + void (*after_idle)(struct cc_var *ccv); + + /* Called for an additional ECN processing apart from RFC3168. */ + void (*ecnpkt_handler)(struct cc_var *ccv); + + /* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */ + int (*ctl_output)(struct cc_var *, struct sockopt *, void *); + + STAILQ_ENTRY (cc_algo) entries; +}; + +/* Macro to obtain the CC algo's struct ptr. */ +#define CC_ALGO(tp) ((tp)->cc_algo) + +/* Macro to obtain the CC algo's data ptr. */ +#define CC_DATA(tp) ((tp)->ccv->cc_data) + +/* Macro to obtain the system default CC algo's struct ptr. 
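A congestion control module plugs into this framework by filling in struct cc_algo (defined just above) and registering it. A minimal sketch, assuming the DECLARE_CC_MODULE() helper from netinet/cc/cc_module.h; the "noop" algorithm is invented for illustration, and hooks left NULL are simply skipped by the framework's callers:

	#include <netinet/cc/cc.h>
	#include <netinet/cc/cc_module.h>

	static void
	noop_ack_received(struct cc_var *ccv, uint16_t type)
	{
		/* A real algorithm would grow CCV(ccv, snd_cwnd) here. */
	}

	struct cc_algo noop_cc_algo = {
		.name = "noop",
		.ack_received = noop_ack_received,
	};

	DECLARE_CC_MODULE(noop, &noop_cc_algo);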
*/ +#define CC_DEFAULT() V_default_cc_ptr + +extern struct rwlock cc_list_lock; +#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list") +#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock) +#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock) +#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock) +#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock) +#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock) +#define CC_LIST_LOCK_ASSERT() rw_assert(&cc_list_lock, RA_LOCKED) + +#endif /* _NETINET_CC_CC_H_ */ diff --git a/freebsd/sys/netinet/cc/cc_newreno.c b/freebsd/sys/netinet/cc/cc_newreno.c index c0f0cfc5..8077bb22 100644 --- a/freebsd/sys/netinet/cc/cc_newreno.c +++ b/freebsd/sys/netinet/cc/cc_newreno.c @@ -64,10 +64,10 @@ __FBSDID("$FreeBSD$"); #include -#include +#include #include #include - +#include #include static void newreno_ack_received(struct cc_var *ccv, uint16_t type); @@ -216,6 +216,9 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type) static void newreno_post_recovery(struct cc_var *ccv) { + int pipe; + pipe = 0; + if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this @@ -226,10 +229,13 @@ newreno_post_recovery(struct cc_var *ccv) * * XXXLAS: Find a way to do this without needing curack */ - if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), - CCV(ccv, snd_max))) - CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) - - ccv->curack + CCV(ccv, t_maxseg); + if (V_tcp_do_rfc6675_pipe) + pipe = tcp_compute_pipe(ccv->ccvc.tcp); + else + pipe = CCV(ccv, snd_max) - ccv->curack; + + if (pipe < CCV(ccv, snd_ssthresh)) + CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h index 5483721d..af35c847 100644 --- a/freebsd/sys/netinet/icmp6.h +++ b/freebsd/sys/netinet/icmp6.h @@ -144,6 +144,9 @@ struct icmp6_hdr { #define ICMP6_DST_UNREACH_BEYONDSCOPE 2 /* beyond scope of source address */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ +#define ICMP6_DST_UNREACH_POLICY 5 /* failed ingress/egress policy */ +#define ICMP6_DST_UNREACH_REJECT 6 /* Reject route to destination */ +#define ICMP6_DST_UNREACH_SRCROUTE 7 /* Error in source routing header */ #define ICMP6_TIME_EXCEED_TRANSIT 0 /* ttl==0 in transit */ #define ICMP6_TIME_EXCEED_REASSEMBLY 1 /* ttl==0 in reass */ @@ -297,9 +300,11 @@ struct nd_opt_hdr { /* Neighbor discovery option header */ #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 +#define ND_OPT_NONCE 14 /* RFC 3971 */ #define ND_OPT_ROUTE_INFO 24 /* RFC 4191 */ #define ND_OPT_RDNSS 25 /* RFC 6106 */ #define ND_OPT_DNSSL 31 /* RFC 6106 */ +#define ND_OPT_MAX 31 struct nd_opt_prefix_info { /* prefix information */ u_int8_t nd_opt_pi_type; @@ -330,6 +335,16 @@ struct nd_opt_mtu { /* MTU option */ u_int32_t nd_opt_mtu_mtu; } __packed; +#define ND_OPT_NONCE_LEN ((1 * 8) - 2) +#if ((ND_OPT_NONCE_LEN + 2) % 8) != 0 +#error "(ND_OPT_NONCE_LEN + 2) must be a multiple of 8." +#endif +struct nd_opt_nonce { /* nonce option */ + u_int8_t nd_opt_nonce_type; + u_int8_t nd_opt_nonce_len; + u_int8_t nd_opt_nonce[ND_OPT_NONCE_LEN]; +} __packed; + struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; u_int8_t nd_opt_rti_len; @@ -555,39 +570,39 @@ do { \ * of the internet control message protocol version 6. 
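The cc_newreno.c hunk above replaces the old curack-based cwnd deflation with an explicit estimate of the data still in flight: tcp_compute_pipe() when V_tcp_do_rfc6675_pipe is set, otherwise snd_max - curack. The resulting rule, restated as a standalone helper for readability (a sketch, not code from the patch):

	static u_long
	newreno_deflated_cwnd(u_long pipe, u_long ssthresh, u_long maxseg)
	{
		/* Exit recovery one segment above the data in flight... */
		if (pipe < ssthresh)
			return (pipe + maxseg);
		/* ...but never above the slow-start threshold. */
		return (ssthresh);
	}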
*/ struct icmp6errstat { - u_quad_t icp6errs_dst_unreach_noroute; - u_quad_t icp6errs_dst_unreach_admin; - u_quad_t icp6errs_dst_unreach_beyondscope; - u_quad_t icp6errs_dst_unreach_addr; - u_quad_t icp6errs_dst_unreach_noport; - u_quad_t icp6errs_packet_too_big; - u_quad_t icp6errs_time_exceed_transit; - u_quad_t icp6errs_time_exceed_reassembly; - u_quad_t icp6errs_paramprob_header; - u_quad_t icp6errs_paramprob_nextheader; - u_quad_t icp6errs_paramprob_option; - u_quad_t icp6errs_redirect; /* we regard redirect as an error here */ - u_quad_t icp6errs_unknown; + uint64_t icp6errs_dst_unreach_noroute; + uint64_t icp6errs_dst_unreach_admin; + uint64_t icp6errs_dst_unreach_beyondscope; + uint64_t icp6errs_dst_unreach_addr; + uint64_t icp6errs_dst_unreach_noport; + uint64_t icp6errs_packet_too_big; + uint64_t icp6errs_time_exceed_transit; + uint64_t icp6errs_time_exceed_reassembly; + uint64_t icp6errs_paramprob_header; + uint64_t icp6errs_paramprob_nextheader; + uint64_t icp6errs_paramprob_option; + uint64_t icp6errs_redirect; /* we regard redirect as an error here */ + uint64_t icp6errs_unknown; }; struct icmp6stat { /* statistics related to icmp6 packets generated */ - u_quad_t icp6s_error; /* # of calls to icmp6_error */ - u_quad_t icp6s_canterror; /* no error 'cuz old was icmp */ - u_quad_t icp6s_toofreq; /* no error 'cuz rate limitation */ - u_quad_t icp6s_outhist[256]; + uint64_t icp6s_error; /* # of calls to icmp6_error */ + uint64_t icp6s_canterror; /* no error 'cuz old was icmp */ + uint64_t icp6s_toofreq; /* no error 'cuz rate limitation */ + uint64_t icp6s_outhist[256]; /* statistics related to input message processed */ - u_quad_t icp6s_badcode; /* icmp6_code out of range */ - u_quad_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ - u_quad_t icp6s_checksum; /* bad checksum */ - u_quad_t icp6s_badlen; /* calculated bound mismatch */ + uint64_t icp6s_badcode; /* icmp6_code out of range */ + uint64_t icp6s_tooshort; /* packet < sizeof(struct icmp6_hdr) */ + uint64_t icp6s_checksum; /* bad checksum */ + uint64_t icp6s_badlen; /* calculated bound mismatch */ /* * number of responses: this member is inherited from netinet code, but * for netinet6 code, it is already available in icp6s_outhist[]. 
*/ - u_quad_t icp6s_reflect; - u_quad_t icp6s_inhist[256]; - u_quad_t icp6s_nd_toomanyopt; /* too many ND options */ + uint64_t icp6s_reflect; + uint64_t icp6s_inhist[256]; + uint64_t icp6s_nd_toomanyopt; /* too many ND options */ struct icmp6errstat icp6s_outerrhist; #define icp6s_odst_unreach_noroute \ icp6s_outerrhist.icp6errs_dst_unreach_noroute @@ -607,29 +622,33 @@ struct icmp6stat { #define icp6s_oparamprob_option icp6s_outerrhist.icp6errs_paramprob_option #define icp6s_oredirect icp6s_outerrhist.icp6errs_redirect #define icp6s_ounknown icp6s_outerrhist.icp6errs_unknown - u_quad_t icp6s_pmtuchg; /* path MTU changes */ - u_quad_t icp6s_nd_badopt; /* bad ND options */ - u_quad_t icp6s_badns; /* bad neighbor solicitation */ - u_quad_t icp6s_badna; /* bad neighbor advertisement */ - u_quad_t icp6s_badrs; /* bad router advertisement */ - u_quad_t icp6s_badra; /* bad router advertisement */ - u_quad_t icp6s_badredirect; /* bad redirect message */ + uint64_t icp6s_pmtuchg; /* path MTU changes */ + uint64_t icp6s_nd_badopt; /* bad ND options */ + uint64_t icp6s_badns; /* bad neighbor solicitation */ + uint64_t icp6s_badna; /* bad neighbor advertisement */ + uint64_t icp6s_badrs; /* bad router advertisement */ + uint64_t icp6s_badra; /* bad router advertisement */ + uint64_t icp6s_badredirect; /* bad redirect message */ }; #ifdef _KERNEL +#include + +VNET_PCPUSTAT_DECLARE(struct icmp6stat, icmp6stat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define ICMP6STAT_ADD(name, val) V_icmp6stat.name += (val) +#define ICMP6STAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct icmp6stat, icmp6stat, name, (val)) #define ICMP6STAT_INC(name) ICMP6STAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_icmp6stat_inc(int statnum); -#define KMOD_ICMP6STAT_INC(name) \ - kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(u_quad_t)) +#define KMOD_ICMP6STAT_INC(name) \ + kmod_icmp6stat_inc(offsetof(struct icmp6stat, name) / sizeof(uint64_t)) #endif /* @@ -688,7 +707,9 @@ void icmp6_mtudisc_update(struct ip6ctlparam *, int); #define icmp6_ifstat_inc(ifp, tag) \ do { \ if (ifp) \ - ((struct in6_ifextra *)((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat->tag++; \ + counter_u64_add(((struct in6_ifextra *) \ + ((ifp)->if_afdata[AF_INET6]))->icmp6_ifstat[\ + offsetof(struct icmp6_ifstat, tag) / sizeof(uint64_t)], 1);\ } while (/*CONSTCOND*/ 0) #define icmp6_ifoutstat_inc(ifp, type, code) \ diff --git a/freebsd/sys/netinet/icmp_var.h b/freebsd/sys/netinet/icmp_var.h index d939cc2e..d3e72bc2 100644 --- a/freebsd/sys/netinet/icmp_var.h +++ b/freebsd/sys/netinet/icmp_var.h @@ -58,11 +58,15 @@ struct icmpstat { }; #ifdef _KERNEL +#include + +VNET_PCPUSTAT_DECLARE(struct icmpstat, icmpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. 
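Both statistics conversions (icmp6.h earlier, icmp_var.h here) keep call sites source-compatible while moving the counters to per-CPU, per-VNET storage via the VNET_PCPUSTAT_* machinery. A usage sketch; the members shown are real struct icmpstat fields:

	ICMPSTAT_INC(icps_tooshort);			/* one truncated packet */
	ICMPSTAT_ADD(icps_outhist[ICMP_ECHOREPLY], 1);	/* one echo reply sent */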
*/ -#define ICMPSTAT_ADD(name, val) V_icmpstat.name += (val) +#define ICMPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct icmpstat, icmpstat, name, (val)) #define ICMPSTAT_INC(name) ICMPSTAT_ADD(name, 1) /* @@ -70,30 +74,19 @@ struct icmpstat { */ void kmod_icmpstat_inc(int statnum); #define KMOD_ICMPSTAT_INC(name) \ - kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(u_long)) + kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(uint64_t)) #endif /* - * Names for ICMP sysctl objects + * Identifiers for ICMP sysctl nodes */ #define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ #define ICMPCTL_STATS 2 /* statistics (read-only) */ #define ICMPCTL_ICMPLIM 3 -#define ICMPCTL_MAXID 4 - -#define ICMPCTL_NAMES { \ - { 0, 0 }, \ - { "maskrepl", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ - { "icmplim", CTLTYPE_INT }, \ -} #ifdef _KERNEL SYSCTL_DECL(_net_inet_icmp); -VNET_DECLARE(struct icmpstat, icmpstat); /* icmp statistics. */ -#define V_icmpstat VNET(icmpstat) - extern int badport_bandlim(int); #define BANDLIM_UNLIMITED -1 #define BANDLIM_ICMP_UNREACH 0 diff --git a/freebsd/sys/netinet/if_atm.c b/freebsd/sys/netinet/if_atm.c index e26d0c7c..cb0317fb 100644 --- a/freebsd/sys/netinet/if_atm.c +++ b/freebsd/sys/netinet/if_atm.c @@ -54,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -319,7 +320,7 @@ failed: * but this is enough for PVCs entered via the "route" command. */ int -atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst, +atmresolve(struct rtentry *rt, struct mbuf *m, const struct sockaddr *dst, struct atm_pseudohdr *desten) { struct sockaddr_dl *sdl; @@ -331,7 +332,8 @@ atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst, } if (rt == NULL) { - rt = RTALLOC1(dst, 0); /* link level on table 0 XXX MRT */ + /* link level on table 0 XXX MRT */ + rt = RTALLOC1(__DECONST(struct sockaddr *, dst), 0); if (rt == NULL) goto bad; /* failed */ RT_REMREF(rt); /* don't keep LL references */ diff --git a/freebsd/sys/netinet/if_atm.h b/freebsd/sys/netinet/if_atm.h index bd8b5143..04ad218d 100644 --- a/freebsd/sys/netinet/if_atm.h +++ b/freebsd/sys/netinet/if_atm.h @@ -43,5 +43,5 @@ struct rtentry; struct sockaddr; void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *); -int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, +int atmresolve(struct rtentry *, struct mbuf *, const struct sockaddr *, struct atm_pseudohdr *); diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c index eec06dd8..0a8b101e 100644 --- a/freebsd/sys/netinet/if_ether.c +++ b/freebsd/sys/netinet/if_ether.c @@ -44,39 +44,50 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include #include #include -#include #include #include #include #include +#include #include #include #include -#if defined(INET) +#ifdef INET #include #endif -#include -#include - #include -#define SIN(s) ((struct sockaddr_in *)s) -#define SDL(s) ((struct sockaddr_dl *)s) +#define SIN(s) ((const struct sockaddr_in *)(s)) + +static struct timeval arp_lastlog; +static int arp_curpps; +static int arp_maxpps = 1; + +/* Simple ARP state machine */ +enum arp_llinfo_state { + ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */ + ARP_LLINFO_REACHABLE, /* LLE is valid */ + ARP_LLINFO_VERIFY, /* LLE is valid, need refresh */ + ARP_LLINFO_DELETED, /* LLE is deleted */ +}; SYSCTL_DECL(_net_link_ether); static 
SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); @@ -86,53 +97,67 @@ static SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 * minutes */ static VNET_DEFINE(int, arp_maxtries) = 5; -VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for - * local traffic */ static VNET_DEFINE(int, arp_proxyall) = 0; static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for * 20 seconds */ -VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ +static VNET_DEFINE(int, arpt_rexmit) = 1; /* retransmit arp entries, sec*/ +VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ +VNET_PCPUSTAT_SYSINIT(arpstat); + +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(arpstat); +#endif /* VIMAGE */ static VNET_DEFINE(int, arp_maxhold) = 1; #define V_arpt_keep VNET(arpt_keep) #define V_arpt_down VNET(arpt_down) +#define V_arpt_rexmit VNET(arpt_rexmit) #define V_arp_maxtries VNET(arp_maxtries) #define V_arp_proxyall VNET(arp_proxyall) -#define V_arpstat VNET(arpstat) #define V_arp_maxhold VNET(arp_maxhold) -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_keep), 0, "ARP entry lifetime in seconds"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxtries), 0, "ARP resolution attempts before returning error"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, - &VNET_NAME(useloopback), 0, - "Use the loopback interface for local traffic"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_proxyall), 0, "Enable proxy ARP for all suitable requests"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arpt_down), 0, "Incomplete ARP entry lifetime in seconds"); -SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW, - &VNET_NAME(arpstat), arpstat, - "ARP statistics (struct arpstat, net/if_arp.h)"); -SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_RW, +SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat, + arpstat, "ARP statistics (struct arpstat, net/if_arp.h)"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(arp_maxhold), 0, "Number of packets to hold per ARP entry"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second, + CTLFLAG_RW, &arp_maxpps, 0, + "Maximum number of remotely triggered ARP messages that can be " + "logged per second"); + +#define ARP_LOG(pri, ...) 
do { \ + if (ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps)) \ + log((pri), "arp: " __VA_ARGS__); \ +} while (0) + -static void arp_init(void); -void arprequest(struct ifnet *, - struct in_addr *, struct in_addr *, u_char *); static void arpintr(struct mbuf *); static void arptimer(void *); #ifdef INET static void in_arpinput(struct mbuf *); #endif +static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, + struct ifnet *ifp, int bridged, struct llentry *la); +static void arp_mark_lle_reachable(struct llentry *la); +static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); + +static eventhandler_tag iflladdr_tag; + static const struct netisr_handler arp_nh = { .nh_name = "arp", .nh_handler = arpintr, @@ -140,29 +165,6 @@ static const struct netisr_handler arp_nh = { .nh_policy = NETISR_POLICY_SOURCE, }; -#ifdef AF_INET -void arp_ifscrub(struct ifnet *ifp, uint32_t addr); - -/* - * called by in_ifscrub to remove entry from the table when - * the interface goes away - */ -void -arp_ifscrub(struct ifnet *ifp, uint32_t addr) -{ - struct sockaddr_in addr4; - - bzero((void *)&addr4, sizeof(addr4)); - addr4.sin_len = sizeof(addr4); - addr4.sin_family = AF_INET; - addr4.sin_addr.s_addr = addr; - IF_AFDATA_WLOCK(ifp); - lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR), - (struct sockaddr *)&addr4); - IF_AFDATA_WUNLOCK(ifp); -} -#endif - /* * Timeout routine. Age arp_tab entries periodically. */ @@ -171,15 +173,83 @@ arptimer(void *arg) { struct llentry *lle = (struct llentry *)arg; struct ifnet *ifp; + int r_skip_req; if (lle->la_flags & LLE_STATIC) { - LLE_WUNLOCK(lle); return; } - + LLE_WLOCK(lle); + if (callout_pending(&lle->lle_timer)) { + /* + * Here we are a bit odd here in the treatment of + * active/pending. If the pending bit is set, it got + * rescheduled before I ran. The active + * bit we ignore, since if it was stopped + * in ll_tablefree() and was currently running + * it would have return 0 so the code would + * not have deleted it since the callout could + * not be stopped so we want to go through + * with the delete here now. If the callout + * was restarted, the pending bit will be back on and + * we just want to bail since the callout_reset would + * return 1 and our reference would have been removed + * by arpresolve() below. + */ + LLE_WUNLOCK(lle); + return; + } ifp = lle->lle_tbl->llt_ifp; CURVNET_SET(ifp->if_vnet); + switch (lle->ln_state) { + case ARP_LLINFO_REACHABLE: + + /* + * Expiration time is approaching. + * Let's try to refresh entry if it is still + * in use. + * + * Set r_skip_req to get feedback from + * fast path. Change state and re-schedule + * ourselves. + */ + LLE_REQ_LOCK(lle); + lle->r_skip_req = 1; + LLE_REQ_UNLOCK(lle); + lle->ln_state = ARP_LLINFO_VERIFY; + callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); + LLE_WUNLOCK(lle); + CURVNET_RESTORE(); + return; + case ARP_LLINFO_VERIFY: + LLE_REQ_LOCK(lle); + r_skip_req = lle->r_skip_req; + LLE_REQ_UNLOCK(lle); + + if (r_skip_req == 0 && lle->la_preempt > 0) { + /* Entry was used, issue refresh request */ + struct in_addr dst; + dst = lle->r_l3addr.addr4; + lle->la_preempt--; + callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); + LLE_WUNLOCK(lle); + arprequest(ifp, NULL, &dst, NULL); + CURVNET_RESTORE(); + return; + } + /* Nothing happened. 
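The switch above drives the new per-entry lifecycle. Summarized (states from the arp_llinfo_state enum introduced at the top of this diff, behavior as read from the surrounding cases):

	INCOMPLETE -> REACHABLE    resolution completed
	REACHABLE  -> VERIFY       expiry approaching: set r_skip_req, reschedule
	VERIFY     -> (re-ARP)     fast path cleared r_skip_req: send a refresh
	                           request; the reply marks the entry REACHABLE
	VERIFY     -> (freed)      entry idle past la_expire: fall through, free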
Reschedule if not too late */ + if (lle->la_expire > time_uptime) { + callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit); + LLE_WUNLOCK(lle); + CURVNET_RESTORE(); + return; + } + break; + case ARP_LLINFO_INCOMPLETE: + case ARP_LLINFO_DELETED: + break; + } + if ((lle->la_flags & LLE_DELETED) == 0) { int evt; @@ -190,7 +260,7 @@ arptimer(void *arg) EVENTHANDLER_INVOKE(lle_event, lle, evt); } - callout_stop(&lle->la_timer); + callout_stop(&lle->lle_timer); /* XXX: LOR avoidance. We still have ref on lle. */ LLE_WUNLOCK(lle); @@ -199,21 +269,50 @@ arptimer(void *arg) /* Guard against race with other llentry_free(). */ if (lle->la_flags & LLE_LINKED) { - size_t pkts_dropped; - LLE_REMREF(lle); - pkts_dropped = llentry_free(lle); - ARPSTAT_ADD(dropped, pkts_dropped); - } else - LLE_FREE_LOCKED(lle); - + lltable_unlink_entry(lle->lle_tbl, lle); + } IF_AFDATA_UNLOCK(ifp); + size_t pkts_dropped = llentry_free(lle); + + ARPSTAT_ADD(dropped, pkts_dropped); ARPSTAT_INC(timeouts); CURVNET_RESTORE(); } +/* + * Stores link-layer header for @ifp in format suitable for if_output() + * into buffer @buf. Resulting header length is stored in @bufsize. + * + * Returns 0 on success. + */ +static int +arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, + size_t *bufsize) +{ + struct if_encap_req ereq; + int error; + + bzero(buf, *bufsize); + bzero(&ereq, sizeof(ereq)); + ereq.buf = buf; + ereq.bufsize = *bufsize; + ereq.rtype = IFENCAP_LL; + ereq.family = AF_ARP; + ereq.lladdr = ar_tha(ah); + ereq.hdata = (u_char *)ah; + if (bcast) + ereq.flags = IFENCAP_FLAG_BROADCAST; + error = ifp->if_requestencap(ifp, &ereq); + if (error == 0) + *bufsize = ereq.bufsize; + + return (error); +} + + /* * Broadcast an ARP request. Caller specifies: * - arp header source ip address @@ -221,42 +320,58 @@ arptimer(void *arg) * - arp header source ethernet address */ void -arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, - u_char *enaddr) +arprequest(struct ifnet *ifp, const struct in_addr *sip, + const struct in_addr *tip, u_char *enaddr) { struct mbuf *m; struct arphdr *ah; struct sockaddr sa; + u_char *carpaddr = NULL; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + struct route ro; + int error; if (sip == NULL) { - /* XXX don't believe this can happen (or explain why) */ /* * The caller did not supply a source address, try to find * a compatible one among those assigned to this interface. */ struct ifaddr *ifa; + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (!ifa->ifa_addr || - ifa->ifa_addr->sa_family != AF_INET) + if (ifa->ifa_addr->sa_family != AF_INET) continue; - sip = &SIN(ifa->ifa_addr)->sin_addr; + + if (ifa->ifa_carp) { + if ((*carp_iamatch_p)(ifa, &carpaddr) == 0) + continue; + sip = &IA_SIN(ifa)->sin_addr; + } else { + carpaddr = NULL; + sip = &IA_SIN(ifa)->sin_addr; + } + if (0 == ((sip->s_addr ^ tip->s_addr) & - SIN(ifa->ifa_netmask)->sin_addr.s_addr) ) + IA_MASKSIN(ifa)->sin_addr.s_addr)) break; /* found it. */ } + IF_ADDR_RUNLOCK(ifp); if (sip == NULL) { printf("%s: cannot find matching address\n", __func__); return; } } + if (enaddr == NULL) + enaddr = carpaddr ? 
carpaddr : (u_char *)IF_LLADDR(ifp); - if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) return; - m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) + - 2*ifp->if_data.ifi_addrlen; + m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) + + 2 * ifp->if_addrlen; m->m_pkthdr.len = m->m_len; - MH_ALIGN(m, m->m_len); + M_ALIGN(m, m->m_len); ah = mtod(m, struct arphdr *); bzero((caddr_t)ah, m->m_len); #ifdef MAC @@ -266,109 +381,121 @@ arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, ah->ar_hln = ifp->if_addrlen; /* hardware address length */ ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ ah->ar_op = htons(ARPOP_REQUEST); - bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln); - bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln); - bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln); + bcopy(enaddr, ar_sha(ah), ah->ar_hln); + bcopy(sip, ar_spa(ah), ah->ar_pln); + bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); + if (error != 0 && error != EAFNOSUPPORT) { + ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", + if_name(ifp), error); + return; + } + + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + m->m_flags |= M_BCAST; - (*ifp->if_output)(ifp, m, &sa, NULL); + m_clrprotoflags(m); /* Avoid confusing lower layers. */ + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); } + /* - * Resolve an IP address into an ethernet address. - * On input: - * ifp is the interface we use - * rt0 is the route to the final destination (possibly useless) - * m is the mbuf. May be NULL if we don't have a packet. - * dst is the next hop, - * desten is where we want the address. + * Resolve an IP address into an ethernet address - heavy version. + * Used internally by arpresolve(). + * We have already checked than we can't use existing lle without + * modification so we have to acquire LLE_EXCLUSIVE lle lock. * - * On success, desten is filled in and the function returns 0; + * On success, desten and flags are filled in and the function returns 0; * If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. 
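From a caller's perspective, the contract shared by arpresolve_full() and its public wrapper arpresolve() (further below) looks like this — a fragment sketch; only the arpresolve() signature, LLE_MAX_LINKHDR, and the error codes come from the patch:

	uint8_t linkhdr[LLE_MAX_LINKHDR];
	uint32_t pflags;
	int error;

	error = arpresolve(ifp, is_gw, m, dst, linkhdr, &pflags, NULL);
	if (error == EWOULDBLOCK)
		return (0);	/* mbuf queued on the lle, ARP request sent */
	if (error != 0)
		return (error);	/* EHOSTDOWN or EHOSTUNREACH */
	/* linkhdr now holds the prepared link-layer header for m. */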
*/ -int -arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, - struct sockaddr *dst, u_char *desten, struct llentry **lle) +static int +arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, + const struct sockaddr *dst, u_char *desten, uint32_t *pflags, + struct llentry **plle) { - struct llentry *la = 0; - u_int flags = 0; + struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; + char *lladdr; + int ll_len; - *lle = NULL; - if (m != NULL) { - if (m->m_flags & M_BCAST) { - /* broadcast */ - (void)memcpy(desten, - ifp->if_broadcastaddr, ifp->if_addrlen); - return (0); - } - if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) { - /* multicast */ - ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); - return (0); - } + if (pflags != NULL) + *pflags = 0; + if (plle != NULL) + *plle = NULL; + + if ((flags & LLE_CREATE) == 0) { + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); + IF_AFDATA_RUNLOCK(ifp); } -retry: - IF_AFDATA_RLOCK(ifp); - la = lla_lookup(LLTABLE(ifp), flags, dst); - IF_AFDATA_RUNLOCK(ifp); - if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) - && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { - flags |= (LLE_CREATE | LLE_EXCLUSIVE); + if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) { + la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); + if (la == NULL) { + log(LOG_DEBUG, + "arpresolve: can't allocate llinfo for %s on %s\n", + inet_ntoa(SIN(dst)->sin_addr), if_name(ifp)); + m_freem(m); + return (EINVAL); + } + IF_AFDATA_WLOCK(ifp); - la = lla_lookup(LLTABLE(ifp), flags, dst); + LLE_WLOCK(la); + la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); + /* Prefer ANY existing lle over newly-created one */ + if (la_tmp == NULL) + lltable_link_entry(LLTABLE(ifp), la); IF_AFDATA_WUNLOCK(ifp); + if (la_tmp != NULL) { + lltable_free_entry(LLTABLE(ifp), la); + la = la_tmp; + } } if (la == NULL) { - if (flags & LLE_CREATE) - log(LOG_DEBUG, - "arpresolve: can't allocate llinfo for %s on %s\n", - inet_ntoa(SIN(dst)->sin_addr), ifp->if_xname); m_freem(m); return (EINVAL); } if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { - bcopy(&la->ll_addr, desten, ifp->if_addrlen); - /* - * If entry has an expiry time and it is approaching, - * see if we need to send an ARP request within this - * arpt_down interval. - */ - if (!(la->la_flags & LLE_STATIC) && - time_uptime + la->la_preempt > la->la_expire) { - arprequest(ifp, NULL, - &SIN(dst)->sin_addr, IF_LLADDR(ifp)); - - la->la_preempt--; + if (flags & LLE_ADDRONLY) { + lladdr = la->ll_addr; + ll_len = ifp->if_addrlen; + } else { + lladdr = la->r_linkdata; + ll_len = la->r_hdrlen; } + bcopy(lladdr, desten, ll_len); - *lle = la; - error = 0; - goto done; - } - - if (la->la_flags & LLE_STATIC) { /* should not happen! 
*/ - log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", - inet_ntoa(SIN(dst)->sin_addr)); - m_freem(m); - error = EINVAL; - goto done; + /* Check if we have feedback request from arptimer() */ + if (la->r_skip_req != 0) { + LLE_REQ_LOCK(la); + la->r_skip_req = 0; /* Notify that entry was used */ + LLE_REQ_UNLOCK(la); + } + if (pflags != NULL) + *pflags = la->la_flags & (LLE_VALID|LLE_IFADDR); + if (plle) { + LLE_ADDREF(la); + *plle = la; + } + LLE_WUNLOCK(la); + return (0); } renew = (la->la_asked == 0 || la->la_expire != time_uptime); - if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { - flags |= LLE_EXCLUSIVE; - LLE_RUNLOCK(la); - goto retry; - } /* * There is an arptab entry, but no ethernet address * response yet. Add the mbuf to the list, dropping @@ -393,11 +520,6 @@ retry: } else la->la_hold = m; la->la_numheld++; - if (renew == 0 && (flags & LLE_EXCLUSIVE)) { - flags &= ~LLE_EXCLUSIVE; - LLE_DOWNGRADE(la); - } - } /* * Return EWOULDBLOCK if we have tried less than arp_maxtries. It @@ -408,32 +530,113 @@ retry: if (la->la_asked < V_arp_maxtries) error = EWOULDBLOCK; /* First request. */ else - error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ? - EHOSTUNREACH : EHOSTDOWN; + error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN; if (renew) { int canceled; LLE_ADDREF(la); la->la_expire = time_uptime; - canceled = callout_reset(&la->la_timer, hz * V_arpt_down, + canceled = callout_reset(&la->lle_timer, hz * V_arpt_down, arptimer, la); if (canceled) LLE_REMREF(la); la->la_asked++; LLE_WUNLOCK(la); - arprequest(ifp, NULL, &SIN(dst)->sin_addr, - IF_LLADDR(ifp)); + arprequest(ifp, NULL, &SIN(dst)->sin_addr, NULL); return (error); } -done: - if (flags & LLE_EXCLUSIVE) - LLE_WUNLOCK(la); - else - LLE_RUNLOCK(la); + + LLE_WUNLOCK(la); + return (error); +} + +/* + * Resolve an IP address into an ethernet address. + */ +int +arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, + char *desten, uint32_t *pflags, struct llentry **plle) +{ + int error; + + flags |= LLE_ADDRONLY; + error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags, plle); return (error); } + +/* + * Lookups link header based on an IP address. + * On input: + * ifp is the interface we use + * is_gw != 0 if @dst represents gateway to some destination + * m is the mbuf. May be NULL if we don't have a packet. + * dst is the next hop, + * desten is the storage to put LL header. + * flags returns subset of lle flags: LLE_VALID | LLE_IFADDR + * + * On success, full/partial link header and flags are filled in and + * the function returns 0. + * If the packet must be held pending resolution, we return EWOULDBLOCK + * On other errors, we return the corresponding error code. + * Note that m_freem() handles NULL. + */ +int +arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, + const struct sockaddr *dst, u_char *desten, uint32_t *pflags, + struct llentry **plle) +{ + struct llentry *la = NULL; + + if (pflags != NULL) + *pflags = 0; + if (plle != NULL) + *plle = NULL; + + if (m != NULL) { + if (m->m_flags & M_BCAST) { + /* broadcast */ + (void)memcpy(desten, + ifp->if_broadcastaddr, ifp->if_addrlen); + return (0); + } + if (m->m_flags & M_MCAST) { + /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return (0); + } + } + + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), plle ? 
LLE_EXCLUSIVE : LLE_UNLOCKED, dst); + if (la != NULL && (la->r_flags & RLLE_VALID) != 0) { + /* Entry found, let's copy lle info */ + bcopy(la->r_linkdata, desten, la->r_hdrlen); + if (pflags != NULL) + *pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR); + /* Check if we have feedback request from arptimer() */ + if (la->r_skip_req != 0) { + LLE_REQ_LOCK(la); + la->r_skip_req = 0; /* Notify that entry was used */ + LLE_REQ_UNLOCK(la); + } + if (plle) { + LLE_ADDREF(la); + *plle = la; + LLE_WUNLOCK(la); + } + IF_AFDATA_RUNLOCK(ifp); + return (0); + } + if (plle && la) + LLE_WUNLOCK(la); + IF_AFDATA_RUNLOCK(ifp); + + return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst, + desten, pflags, plle)); +} + /* * Common length and type checks are done here, * then the protocol-specific routine is called. @@ -442,34 +645,76 @@ static void arpintr(struct mbuf *m) { struct arphdr *ar; + struct ifnet *ifp; + char *layer; + int hlen; + + ifp = m->m_pkthdr.rcvif; if (m->m_len < sizeof(struct arphdr) && ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { - log(LOG_NOTICE, "arp: runt packet -- m_pullup failed\n"); + ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n", + if_name(ifp)); return; } ar = mtod(m, struct arphdr *); - if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && - ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && - ntohs(ar->ar_hrd) != ARPHRD_ARCNET && - ntohs(ar->ar_hrd) != ARPHRD_IEEE1394 && - ntohs(ar->ar_hrd) != ARPHRD_INFINIBAND) { - log(LOG_NOTICE, "arp: unknown hardware address format (0x%2D)" - " (from %*D to %*D)\n", (unsigned char *)&ar->ar_hrd, "", - ETHER_ADDR_LEN, (u_char *)ar_sha(ar), ":", - ETHER_ADDR_LEN, (u_char *)ar_tha(ar), ":"); + /* Check if length is sufficient */ + if (m->m_len < arphdr_len(ar)) { + m = m_pullup(m, arphdr_len(ar)); + if (m == NULL) { + ARP_LOG(LOG_NOTICE, "short packet received on %s\n", + if_name(ifp)); + return; + } + ar = mtod(m, struct arphdr *); + } + + hlen = 0; + layer = ""; + switch (ntohs(ar->ar_hrd)) { + case ARPHRD_ETHER: + hlen = ETHER_ADDR_LEN; /* RFC 826 */ + layer = "ethernet"; + break; + case ARPHRD_IEEE802: + hlen = 6; /* RFC 1390, FDDI_ADDR_LEN */ + layer = "fddi"; + break; + case ARPHRD_ARCNET: + hlen = 1; /* RFC 1201, ARC_ADDR_LEN */ + layer = "arcnet"; + break; + case ARPHRD_INFINIBAND: + hlen = 20; /* RFC 4391, INFINIBAND_ALEN */ + layer = "infiniband"; + break; + case ARPHRD_IEEE1394: + hlen = 0; /* SHALL be 16 */ /* RFC 2734 */ + layer = "firewire"; + + /* + * Restrict too long hardware addresses. 
+ * Currently we are capable of handling 20-byte + * addresses ( sizeof(lle->ll_addr) ) + */ + if (ar->ar_hln >= 20) + hlen = 16; + break; + default: + ARP_LOG(LOG_NOTICE, + "packet with unknown hardware format 0x%02d received on " + "%s\n", ntohs(ar->ar_hrd), if_name(ifp)); m_freem(m); return; } - if (m->m_len < arphdr_len(ar)) { - if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { - log(LOG_NOTICE, "arp: runt packet\n"); - m_freem(m); - return; - } - ar = mtod(m, struct arphdr *); + if (hlen != 0 && hlen != ar->ar_hln) { + ARP_LOG(LOG_NOTICE, + "packet with invalid %s address length %d received on %s\n", + layer, ar->ar_hln, if_name(ifp)); + m_freem(m); + return; } ARPSTAT_INC(received); @@ -518,20 +763,27 @@ SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW, static void in_arpinput(struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct arphdr *ah; struct ifnet *ifp = m->m_pkthdr.rcvif; - struct llentry *la = NULL; - struct rtentry *rt; + struct llentry *la = NULL, *la_tmp; struct ifaddr *ifa; struct in_ifaddr *ia; struct sockaddr sa; struct in_addr isaddr, itaddr, myaddr; u_int8_t *enaddr = NULL; - int op, flags; - int req_len; + int op; int bridged = 0, is_bridge = 0; - int carp_match = 0; + int carped; struct sockaddr_in sin; + struct sockaddr *dst; + struct nhop4_basic nh4; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + struct route ro; + size_t linkhdrsize; + int lladdr_off; + int error; + sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; @@ -541,25 +793,24 @@ in_arpinput(struct mbuf *m) if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; - req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); - if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { - log(LOG_NOTICE, "in_arp: runt packet -- m_pullup failed\n"); - return; - } - + /* + * We already have checked that mbuf contains enough contiguous data + * to hold entire arp message according to the arp header. + */ ah = mtod(m, struct arphdr *); + /* * ARP is only for IPv4 so we can reject packets with * a protocol length not equal to an IPv4 address. */ if (ah->ar_pln != sizeof(struct in_addr)) { - log(LOG_NOTICE, "in_arp: requested protocol length != %zu\n", + ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n", sizeof(struct in_addr)); goto drop; } if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) { - log(LOG_NOTICE, "arp: %*D is multicast\n", + ARP_LOG(LOG_NOTICE, "%*D is multicast\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":"); goto drop; } @@ -575,26 +826,16 @@ in_arpinput(struct mbuf *m) * For a bridge, we want to check the address irrespective * of the receive interface. (This will change slightly * when we have clusters of interfaces). - * If the interface does not match, but the recieving interface - * is part of carp, we call carp_iamatch to see if this is a - * request for the virtual host ip. - * XXX: This is really ugly! 
*/ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) { if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) || ia->ia_ifp == ifp) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { + itaddr.s_addr == ia->ia_addr.sin_addr.s_addr && + (ia->ia_ifa.ifa_carp == NULL || + (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - goto match; - } - if (ifp->if_carp != NULL && - (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) && - itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { - carp_match = 1; - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } @@ -603,7 +844,7 @@ in_arpinput(struct mbuf *m) ia->ia_ifp == ifp) && isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } @@ -622,13 +863,13 @@ in_arpinput(struct mbuf *m) if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) { ifa_ref(&ia->ia_ifa); ifp = ia->ia_ifp; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } } #undef BDG_MEMBER_MATCHES_ARP - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No match, use the first inet address on the receive interface @@ -636,7 +877,9 @@ in_arpinput(struct mbuf *m) */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) { + if (ifa->ifa_addr->sa_family == AF_INET && + (ifa->ifa_carp == NULL || + (*carp_iamatch_p)(ifa, &enaddr))) { ia = ifatoia(ifa); ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); @@ -647,35 +890,44 @@ in_arpinput(struct mbuf *m) /* * If bridging, fall back to using any inet address. */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto drop; } ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); match: if (!enaddr) enaddr = (u_int8_t *)IF_LLADDR(ifp); + carped = (ia->ia_ifa.ifa_carp != NULL); myaddr = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen)) goto drop; /* it's from me, ignore it. */ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) { - log(LOG_NOTICE, - "arp: link address is broadcast for IP address %s!\n", - inet_ntoa(isaddr)); + ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address " + "%s!\n", inet_ntoa(isaddr)); + goto drop; + } + + if (ifp->if_addrlen != ah->ar_hln) { + ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, " + "i/f %d (ignored)\n", ifp->if_addrlen, + (u_char *) ar_sha(ah), ":", ah->ar_hln, + ifp->if_addrlen); goto drop; } + /* * Warn if another host is using the same IP address, but only if the * IP address isn't 0.0.0.0, which is used for DHCP only, in which * case we suppress the warning to avoid false positive complaints of * potential misconfiguration. */ - if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) { - log(LOG_ERR, - "arp: %*D is using my IP address %s on %s!\n", + if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr && + myaddr.s_addr != 0) { + ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n", ifp->if_addrlen, (u_char *)ar_sha(ah), ":", inet_ntoa(isaddr), ifp->if_xname); itaddr = myaddr; @@ -689,95 +941,73 @@ match: sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr = isaddr; - flags = (itaddr.s_addr == myaddr.s_addr) ? 
LLE_CREATE : 0; - flags |= LLE_EXCLUSIVE; - IF_AFDATA_LOCK(ifp); - la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin); - IF_AFDATA_UNLOCK(ifp); - if (la != NULL) { - /* the following is not an error when doing bridging */ - if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) { - if (log_arp_wrong_iface) - log(LOG_WARNING, "arp: %s is on %s " - "but got reply from %*D on %s\n", - inet_ntoa(isaddr), - la->lle_tbl->llt_ifp->if_xname, - ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - ifp->if_xname); - LLE_WUNLOCK(la); + dst = (struct sockaddr *)&sin; + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); + IF_AFDATA_RUNLOCK(ifp); + if (la != NULL) + arp_check_update_lle(ah, isaddr, ifp, bridged, la); + else if (itaddr.s_addr == myaddr.s_addr) { + /* + * Request/reply to our address, but no lle exists yet. + * Calculate full link prepend to use in lle. + */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) goto reply; - } - if ((la->la_flags & LLE_VALID) && - bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { - if (la->la_flags & LLE_STATIC) { - LLE_WUNLOCK(la); - if (log_arp_permanent_modify) - log(LOG_ERR, - "arp: %*D attempts to modify " - "permanent entry for %s on %s\n", - ifp->if_addrlen, - (u_char *)ar_sha(ah), ":", - inet_ntoa(isaddr), ifp->if_xname); - goto reply; - } - if (log_arp_movements) { - log(LOG_INFO, "arp: %s moved from %*D " - "to %*D on %s\n", - inet_ntoa(isaddr), - ifp->if_addrlen, - (u_char *)&la->ll_addr, ":", - ifp->if_addrlen, (u_char *)ar_sha(ah), ":", - ifp->if_xname); - } - } - if (ifp->if_addrlen != ah->ar_hln) { - LLE_WUNLOCK(la); - log(LOG_WARNING, "arp from %*D: addr len: new %d, " - "i/f %d (ignored)\n", ifp->if_addrlen, - (u_char *) ar_sha(ah), ":", ah->ar_hln, - ifp->if_addrlen); - goto drop; - } - (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen); - la->la_flags |= LLE_VALID; + /* Allocate new entry */ + la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); + if (la == NULL) { - EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); + /* + * lle creation may fail if the source address belongs + * to a non-directly connected subnet. However, we + * will try to answer the request instead of dropping + * the frame. + */ + goto reply; + } + lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off); - if (!(la->la_flags & LLE_STATIC)) { - int canceled; + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(la); + la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); - LLE_ADDREF(la); - la->la_expire = time_uptime + V_arpt_keep; - canceled = callout_reset(&la->la_timer, - hz * V_arpt_keep, arptimer, la); - if (canceled) - LLE_REMREF(la); - } - la->la_asked = 0; - la->la_preempt = V_arp_maxtries; /* - * The packets are all freed within the call to the output - * routine. + * Check if the lle still does not exist. + * If it does, that means that we either + * 1) have configured it explicitly, via + * 1a) 'arp -s' static entry or + * 1b) interface address static record + * or + * 2) it was the result of sending the first packet to-host + * or + * 3) it was another arp reply packet we handled in + * a different thread. * - * NB: The lock MUST be released before the call to the - * output routine. + * In all cases except 3) we definitely need to prefer + * the existing lle. For the sake of simplicity, prefer any + * existing lle over the newly-created one.
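The hunk above is an instance of the standard optimistic-insert pattern: allocate the entry without the table lock held, then re-look it up under the write lock and keep whichever entry won the race. Condensed into a sketch (assembled from the code above, not additional patch content; error handling elided):

    la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); /* unlocked alloc */
    IF_AFDATA_WLOCK(ifp);
    LLE_WLOCK(la);
    la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
    if (la_tmp == NULL)
            lltable_link_entry(LLTABLE(ifp), la);   /* we won: publish ours */
    IF_AFDATA_WUNLOCK(ifp);
    if (la_tmp != NULL) {
            lltable_free_entry(LLTABLE(ifp), la);   /* we lost: discard ours */
            la = la_tmp;                            /* reuse the winner */
    }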
 */ - if (la->la_hold != NULL) { - struct mbuf *m_hold, *m_hold_next; + if (la_tmp == NULL) + lltable_link_entry(LLTABLE(ifp), la); + IF_AFDATA_WUNLOCK(ifp); - m_hold = la->la_hold; - la->la_hold = NULL; - la->la_numheld = 0; - memcpy(&sa, L3_ADDR(la), sizeof(sa)); - LLE_WUNLOCK(la); - for (; m_hold != NULL; m_hold = m_hold_next) { - m_hold_next = m_hold->m_nextpkt; - m_hold->m_nextpkt = NULL; - (*ifp->if_output)(ifp, m_hold, &sa, NULL); - } - } else + if (la_tmp == NULL) { + arp_mark_lle_reachable(la); LLE_WUNLOCK(la); + } else { + /* Free the newly-created entry and handle the packet */ + lltable_free_entry(LLTABLE(ifp), la); + la = la_tmp; + la_tmp = NULL; + arp_check_update_lle(ah, isaddr, ifp, bridged, la); + /* arp_check_update_lle() returns @la unlocked */ + } + la = NULL; } reply: if (op != ARPOP_REQUEST) @@ -798,7 +1028,7 @@ reply: if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); - (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); + (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { @@ -808,10 +1038,8 @@ reply: if (!V_arp_proxyall) goto drop; - sin.sin_addr = itaddr; /* XXX MRT use table 0 for arp reply */ - rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); - if (!rt) + if (fib4_lookup_nh_basic(0, itaddr, 0, 0, &nh4) != 0) goto drop; /* @@ -819,11 +1047,8 @@ reply: * as this one came out of, or we'll get into a fight * over who claims what Ether address. */ - if (!rt->rt_ifp || rt->rt_ifp == ifp) { - RTFREE_LOCKED(rt); + if (nh4.nh_ifp == ifp) goto drop; - } - RTFREE_LOCKED(rt); (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln); @@ -834,21 +1059,16 @@ reply: * avoids ARP chaos if an interface is connected to the * wrong network. */ - sin.sin_addr = isaddr; /* XXX MRT use table 0 for arp checks */ - rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); - if (!rt) + if (fib4_lookup_nh_basic(0, isaddr, 0, 0, &nh4) != 0) goto drop; - if (rt->rt_ifp != ifp) { - log(LOG_INFO, "arp_proxy: ignoring request" - " from %s via %s, expecting %s\n", - inet_ntoa(isaddr), ifp->if_xname, - rt->rt_ifp->if_xname); - RTFREE_LOCKED(rt); + if (nh4.nh_ifp != ifp) { + ARP_LOG(LOG_INFO, "proxy: ignoring request" + " from %s via %s\n", + inet_ntoa(isaddr), ifp->if_xname); goto drop; } - RTFREE_LOCKED(rt); #ifdef DEBUG_PROXY printf("arp: proxying for %s\n", inet_ntoa(itaddr)); @@ -878,7 +1098,29 @@ reply: m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; - (*ifp->if_output)(ifp, m, &sa, NULL); + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); + + /* + * arp_fillheader() may fail due to lack of support inside encap request + * routing. This is not necessarily an error; AF_ARP can/should be handled + * by if_output(). + */ + if (error != 0 && error != EAFNOSUPPORT) { + ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n", + if_name(ifp), error); + return; + } + + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + + m_clrprotoflags(m); /* Avoid confusing lower layers. */ + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; @@ -887,45 +1129,249 @@ drop: } #endif +/* + * Checks received ARP data against the existing @la. + * Updates lle state/performs notification if necessary.
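Note the new transmit convention in the reply path above: rather than passing a bare sockaddr and letting the output routine resolve the link-layer header, the caller may now hand if_output() a precomputed header through the ro_prepend/ro_plen fields of struct route. The minimal pattern, extracted from the code above as a sketch:

    struct route ro;

    bzero(&ro, sizeof(ro));
    ro.ro_prepend = linkhdr;        /* raw link-layer header bytes */
    ro.ro_plen = linkhdrsize;       /* length of that header */
    ro.ro_flags = 0;
    m_clrprotoflags(m);             /* avoid confusing lower layers */
    error = (*ifp->if_output)(ifp, m, &sa, &ro);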
 + */ +static void +arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, + int bridged, struct llentry *la) +{ + struct sockaddr sa; + struct mbuf *m_hold, *m_hold_next; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; + + LLE_WLOCK_ASSERT(la); + + /* the following is not an error when doing bridging */ + if (!bridged && la->lle_tbl->llt_ifp != ifp) { + if (log_arp_wrong_iface) + ARP_LOG(LOG_WARNING, "%s is on %s " + "but got reply from %*D on %s\n", + inet_ntoa(isaddr), + la->lle_tbl->llt_ifp->if_xname, + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_xname); + LLE_WUNLOCK(la); + return; + } + if ((la->la_flags & LLE_VALID) && + bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { + if (la->la_flags & LLE_STATIC) { + LLE_WUNLOCK(la); + if (log_arp_permanent_modify) + ARP_LOG(LOG_ERR, + "%*D attempts to modify " + "permanent entry for %s on %s\n", + ifp->if_addrlen, + (u_char *)ar_sha(ah), ":", + inet_ntoa(isaddr), ifp->if_xname); + return; + } + if (log_arp_movements) { + ARP_LOG(LOG_INFO, "%s moved from %*D " + "to %*D on %s\n", + inet_ntoa(isaddr), + ifp->if_addrlen, + (u_char *)&la->ll_addr, ":", + ifp->if_addrlen, (u_char *)ar_sha(ah), ":", + ifp->if_xname); + } + } + + /* Calculate full link prepend to use in lle */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) + return; + + /* Check if something has changed */ + if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || + (la->la_flags & LLE_VALID) == 0) { + /* Try to perform LLE update */ + if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off) == 0) + return; + + /* Clear fast path feedback request if set */ + la->r_skip_req = 0; + } + + arp_mark_lle_reachable(la); + + /* + * The packets are all freed within the call to the output + * routine. + * + * NB: The lock MUST be released before the call to the + * output routine. + */ + if (la->la_hold != NULL) { + m_hold = la->la_hold; + la->la_hold = NULL; + la->la_numheld = 0; + lltable_fill_sa_entry(la, &sa); + LLE_WUNLOCK(la); + for (; m_hold != NULL; m_hold = m_hold_next) { + m_hold_next = m_hold->m_nextpkt; + m_hold->m_nextpkt = NULL; + /* Avoid confusing lower layers. */ + m_clrprotoflags(m_hold); + (*ifp->if_output)(ifp, m_hold, &sa, NULL); + } + } else + LLE_WUNLOCK(la); +} + +static void +arp_mark_lle_reachable(struct llentry *la) +{ + int canceled, wtime; + + LLE_WLOCK_ASSERT(la); + + la->ln_state = ARP_LLINFO_REACHABLE; + EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED); + + if (!(la->la_flags & LLE_STATIC)) { + LLE_ADDREF(la); + la->la_expire = time_uptime + V_arpt_keep; + wtime = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit; + if (wtime < 0) + wtime = V_arpt_keep; + canceled = callout_reset(&la->lle_timer, + hz * wtime, arptimer, la); + if (canceled) + LLE_REMREF(la); + } + la->la_asked = 0; + la->la_preempt = V_arp_maxtries; +} + +/* + * Add a permanent link-layer record for the given interface address. + */ +static __noinline void +arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst) +{ + struct llentry *lle, *lle_tmp; + + /* + * The interface address LLE record is considered static + * because kernel code relies on the LLE_STATIC flag to check + * if these entries can be rewritten by arp updates.
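The rearm window computed in arp_mark_lle_reachable() above is easiest to see with concrete numbers. The sysctl defaults below are an assumption of this note (they are not set by the patch):

    /*
     * With (assumed) defaults arpt_keep = 1200 s, arp_maxtries = 5,
     * arpt_rexmit = 1 s:
     *
     *     wtime = 1200 - 5 * 1 = 1195 s
     *
     * i.e. arptimer() fires arp_maxtries * arpt_rexmit seconds before
     * la_expire, leaving time for a full round of retransmitted
     * requests to refresh the entry before it actually expires.
     */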
 + */ + lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst); + if (lle == NULL) { + log(LOG_INFO, "arp_ifinit: cannot create arp " + "entry for interface address\n"); + return; + } + + IF_AFDATA_WLOCK(ifp); + LLE_WLOCK(lle); + /* Unlink any existing entry */ + lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); + if (lle_tmp != NULL) + lltable_unlink_entry(LLTABLE(ifp), lle_tmp); + + lltable_link_entry(LLTABLE(ifp), lle); + IF_AFDATA_WUNLOCK(ifp); + + if (lle_tmp != NULL) + EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED); + + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED); + LLE_WUNLOCK(lle); + if (lle_tmp != NULL) + lltable_free_entry(LLTABLE(ifp), lle_tmp); +} + void arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { - struct llentry *lle; + const struct sockaddr_in *dst_in; + const struct sockaddr *dst; - if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { - arprequest(ifp, &IA_SIN(ifa)->sin_addr, - &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); - /* - * interface address is considered static entry - * because the output of the arp utility shows - * that L2 entry as permanent - */ - IF_AFDATA_LOCK(ifp); - lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC), - (struct sockaddr *)IA_SIN(ifa)); - IF_AFDATA_UNLOCK(ifp); - if (lle == NULL) - log(LOG_INFO, "arp_ifinit: cannot create arp " - "entry for interface address\n"); - else - LLE_RUNLOCK(lle); - } - ifa->ifa_rtrequest = NULL; + if (ifa->ifa_carp != NULL) + return; + + dst = ifa->ifa_addr; + dst_in = (const struct sockaddr_in *)dst; + + if (ntohl(dst_in->sin_addr.s_addr) == INADDR_ANY) + return; + arp_announce_ifaddr(ifp, dst_in->sin_addr, IF_LLADDR(ifp)); + + arp_add_ifa_lle(ifp, dst); } void -arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) +arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr) +{ + + if (ntohl(addr.s_addr) != INADDR_ANY) + arprequest(ifp, &addr, &addr, enaddr); +} + +/* + * Sends gratuitous ARPs for each ifaddr to notify other + * nodes about the address change. + */ +static __noinline void +arp_handle_ifllchange(struct ifnet *ifp) +{ + struct ifaddr *ifa; + + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family == AF_INET) + arp_ifinit(ifp, ifa); + } +} + +/* + * A handler for the interface link-layer address change event. + */ +static void +arp_iflladdr(void *arg __unused, struct ifnet *ifp) +{ + + lltable_update_ifaddr(LLTABLE(ifp)); + + if ((ifp->if_flags & IFF_UP) != 0) + arp_handle_ifllchange(ifp); +} + +static void +vnet_arp_init(void) { - if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) - arprequest(ifp, &IA_SIN(ifa)->sin_addr, - &IA_SIN(ifa)->sin_addr, enaddr); - ifa->ifa_rtrequest = NULL; + + if (IS_DEFAULT_VNET(curvnet)) { + netisr_register(&arp_nh); + iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, + arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + } +#ifdef VIMAGE + else + netisr_register_vnet(&arp_nh); +#endif } +VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, + vnet_arp_init, 0); +#ifdef VIMAGE +/* + * We have to unregister ARP along with IP, otherwise we risk doing INADDR_HASH + * lookups after destroying the hash. Ideally this would go on SI_ORDER_3.5.
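vnet_arp_init() above hooks iflladdr_event so that a link-layer address change re-announces every IPv4 address on the interface. For reference, the general EVENTHANDLER pattern it uses looks like this (handler name hypothetical; only the register/deregister calls are the real API):

    static eventhandler_tag my_tag;

    /* Handler is invoked with the ifnet whose lladdr changed. */
    my_tag = EVENTHANDLER_REGISTER(iflladdr_event, my_lladdr_handler,
        NULL, EVENTHANDLER_PRI_ANY);

    /* ... and on teardown: */
    EVENTHANDLER_DEREGISTER(iflladdr_event, my_tag);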
+ */ static void -arp_init(void) +vnet_arp_destroy(__unused void *arg) { - netisr_register(&arp_nh); + netisr_unregister_vnet(&arp_nh); } -SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); +VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, + vnet_arp_destroy, NULL); +#endif diff --git a/freebsd/sys/netinet/if_ether.h b/freebsd/sys/netinet/if_ether.h index ce63d8db..27e51f78 100644 --- a/freebsd/sys/netinet/if_ether.h +++ b/freebsd/sys/netinet/if_ether.h @@ -48,9 +48,9 @@ (enaddr)[0] = 0x01; \ (enaddr)[1] = 0x00; \ (enaddr)[2] = 0x5e; \ - (enaddr)[3] = ((u_char *)ipaddr)[1] & 0x7f; \ - (enaddr)[4] = ((u_char *)ipaddr)[2]; \ - (enaddr)[5] = ((u_char *)ipaddr)[3]; \ + (enaddr)[3] = ((const u_char *)ipaddr)[1] & 0x7f; \ + (enaddr)[4] = ((const u_char *)ipaddr)[2]; \ + (enaddr)[5] = ((const u_char *)ipaddr)[3]; \ } /* * Macro to map an IP6 multicast address to an Ethernet multicast address. @@ -63,10 +63,10 @@ { \ (enaddr)[0] = 0x33; \ (enaddr)[1] = 0x33; \ - (enaddr)[2] = ((u_char *)ip6addr)[12]; \ - (enaddr)[3] = ((u_char *)ip6addr)[13]; \ - (enaddr)[4] = ((u_char *)ip6addr)[14]; \ - (enaddr)[5] = ((u_char *)ip6addr)[15]; \ + (enaddr)[2] = ((const u_char *)ip6addr)[12]; \ + (enaddr)[3] = ((const u_char *)ip6addr)[13]; \ + (enaddr)[4] = ((const u_char *)ip6addr)[14]; \ + (enaddr)[5] = ((const u_char *)ip6addr)[15]; \ } /* @@ -89,6 +89,7 @@ struct ether_arp { #define arp_pln ea_hdr.ar_pln #define arp_op ea_hdr.ar_op +#ifndef BURN_BRIDGES /* Can be used by third party software. */ struct sockaddr_inarp { u_char sin_len; u_char sin_family; @@ -99,6 +100,8 @@ struct sockaddr_inarp { u_short sin_other; #define SIN_PROXY 1 }; +#endif /* !BURN_BRIDGES */ + /* * IP and ethernet specific routing flags */ @@ -109,14 +112,19 @@ struct sockaddr_inarp { extern u_char ether_ipmulticast_min[ETHER_ADDR_LEN]; extern u_char ether_ipmulticast_max[ETHER_ADDR_LEN]; -struct llentry; struct ifaddr; +struct llentry; -int arpresolve(struct ifnet *ifp, struct rtentry *rt, - struct mbuf *m, struct sockaddr *dst, u_char *desten, - struct llentry **lle); +int arpresolve_addr(struct ifnet *ifp, int flags, + const struct sockaddr *dst, char *desten, uint32_t *pflags, + struct llentry **plle); +int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, + const struct sockaddr *dst, u_char *desten, uint32_t *pflags, + struct llentry **plle); +void arprequest(struct ifnet *, const struct in_addr *, + const struct in_addr *, u_char *); void arp_ifinit(struct ifnet *, struct ifaddr *); -void arp_ifinit2(struct ifnet *, struct ifaddr *, u_char *); +void arp_announce_ifaddr(struct ifnet *, struct in_addr addr, u_char *); #endif #endif diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c index 78d9685b..cd57e426 100644 --- a/freebsd/sys/netinet/igmp.c +++ b/freebsd/sys/netinet/igmp.c @@ -52,6 +52,8 @@ #include __FBSDID("$FreeBSD$"); +#include + #include #include #include @@ -60,11 +62,18 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include +#ifdef DDB +#include +#endif + #include +#include #include #include @@ -85,15 +94,15 @@ __FBSDID("$FreeBSD$"); #define KTR_IGMPV3 KTR_INET #endif -static struct igmp_ifinfo * +static struct igmp_ifsoftc * igi_alloc_locked(struct ifnet *); static void igi_delete_locked(const struct ifnet *); -static void igmp_dispatch_queue(struct ifqueue *, int, const int); +static void igmp_dispatch_queue(struct mbufq *, int, const int); static void igmp_fasttimo_vnet(void); -static void igmp_final_leave(struct 
in_multi *, struct igmp_ifinfo *); +static void igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *); static int igmp_handle_state_change(struct in_multi *, - struct igmp_ifinfo *); -static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); + struct igmp_ifsoftc *); +static int igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *); static int igmp_input_v1_query(struct ifnet *, const struct ip *, const struct igmp *); static int igmp_input_v2_query(struct ifnet *, const struct ip *, @@ -101,7 +110,7 @@ static int igmp_input_v2_query(struct ifnet *, const struct ip *, static int igmp_input_v3_query(struct ifnet *, const struct ip *, /*const*/ struct igmpv3 *); static int igmp_input_v3_group_query(struct in_multi *, - struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *); + struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *); static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, /*const*/ struct igmp *); static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, @@ -113,25 +122,25 @@ static struct mbuf * #ifdef KTR static char * igmp_rec_type_to_str(const int); #endif -static void igmp_set_version(struct igmp_ifinfo *, const int); +static void igmp_set_version(struct igmp_ifsoftc *, const int); static void igmp_slowtimo_vnet(void); static int igmp_v1v2_queue_report(struct in_multi *, const int); static void igmp_v1v2_process_group_timer(struct in_multi *, const int); -static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); +static void igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *); static void igmp_v2_update_group(struct in_multi *, const int); -static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); -static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); +static void igmp_v3_cancel_link_timers(struct igmp_ifsoftc *); +static void igmp_v3_dispatch_general_query(struct igmp_ifsoftc *); static struct mbuf * igmp_v3_encap_report(struct ifnet *, struct mbuf *); -static int igmp_v3_enqueue_group_record(struct ifqueue *, +static int igmp_v3_enqueue_group_record(struct mbufq *, struct in_multi *, const int, const int, const int); -static int igmp_v3_enqueue_filter_change(struct ifqueue *, +static int igmp_v3_enqueue_filter_change(struct mbufq *, struct in_multi *); -static void igmp_v3_process_group_timers(struct igmp_ifinfo *, - struct ifqueue *, struct ifqueue *, struct in_multi *, +static void igmp_v3_process_group_timers(struct igmp_ifsoftc *, + struct mbufq *, struct mbufq *, struct in_multi *, const int); static int igmp_v3_merge_state_changes(struct in_multi *, - struct ifqueue *); + struct mbufq *); static void igmp_v3_suppress_group_record(struct in_multi *); static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); @@ -159,13 +168,13 @@ static const struct netisr_handler igmp_nh = { * * All output is delegated to the netisr. * Now that Giant has been eliminated, the netisr may be inlined. * * IN_MULTI_LOCK covers in_multi. - * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file, + * * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file, * including the output queue. * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of * per-link state iterators. - * * igmp_ifinfo is valid as long as PF_INET is attached to the interface, + * * igmp_ifsoftc is valid as long as PF_INET is attached to the interface, * therefore it is not refcounted. 
- * We allow unlocked reads of igmp_ifinfo when accessed via in_multi. + * We allow unlocked reads of igmp_ifsoftc when accessed via in_multi. * * Reference counting * * IGMP acquires its own reference every time an in_multi is passed to @@ -220,7 +229,8 @@ static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host #define V_state_change_timers_running VNET(state_change_timers_running) #define V_current_state_timers_running VNET(current_state_timers_running) -static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head); +static VNET_DEFINE(LIST_HEAD(, igmp_ifsoftc), igi_head) = + LIST_HEAD_INITIALIZER(igi_head); static VNET_DEFINE(struct igmpstat, igmpstat) = { .igps_version = IGPS_VERSION_3, .igps_len = sizeof(struct igmpstat), @@ -250,32 +260,32 @@ static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; /* * Virtualized sysctls. */ -SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmpstat), igmpstat, ""); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_recvifkludge), 0, "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendra), 0, "Send IP Router Alert option in IGMPv2/v3 messages"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_sendlocal), 0, "Send IGMP membership reports for 224.0.0.0/24 groups"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v1enable), 0, "Enable backwards compatibility with IGMPv1"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_v2enable), 0, "Enable backwards compatibility with IGMPv2"); -SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW, +SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(igmp_legacysupp), 0, "Allow v1/v2 reports to suppress v3 group responses"); -SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", "Default version of IGMP to run on each interface"); -SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", "Rate limit for IGMPv3 Group-and-Source queries in seconds"); @@ -291,7 +301,7 @@ igmp_save_context(struct mbuf *m, struct ifnet *ifp) { #ifdef VIMAGE - m->m_pkthdr.header = ifp->if_vnet; + m->m_pkthdr.PH_loc.ptr = ifp->if_vnet; #endif /* VIMAGE */ m->m_pkthdr.flowid = ifp->if_index; } @@ -300,7 +310,7 @@ static __inline void igmp_scrub_context(struct mbuf *m) { - m->m_pkthdr.header = NULL; + m->m_pkthdr.PH_loc.ptr = NULL; m->m_pkthdr.flowid = 0; } @@ -328,7 +338,7 @@ igmp_restore_context(struct mbuf *m) #ifdef notyet #if 
defined(VIMAGE) && defined(INVARIANTS) - KASSERT(curvnet == (m->m_pkthdr.header), + KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr), ("%s: called when curvnet was not restored", __func__)); #endif #endif @@ -413,7 +423,7 @@ out_locked: } /* - * Expose struct igmp_ifinfo to userland, keyed by ifindex. + * Expose struct igmp_ifsoftc to userland, keyed by ifindex. * For use by ifmcstat(8). * * SMPng: NOTE: Does an unlocked ifindex space read. @@ -427,7 +437,7 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) int error; u_int namelen; struct ifnet *ifp; - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; name = (int *)arg1; namelen = arg2; @@ -458,8 +468,18 @@ sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) LIST_FOREACH(igi, &V_igi_head, igi_link) { if (ifp == igi->igi_ifp) { - error = SYSCTL_OUT(req, igi, - sizeof(struct igmp_ifinfo)); + struct igmp_ifinfo info; + + info.igi_version = igi->igi_version; + info.igi_v1_timer = igi->igi_v1_timer; + info.igi_v2_timer = igi->igi_v2_timer; + info.igi_v3_timer = igi->igi_v3_timer; + info.igi_flags = igi->igi_flags; + info.igi_rv = igi->igi_rv; + info.igi_qi = igi->igi_qi; + info.igi_qri = igi->igi_qri; + info.igi_uri = igi->igi_uri; + error = SYSCTL_OUT(req, &info, sizeof(info)); break; } } @@ -476,15 +496,12 @@ out_locked: * VIMAGE: Assumes the vnet pointer has been set. */ static void -igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop) +igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop) { struct mbuf *m; - for (;;) { - _IF_DEQUEUE(ifq, m); - if (m == NULL) - break; - CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m); + while ((m = mbufq_dequeue(mq)) != NULL) { + CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, mq, m); if (loop) m->m_flags |= M_IGMP_LOOP; netisr_dispatch(NETISR_IGMP, m); @@ -525,7 +542,7 @@ igmp_ra_alloc(void) struct mbuf *m; struct ipoption *p; - MGET(m, M_DONTWAIT, MT_DATA); + m = m_get(M_WAITOK, MT_DATA); p = mtod(m, struct ipoption *); p->ipopt_dst.s_addr = INADDR_ANY; p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ @@ -540,10 +557,10 @@ igmp_ra_alloc(void) /* * Attach IGMP when PF_INET is attached to an interface. */ -struct igmp_ifinfo * +struct igmp_ifsoftc * igmp_domifattach(struct ifnet *ifp) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); @@ -562,14 +579,14 @@ igmp_domifattach(struct ifnet *ifp) /* * VIMAGE: assume curvnet set by caller. */ -static struct igmp_ifinfo * +static struct igmp_ifsoftc * igi_alloc_locked(/*const*/ struct ifnet *ifp) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; IGMP_LOCK_ASSERT(); - igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO); + igi = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO); if (igi == NULL) goto out; @@ -580,17 +597,12 @@ igi_alloc_locked(/*const*/ struct ifnet *ifp) igi->igi_qi = IGMP_QI_INIT; igi->igi_qri = IGMP_QRI_INIT; igi->igi_uri = IGMP_URI_INIT; - SLIST_INIT(&igi->igi_relinmhead); - - /* - * Responses to general queries are subject to bounds. 
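A theme running through the rest of this file is the conversion from the old ifqueue macros to the mbufq API, as in the igi_gq initialization just below. The correspondence, condensed into a sketch (error handling elided):

    struct mbufq mq;
    struct mbuf *m;

    mbufq_init(&mq, IGMP_MAX_RESPONSE_PACKETS);  /* bounded, as before */
    if (!mbufq_full(&mq))                        /* replaces _IF_QFULL */
            mbufq_enqueue(&mq, m);               /* replaces _IF_ENQUEUE */
    while ((m = mbufq_dequeue(&mq)) != NULL)     /* replaces _IF_DEQUEUE */
            netisr_dispatch(NETISR_IGMP, m);
    mbufq_drain(&mq);                            /* replaces _IF_DRAIN */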
- */ - IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); + mbufq_init(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); - CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)", + CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)", ifp, ifp->if_xname); out: @@ -609,7 +621,7 @@ out: void igmp_ifdetach(struct ifnet *ifp) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm, *tinm; @@ -656,25 +668,21 @@ igmp_ifdetach(struct ifnet *ifp) void igmp_domifdetach(struct ifnet *ifp) { - struct igmp_ifinfo *igi; CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK(); - - igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; igi_delete_locked(ifp); - IGMP_UNLOCK(); } static void igi_delete_locked(const struct ifnet *ifp) { - struct igmp_ifinfo *igi, *tigi; + struct igmp_ifsoftc *igi, *tigi; - CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)", + CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)", __func__, ifp, ifp->if_xname); IGMP_LOCK_ASSERT(); @@ -684,7 +692,7 @@ igi_delete_locked(const struct ifnet *ifp) /* * Free deferred General Query responses. */ - _IF_DRAIN(&igi->igi_gq); + mbufq_drain(&igi->igi_gq); LIST_REMOVE(igi, igi_link); @@ -696,10 +704,6 @@ igi_delete_locked(const struct ifnet *ifp) return; } } - -#ifdef INVARIANTS - panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); -#endif } /* @@ -713,7 +717,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct in_multi *inm; /* @@ -733,7 +737,7 @@ igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; - KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)", @@ -798,7 +802,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, const struct igmp *igmp) { struct ifmultiaddr *ifma; - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint16_t timer; @@ -827,7 +831,7 @@ igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; - KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", @@ -948,7 +952,7 @@ static int igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, /*const*/ struct igmpv3 *igmpv3) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct in_multi *inm; int is_general_query; uint32_t maxresp, nsrc, qqi; @@ -1021,7 +1025,7 @@ igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; - KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); if (igi->igi_flags & IGIF_LOOPBACK) { CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", @@ -1104,12 +1108,12 @@ out_locked: } /* - * Process a recieved IGMPv3 group-specific or group-and-source-specific + * Process a received 
IGMPv3 group-specific or group-and-source-specific * query. - * Return <0 if any error occured. Currently this is ignored. + * Return <0 if any error occurred. Currently this is ignored. */ static int -igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi, +igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi, int timer, /*const*/ struct igmpv3 *igmpv3) { int retval; @@ -1214,6 +1218,7 @@ static int igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; @@ -1236,7 +1241,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, * Replace 0.0.0.0 with the subnet address if told to do so. */ if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src.s_addr = htonl(ia->ia_subnet); ifa_free(&ia->ia_ifa); @@ -1254,7 +1259,7 @@ igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; igi = inm->inm_igi; if (igi == NULL) { @@ -1322,6 +1327,7 @@ static int igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, /*const*/ struct igmp *igmp) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_multi *inm; @@ -1330,7 +1336,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, * leave requires knowing that we are the only member of a * group. */ - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { ifa_free(&ia->ia_ifa); return (0); @@ -1378,7 +1384,7 @@ igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, IN_MULTI_LOCK(); inm = inm_lookup(ifp, igmp->igmp_group); if (inm != NULL) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; igi = inm->inm_igi; KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); @@ -1425,26 +1431,29 @@ out_locked: return (0); } -void -igmp_input(struct mbuf *m, int off) +int +igmp_input(struct mbuf **mp, int *offp, int proto) { int iphlen; struct ifnet *ifp; struct igmp *igmp; struct ip *ip; + struct mbuf *m; int igmplen; int minlen; int queryver; - CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off); + CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp); + m = *mp; ifp = m->m_pkthdr.rcvif; + *mp = NULL; IGMPSTAT_INC(igps_rcv_total); ip = mtod(m, struct ip *); - iphlen = off; - igmplen = ip->ip_len; + iphlen = *offp; + igmplen = ntohs(ip->ip_len) - iphlen; /* * Validate lengths. 
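Two conventions change in the igmp_input() hunk above: ip_len is now kept in network byte order in the mbuf (hence the ntohs() when computing igmplen), and the function adopts the newer protocol-input signature, taking struct mbuf ** and int *, returning the next protocol to run or IPPROTO_DONE, and clearing *mp when it consumes the packet. A skeletal handler in this style (illustrative only; example_input and bad_packet are not in the patch):

    static int
    example_input(struct mbuf **mp, int *offp, int proto)
    {
            struct mbuf *m = *mp;

            *mp = NULL;                     /* we own the mbuf now */
            if (bad_packet(m)) {            /* placeholder validation */
                    m_freem(m);
                    return (IPPROTO_DONE);  /* chain ends here */
            }
            *mp = m;                        /* hand the mbuf on */
            return (rip_input(mp, offp, proto));
    }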
@@ -1452,7 +1461,7 @@ igmp_input(struct mbuf *m, int off) if (igmplen < IGMP_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); - return; + return (IPPROTO_DONE); } /* @@ -1464,10 +1473,10 @@ igmp_input(struct mbuf *m, int off) minlen += IGMP_V3_QUERY_MINLEN; else minlen += IGMP_MINLEN; - if ((m->m_flags & M_EXT || m->m_len < minlen) && - (m = m_pullup(m, minlen)) == 0) { + if ((!M_WRITABLE(m) || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); @@ -1480,7 +1489,7 @@ igmp_input(struct mbuf *m, int off) if (in_cksum(m, igmplen)) { IGMPSTAT_INC(igps_rcv_badsum); m_freem(m); - return; + return (IPPROTO_DONE); } m->m_data -= iphlen; m->m_len += iphlen; @@ -1493,7 +1502,7 @@ igmp_input(struct mbuf *m, int off) if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { IGMPSTAT_INC(igps_rcv_badttl); m_freem(m); - return; + return (IPPROTO_DONE); } switch (igmp->igmp_type) { @@ -1508,7 +1517,7 @@ igmp_input(struct mbuf *m, int off) } else { IGMPSTAT_INC(igps_rcv_tooshort); m_freem(m); - return; + return (IPPROTO_DONE); } switch (queryver) { @@ -1518,7 +1527,7 @@ igmp_input(struct mbuf *m, int off) break; if (igmp_input_v1_query(ifp, ip, igmp) != 0) { m_freem(m); - return; + return (IPPROTO_DONE); } break; @@ -1528,7 +1537,7 @@ igmp_input(struct mbuf *m, int off) break; if (igmp_input_v2_query(ifp, ip, igmp) != 0) { m_freem(m); - return; + return (IPPROTO_DONE); } break; @@ -1546,25 +1555,25 @@ igmp_input(struct mbuf *m, int off) if (nsrc * sizeof(in_addr_t) > UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) { IGMPSTAT_INC(igps_rcv_tooshort); - return; + return (IPPROTO_DONE); } /* * m_pullup() may modify m, so pullup in * this scope. */ igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + - sizeof(struct in_addr) * nsrc; - if ((m->m_flags & M_EXT || + sizeof(struct in_addr) * nsrc; + if ((!M_WRITABLE(m) || m->m_len < igmpv3len) && (m = m_pullup(m, igmpv3len)) == NULL) { IGMPSTAT_INC(igps_rcv_tooshort); - return; + return (IPPROTO_DONE); } igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + iphlen); if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { m_freem(m); - return; + return (IPPROTO_DONE); } } break; @@ -1576,7 +1585,7 @@ igmp_input(struct mbuf *m, int off) break; if (igmp_input_v1_report(ifp, ip, igmp) != 0) { m_freem(m); - return; + return (IPPROTO_DONE); } break; @@ -1587,7 +1596,7 @@ igmp_input(struct mbuf *m, int off) IGMPSTAT_INC(igps_rcv_nora); if (igmp_input_v2_report(ifp, ip, igmp) != 0) { m_freem(m); - return; + return (IPPROTO_DONE); } break; @@ -1608,7 +1617,8 @@ igmp_input(struct mbuf *m, int off) * Pass all valid IGMP packets up to any process(es) listening on a * raw IGMP socket. */ - rip_input(m, off); + *mp = m; + return (rip_input(mp, offp, proto)); } @@ -1639,10 +1649,10 @@ igmp_fasttimo(void) static void igmp_fasttimo_vnet(void) { - struct ifqueue scq; /* State-change packets */ - struct ifqueue qrq; /* Query response packets */ + struct mbufq scq; /* State-change packets */ + struct mbufq qrq; /* Query response packets */ struct ifnet *ifp; - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct ifmultiaddr *ifma; struct in_multi *inm; int loop, uri_fasthz; @@ -1701,12 +1711,8 @@ igmp_fasttimo_vnet(void) loop = (igi->igi_flags & IGIF_LOOPBACK) ? 
1 : 0; uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_FASTHZ); - - memset(&qrq, 0, sizeof(struct ifqueue)); - IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS); - - memset(&scq, 0, sizeof(struct ifqueue)); - IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); + mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS); + mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); } IF_ADDR_RLOCK(ifp); @@ -1804,8 +1810,8 @@ igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) * Note: Unlocked read from igi. */ static void -igmp_v3_process_group_timers(struct igmp_ifinfo *igi, - struct ifqueue *qrq, struct ifqueue *scq, +igmp_v3_process_group_timers(struct igmp_ifsoftc *igi, + struct mbufq *qrq, struct mbufq *scq, struct in_multi *inm, const int uri_fasthz) { int query_response_timer_expired; @@ -1951,7 +1957,7 @@ igmp_v3_suppress_group_record(struct in_multi *inm) * as per Section 7.2.1. */ static void -igmp_set_version(struct igmp_ifinfo *igi, const int version) +igmp_set_version(struct igmp_ifsoftc *igi, const int version) { int old_version_timer; @@ -2000,7 +2006,7 @@ igmp_set_version(struct igmp_ifinfo *igi, const int version) * query processing. */ static void -igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) +igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; @@ -2067,7 +2073,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) */ inm->inm_sctimer = 0; inm->inm_timer = 0; - _IF_DRAIN(&inm->inm_scq); + mbufq_drain(&inm->inm_scq); } IF_ADDR_RUNLOCK(ifp); SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, tinm) { @@ -2081,7 +2087,7 @@ igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) * See Section 7.2.1 of RFC 3376. */ static void -igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) +igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi) { IGMP_LOCK_ASSERT(); @@ -2122,6 +2128,7 @@ igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) __func__, igi->igi_version, IGMP_VERSION_2, igi->igi_ifp, igi->igi_ifp->if_xname); igi->igi_version = IGMP_VERSION_2; + igmp_v3_cancel_link_timers(igi); } } } else if (igi->igi_v1_timer > 0) { @@ -2176,7 +2183,7 @@ igmp_slowtimo(void) static void igmp_slowtimo_vnet(void) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; IGMP_LOCK(); @@ -2204,10 +2211,10 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type) ifp = inm->inm_ifp; - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOMEM); - MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); + M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); @@ -2226,7 +2233,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type) ip = mtod(m, struct ip *); ip->ip_tos = 0; - ip->ip_len = sizeof(struct ip) + sizeof(struct igmp); + ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp)); ip->ip_off = 0; ip->ip_p = IPPROTO_IGMP; ip->ip_src.s_addr = INADDR_ANY; @@ -2272,7 +2279,7 @@ igmp_v1v2_queue_report(struct in_multi *inm, const int type) int igmp_change_state(struct in_multi *inm) { - struct igmp_ifinfo *igi; + struct igmp_ifsoftc *igi; struct ifnet *ifp; int error; @@ -2295,7 +2302,7 @@ igmp_change_state(struct in_multi *inm) IGMP_LOCK(); igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; - KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp)); /* * If we detect a state transition to or from MCAST_UNDEFINED @@ -2336,10 +2343,10 
@@ out_locked: * initial state of the membership. */ static int -igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) +igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; - struct ifqueue *ifq; + struct mbufq *mq; int error, retval, syncstates; CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", @@ -2413,9 +2420,9 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) * Don't kick the timers if there is nothing to do, * or if an error occurred. */ - ifq = &inm->inm_scq; - _IF_DRAIN(ifq); - retval = igmp_v3_enqueue_group_record(ifq, inm, 1, + mq = &inm->inm_scq; + mbufq_drain(mq); + retval = igmp_v3_enqueue_group_record(mq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); @@ -2464,7 +2471,7 @@ igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) * Issue an intermediate state change during the IGMP life-cycle. */ static int -igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) +igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi) { struct ifnet *ifp; int retval; @@ -2495,7 +2502,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) return (0); } - _IF_DRAIN(&inm->inm_scq); + mbufq_drain(&inm->inm_scq); retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); @@ -2523,7 +2530,7 @@ igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) * to INCLUDE {} for immediate transmission. */ static void -igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) +igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi) { int syncstates; @@ -2564,7 +2571,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) * TO_IN {} to be sent on the next fast timeout, * giving us an opportunity to merge reports. */ - _IF_DRAIN(&inm->inm_scq); + mbufq_drain(&inm->inm_scq); inm->inm_timer = 0; if (igi->igi_flags & IGIF_LOOPBACK) { inm->inm_scrv = 1; @@ -2642,7 +2649,7 @@ igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) * no record(s) were appended. */ static int -igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, +igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm, const int is_state_change, const int is_group_query, const int is_source_query) { @@ -2732,7 +2739,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, * Generate the filter list changes using a separate function. */ if (is_filter_list_change) - return (igmp_v3_enqueue_filter_change(ifq, inm)); + return (igmp_v3_enqueue_filter_change(mq, inm)); if (type == IGMP_DO_NOTHING) { CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", @@ -2762,7 +2769,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, * Note: Group records for G/GSR query responses MUST be sent * in their own packet. 
*/ - m0 = ifq->ifq_tail; + m0 = mbufq_last(mq); if (!is_group_query && m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && @@ -2773,7 +2780,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, m = m0; CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); } else { - if (_IF_QFULL(ifq)) { + if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } @@ -2781,14 +2788,14 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); if (!is_state_change && !is_group_query) { - m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; } if (m == NULL) { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m) - MH_ALIGN(m, IGMP_LEADINGSPACE); + M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); @@ -2886,7 +2893,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, if (m != m0) { CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); m->m_pkthdr.PH_vt.vt_nrecs = 1; - _IF_ENQUEUE(ifq, m); + mbufq_enqueue(mq, m); } else m->m_pkthdr.PH_vt.vt_nrecs++; @@ -2902,17 +2909,17 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, * Always try for a cluster first. */ while (nims != NULL) { - if (_IF_QFULL(ifq)) { + if (mbufq_full(mq)) { CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); return (-ENOMEM); } - m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m) - MH_ALIGN(m, IGMP_LEADINGSPACE); + M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) return (-ENOMEM); @@ -2965,7 +2972,7 @@ igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, nbytes += (msrcs * sizeof(in_addr_t)); CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); - _IF_ENQUEUE(ifq, m); + mbufq_enqueue(mq, m); } return (nbytes); @@ -3005,7 +3012,7 @@ typedef enum { * no record(s) were appended. 
*/ static int -igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) +igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm) { static const int MINRECLEN = sizeof(struct igmp_grouprec) + sizeof(in_addr_t); @@ -3049,7 +3056,7 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) */ while (drt != REC_FULL) { do { - m0 = ifq->ifq_tail; + m0 = mbufq_last(mq); if (m0 != NULL && (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && @@ -3062,13 +3069,13 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) CTR1(KTR_IGMPV3, "%s: use previous packet", __func__); } else { - m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m) m->m_data += IGMP_LEADINGSPACE; if (m == NULL) { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m) - MH_ALIGN(m, IGMP_LEADINGSPACE); + M_ALIGN(m, IGMP_LEADINGSPACE); } if (m == NULL) { CTR1(KTR_IGMPV3, @@ -3196,7 +3203,7 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) */ m->m_pkthdr.PH_vt.vt_nrecs++; if (m != m0) - _IF_ENQUEUE(ifq, m); + mbufq_enqueue(mq, m); nbytes += npbytes; } while (nims != NULL); drt |= crt; @@ -3210,9 +3217,9 @@ igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) } static int -igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) +igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq) { - struct ifqueue *gq; + struct mbufq *gq; struct mbuf *m; /* pending state-change */ struct mbuf *m0; /* copy of pending state-change */ struct mbuf *mt; /* last state-change in packet */ @@ -3235,13 +3242,13 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) gq = &inm->inm_scq; #ifdef KTR - if (gq->ifq_head == NULL) { + if (mbufq_first(gq) == NULL) { CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", __func__, inm); } #endif - m = gq->ifq_head; + m = mbufq_first(gq); while (m != NULL) { /* * Only merge the report into the current packet if @@ -3252,7 +3259,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) * allocated clusters. */ domerge = 0; - mt = ifscq->ifq_tail; + mt = mbufq_last(scq); if (mt != NULL) { recslen = m_length(m, NULL); @@ -3264,7 +3271,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) domerge = 1; } - if (!domerge && _IF_QFULL(gq)) { + if (!domerge && mbufq_full(gq)) { CTR2(KTR_IGMPV3, "%s: outbound queue full, skipping whole packet %p", __func__, m); @@ -3277,7 +3284,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) if (!docopy) { CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); - _IF_DEQUEUE(gq, m0); + m0 = mbufq_dequeue(gq); m = m0->m_nextpkt; } else { CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); @@ -3289,13 +3296,13 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) } if (!domerge) { - CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)", - __func__, m0, ifscq); - _IF_ENQUEUE(ifscq, m0); + CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)", + __func__, m0, scq); + mbufq_enqueue(scq, m0); } else { struct mbuf *mtl; /* last mbuf of packet mt */ - CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)", + CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)", __func__, m0, mt); mtl = m_last(mt); @@ -3315,7 +3322,7 @@ igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) * Respond to a pending IGMPv3 General Query. 
 */ static void -igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) +igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi) { struct ifmultiaddr *ifma; struct ifnet *ifp; @@ -3328,6 +3335,15 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) KASSERT(igi->igi_version == IGMP_VERSION_3, ("%s: called when version %d", __func__, igi->igi_version)); + /* + * Check whether there are packets already queued. If so, send them first. + * For a large number of groups the reply to a general query can take + * many packets; we should finish sending them before starting to + * queue the new reply. + */ + if (mbufq_len(&igi->igi_gq) != 0) + goto send; + ifp = igi->igi_ifp; IF_ADDR_RLOCK(ifp); @@ -3363,13 +3379,14 @@ igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) } IF_ADDR_RUNLOCK(ifp); +send: loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); /* * Slew transmission of bursts over 500ms intervals. */ - if (igi->igi_gq.ifq_head != NULL) { + if (mbufq_first(&igi->igi_gq) != NULL) { igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( IGMP_RESPONSE_BURST_INTERVAL); V_interface_timers_running = 1; @@ -3403,7 +3420,7 @@ igmp_intr(struct mbuf *m) * indexes to guard against interface detach, they are * unique to each VIMAGE and must be retrieved. */ - CURVNET_SET((struct vnet *)(m->m_pkthdr.header)); + CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr)); ifindex = igmp_restore_context(m); /* @@ -3450,7 +3467,7 @@ igmp_intr(struct mbuf *m) } igmp_scrub_context(m0); - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); m0->m_pkthdr.rcvif = V_loif; #ifdef MAC mac_netinet_igmp_send(ifp, m0); @@ -3485,6 +3502,7 @@ out: static struct mbuf * igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct igmp_report *igmp; struct ip *ip; int hdrlen, igmpreclen; @@ -3498,7 +3516,7 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) if (m->m_flags & M_IGMPV3_HDR) { igmpreclen -= hdrlen; } else { - M_PREPEND(m, hdrlen, M_DONTWAIT); + M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) return (NULL); m->m_flags |= M_IGMPV3_HDR; @@ -3523,8 +3541,8 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) ip = mtod(m, struct ip *); ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; - ip->ip_len = hdrlen + igmpreclen; - ip->ip_off = IP_DF; + ip->ip_len = htons(hdrlen + igmpreclen); + ip->ip_off = htons(IP_DF); ip->ip_p = IPPROTO_IGMP; ip->ip_sum = 0; @@ -3533,7 +3551,7 @@ igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) if (m->m_flags & M_IGMP_LOOP) { struct in_ifaddr *ia; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { ip->ip_src = ia->ia_addr.sin_addr; ifa_free(&ia->ia_ifa); @@ -3576,70 +3594,82 @@ igmp_rec_type_to_str(const int type) } #endif +#ifdef VIMAGE static void -igmp_init(void *unused __unused) +vnet_igmp_init(const void *unused __unused) { - CTR1(KTR_IGMPV3, "%s: initializing", __func__); - - IGMP_LOCK_INIT(); - - m_raopt = igmp_ra_alloc(); - - netisr_register(&igmp_nh); + netisr_register_vnet(&igmp_nh); } -SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL); +VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, + vnet_igmp_init, NULL); static void -igmp_uninit(void *unused __unused) +vnet_igmp_uninit(const void *unused __unused) { + /* This can happen when we shut down the entire network stack.
 */ CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - netisr_unregister(&igmp_nh); - - m_free(m_raopt); - m_raopt = NULL; - - IGMP_LOCK_DESTROY(); + netisr_unregister_vnet(&igmp_nh); } -SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL); +VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, + vnet_igmp_uninit, NULL); +#endif -static void -vnet_igmp_init(const void *unused __unused) +#ifdef DDB +DB_SHOW_COMMAND(igi_list, db_show_igi_list) { + struct igmp_ifsoftc *igi, *tigi; + LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head; - CTR1(KTR_IGMPV3, "%s: initializing", __func__); - - LIST_INIT(&V_igi_head); -} -VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init, - NULL); - -static void -vnet_igmp_uninit(const void *unused __unused) -{ - - CTR1(KTR_IGMPV3, "%s: tearing down", __func__); - - KASSERT(LIST_EMPTY(&V_igi_head), - ("%s: igi list not empty; ifnets not detached?", __func__)); + if (!have_addr) { + db_printf("usage: show igi_list <addr>\n"); + return; + } + igi_head = (struct _igi_list *)addr; + + LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) { + db_printf("igmp_ifsoftc %p:\n", igi); + db_printf(" ifp %p\n", igi->igi_ifp); + db_printf(" version %u\n", igi->igi_version); + db_printf(" v1_timer %u\n", igi->igi_v1_timer); + db_printf(" v2_timer %u\n", igi->igi_v2_timer); + db_printf(" v3_timer %u\n", igi->igi_v3_timer); + db_printf(" flags %#x\n", igi->igi_flags); + db_printf(" rv %u\n", igi->igi_rv); + db_printf(" qi %u\n", igi->igi_qi); + db_printf(" qri %u\n", igi->igi_qri); + db_printf(" uri %u\n", igi->igi_uri); + /* SLIST_HEAD(,in_multi) igi_relinmhead */ + /* struct mbufq igi_gq; */ + db_printf("\n"); + } } -VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, - vnet_igmp_uninit, NULL); +#endif static int igmp_modevent(module_t mod, int type, void *unused __unused) { - switch (type) { - case MOD_LOAD: - case MOD_UNLOAD: - break; - default: - return (EOPNOTSUPP); - } - return (0); + switch (type) { + case MOD_LOAD: + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + IGMP_LOCK_INIT(); + m_raopt = igmp_ra_alloc(); + netisr_register(&igmp_nh); + break; + case MOD_UNLOAD: + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + netisr_unregister(&igmp_nh); + m_free(m_raopt); + m_raopt = NULL; + IGMP_LOCK_DESTROY(); + break; + default: + return (EOPNOTSUPP); + } + return (0); } static moduledata_t igmp_mod = { @@ -3647,4 +3677,4 @@ static moduledata_t igmp_mod = { igmp_modevent, 0 }; -DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); +DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h index ca17158f..5242d07d 100644 --- a/freebsd/sys/netinet/igmp_var.h +++ b/freebsd/sys/netinet/igmp_var.h @@ -46,24 +46,6 @@ * MULTICAST Revision: 3.5.1.3 */ -#ifndef BURN_BRIDGES -/* - * Pre-IGMPV3 igmpstat structure.
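The DB_SHOW_COMMAND added above takes the kernel address of a vnet's igi_head list and walks it; without an address it prints the usage string. From the debugger it would be invoked roughly like this (the addresses shown are purely illustrative):

    db> show igi_list 0xfffff80003a4b400
    igmp_ifsoftc 0xfffff80003b10800:
       ifp 0xfffff80003a2c000
       version 3
       ...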
- */ -struct oigmpstat { - u_int igps_rcv_total; /* total IGMP messages received */ - u_int igps_rcv_tooshort; /* received with too few bytes */ - u_int igps_rcv_badsum; /* received with bad checksum */ - u_int igps_rcv_queries; /* received membership queries */ - u_int igps_rcv_badqueries; /* received invalid queries */ - u_int igps_rcv_reports; /* received membership reports */ - u_int igps_rcv_badreports; /* received invalid reports */ - u_int igps_rcv_ourreports; /* received reports for our groups */ - u_int igps_snd_reports; /* sent membership reports */ - u_int igps_rcv_toolong; /* received with too many bytes */ -}; -#endif - /* * IGMPv3 protocol statistics. */ @@ -105,19 +87,16 @@ struct igmpstat { }; #define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */ #define IGPS_VERSION3_LEN 168 - -#ifdef _KERNEL -#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val) -#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) -#endif - #ifdef CTASSERT -CTASSERT(sizeof(struct igmpstat) == 168); +CTASSERT(sizeof(struct igmpstat) == IGPS_VERSION3_LEN); #endif -#ifdef _KERNEL -#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) +/* + * Identifiers for IGMP sysctl nodes + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) #define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ /* @@ -185,6 +164,27 @@ CTASSERT(sizeof(struct igmpstat) == 168); #define IGMP_LEADINGSPACE \ (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) +/* + * Structure returned by net.inet.igmp.ifinfo sysctl. + */ +struct igmp_ifinfo { + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ +#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ +#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ +}; + +#ifdef _KERNEL +#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val) +#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) + /* * Subsystem lock macros. * The IGMP lock is only taken with IGMP. Currently it is system-wide. @@ -197,29 +197,35 @@ CTASSERT(sizeof(struct igmpstat) == 168); #define IGMP_UNLOCK() mtx_unlock(&igmp_mtx) #define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED) -struct igmp_ifinfo; +/* + * Per-interface IGMP router version information. 
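Tying the CTASSERT to IGPS_VERSION3_LEN instead of the bare constant 168 means the exported length and the structure can no longer drift apart silently. The same compile-time check in portable C11 form (the structure here is invented for illustration):

#include <stdint.h>

struct example_stat {
	uint64_t rx_total;
	uint64_t rx_badsum;
};
/* C11 spelling of the kernel's CTASSERT(): a size change becomes a
 * compile error rather than a silent userland ABI break. */
_Static_assert(sizeof(struct example_stat) == 16,
    "example_stat size is part of the exported ABI");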
+ */ +struct igmp_ifsoftc { + LIST_ENTRY(igmp_ifsoftc) igi_link; + struct ifnet *igi_ifp; /* pointer back to interface */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ + struct mbufq igi_gq; /* general query responses queue */ +}; int igmp_change_state(struct in_multi *); void igmp_fasttimo(void); -struct igmp_ifinfo * +struct igmp_ifsoftc * igmp_domifattach(struct ifnet *); void igmp_domifdetach(struct ifnet *); void igmp_ifdetach(struct ifnet *); -void igmp_input(struct mbuf *, int); +int igmp_input(struct mbuf **, int *, int); void igmp_slowtimo(void); SYSCTL_DECL(_net_inet_igmp); #endif /* _KERNEL */ - -/* - * Names for IGMP sysctl objects - */ -#define IGMPCTL_STATS 1 /* statistics (read-only) */ -#define IGMPCTL_MAXID 2 - -#define IGMPCTL_NAMES { \ - { 0, 0 }, \ - { "stats", CTLTYPE_STRUCT } \ -} #endif diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c index 653580c7..06b23973 100644 --- a/freebsd/sys/netinet/in.c +++ b/freebsd/sys/netinet/in.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -45,9 +46,12 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #include +#include #include #include @@ -58,37 +62,33 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include #include +#include #include #include #include -static int in_mask2len(struct in_addr *); -static void in_len2mask(struct in_addr *, int); -static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, - struct ifnet *, struct thread *); +static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct thread *); +static int in_difaddr_ioctl(caddr_t, struct ifnet *, struct thread *); -static int in_addprefix(struct in_ifaddr *, int); -static int in_scrubprefix(struct in_ifaddr *, u_int); static void in_socktrim(struct sockaddr_in *); -static int in_ifinit(struct ifnet *, - struct in_ifaddr *, struct sockaddr_in *, int); static void in_purgemaddrs(struct ifnet *); -static VNET_DEFINE(int, sameprefixcarponly); -#define V_sameprefixcarponly VNET(sameprefixcarponly) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, - &VNET_NAME(sameprefixcarponly), 0, +static VNET_DEFINE(int, nosameprefix); +#define V_nosameprefix VNET(nosameprefix) +SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) -VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ -#define V_arpstat VNET(arpstat) +static struct sx in_control_sx; +SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host @@ -97,17 +97,18 @@ VNET_DECLARE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ int in_localaddr(struct in_addr in) { + struct rm_priotracker in_ifa_tracker; register u_long i = 
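From this hunk on, IN_IFADDR_RLOCK()/IN_IFADDR_RUNLOCK() take a struct rm_priotracker argument: the global ifaddr lists moved to a read-mostly (rm) lock whose readers register through a caller-supplied, stack-allocated tracker. A kernel-style sketch of the pattern (lock name and the guarded walk are placeholders):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rmlock.h>

static struct rmlock example_lock;

static void
example_init(void)
{

	rm_init(&example_lock, "example");
}

static int
example_read_lookup(void)
{
	struct rm_priotracker tracker;	/* lives on the reader's stack */
	int found = 0;

	rm_rlock(&example_lock, &tracker);
	/* ... walk the read-mostly structure, set 'found' ... */
	rm_runlock(&example_lock, &tracker);
	return (found);
}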
ntohl(in.s_addr); register struct in_ifaddr *ia; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } @@ -118,19 +119,68 @@ in_localaddr(struct in_addr in) int in_localip(struct in_addr in) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (1); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } +/* + * Return 1 if an internet address is configured on an interface. + */ +int +in_ifhasaddr(struct ifnet *ifp, struct in_addr in) +{ + struct ifaddr *ifa; + struct in_ifaddr *ia; + + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = (struct in_ifaddr *)ifa; + if (ia->ia_addr.sin_addr.s_addr == in.s_addr) { + IF_ADDR_RUNLOCK(ifp); + return (1); + } + } + IF_ADDR_RUNLOCK(ifp); + + return (0); +} + +/* + * Return a reference to the interface address which is different to + * the supplied one but with same IP address value. + */ +static struct in_ifaddr * +in_localip_more(struct in_ifaddr *ia) +{ + struct rm_priotracker in_ifa_tracker; + in_addr_t in = IA_SIN(ia)->sin_addr.s_addr; + struct in_ifaddr *it; + + IN_IFADDR_RLOCK(&in_ifa_tracker); + LIST_FOREACH(it, INADDR_HASH(in), ia_hash) { + if (it != ia && IA_SIN(it)->sin_addr.s_addr == in) { + ifa_ref(&it->ia_ifa); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); + return (it); + } + } + IN_IFADDR_RUNLOCK(&in_ifa_tracker); + + return (NULL); +} + /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination @@ -169,793 +219,430 @@ in_socktrim(struct sockaddr_in *ap) } } -static int -in_mask2len(mask) - struct in_addr *mask; -{ - int x, y; - u_char *p; - - p = (u_char *)mask; - for (x = 0; x < sizeof(*mask); x++) { - if (p[x] != 0xff) - break; - } - y = 0; - if (x < sizeof(*mask)) { - for (y = 0; y < 8; y++) { - if ((p[x] & (0x80 >> y)) == 0) - break; - } - } - return (x * 8 + y); -} - -static void -in_len2mask(struct in_addr *mask, int len) -{ - int i; - u_char *p; - - p = (u_char *)mask; - bzero(mask, sizeof(*mask)); - for (i = 0; i < len / 8; i++) - p[i] = 0xff; - if (len % 8) - p[i] = (0xff00 >> (len % 8)) & 0xff; -} - /* * Generic internet control operations (ioctl's). - * - * ifp is NULL if not an interface-specific ioctl. 
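in_localip_more() pins its result with ifa_ref() while the read lock is still held, so the returned address cannot be freed between unlock and return; the caller later drops the reference with ifa_free(). The same lookup-then-reference pattern in self-contained form (pthread and C11 atomics stand in for the kernel primitives):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>
#include <sys/queue.h>

struct obj {
	LIST_ENTRY(obj) link;
	int key;
	atomic_int refcnt;
};
LIST_HEAD(objhead, obj);

static pthread_rwlock_t obj_lock = PTHREAD_RWLOCK_INITIALIZER;

static struct obj *
lookup_ref(struct objhead *head, int key)
{
	struct obj *o;

	pthread_rwlock_rdlock(&obj_lock);
	LIST_FOREACH(o, head, link)
		if (o->key == key) {
			/* pin before dropping the lock */
			atomic_fetch_add(&o->refcnt, 1);
			break;
		}
	pthread_rwlock_unlock(&obj_lock);
	return (o);	/* NULL if absent; caller drops the reference */
}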
*/ -/* ARGSUSED */ int in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { - register struct ifreq *ifr = (struct ifreq *)data; - register struct in_ifaddr *ia, *iap; - register struct ifaddr *ifa; - struct in_addr allhosts_addr; - struct in_addr dst; - struct in_ifinfo *ii; - struct in_aliasreq *ifra = (struct in_aliasreq *)data; - struct sockaddr_in oldaddr; - int error, hostIsNew, iaIsNew, maskIsNew; - int iaIsFirst; + struct ifreq *ifr = (struct ifreq *)data; + struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; + struct ifaddr *ifa; + struct in_ifaddr *ia; + int error; - ia = NULL; - iaIsFirst = 0; - iaIsNew = 0; - allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + if (ifp == NULL) + return (EADDRNOTAVAIL); /* - * Filter out ioctls we implement directly; forward the rest on to - * in_lifaddr_ioctl() and ifp->if_ioctl(). + * Filter out 4 ioctls we implement directly. Forward the rest + * to specific functions and ifp->if_ioctl(). */ switch (cmd) { - case SIOCAIFADDR: - case SIOCDIFADDR: case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: + break; + case SIOCDIFADDR: + sx_xlock(&in_control_sx); + error = in_difaddr_ioctl(data, ifp, td); + sx_xunlock(&in_control_sx); + return (error); +#ifndef __rtems__ + case OSIOCAIFADDR: /* 9.x compat */ +#endif /* __rtems__ */ + case SIOCAIFADDR: + sx_xlock(&in_control_sx); + error = in_aifaddr_ioctl(cmd, data, ifp, td); + sx_xunlock(&in_control_sx); + return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: - break; - - case SIOCALIFADDR: - if (td != NULL) { - error = priv_check(td, PRIV_NET_ADDIFADDR); - if (error) - return (error); - } - if (ifp == NULL) - return (EINVAL); - return in_lifaddr_ioctl(so, cmd, data, ifp, td); - - case SIOCDLIFADDR: - if (td != NULL) { - error = priv_check(td, PRIV_NET_DELIFADDR); - if (error) - return (error); - } - if (ifp == NULL) - return (EINVAL); - return in_lifaddr_ioctl(so, cmd, data, ifp, td); - - case SIOCGLIFADDR: - if (ifp == NULL) - return (EINVAL); - return in_lifaddr_ioctl(so, cmd, data, ifp, td); - + /* We no longer support that old commands. */ + return (EINVAL); default: - if (ifp == NULL || ifp->if_ioctl == NULL) + if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } - if (ifp == NULL) + if (addr->sin_addr.s_addr != INADDR_ANY && + prison_check_ip4(td->td_ucred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* - * Security checks before we get involved in any work. - */ - switch (cmd) { - case SIOCAIFADDR: - case SIOCSIFADDR: - case SIOCSIFBRDADDR: - case SIOCSIFNETMASK: - case SIOCSIFDSTADDR: - if (td != NULL) { - error = priv_check(td, PRIV_NET_ADDIFADDR); - if (error) - return (error); - } - break; - - case SIOCDIFADDR: - if (td != NULL) { - error = priv_check(td, PRIV_NET_DELIFADDR); - if (error) - return (error); - } - break; - } - - /* - * Find address for this interface, if it exists. - * - * If an alias address was specified, find that one instead of the + * Find address for this interface, if it exists. If an + * address was specified, find that one instead of the * first one on the interface, if possible. 
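The rewritten in_control() keeps only the four read-only ioctls inline and funnels SIOCAIFADDR/SIOCDIFADDR through handlers serialized by the in_control_sx lock declared earlier; an sx lock is sleepable, so those handlers may block while it is held. The shape of that serialization, reduced to a sketch (lock and handler names are placeholders):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

static struct sx cfg_sx;
SX_SYSINIT(cfg_sx, &cfg_sx, "cfg");

static int
cfg_add_ioctl(void)
{
	int error;

	sx_xlock(&cfg_sx);
	error = 0;	/* ... perform the add; may sleep while locked ... */
	sx_xunlock(&cfg_sx);
	return (error);
}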
*/ - dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr; - IN_IFADDR_RLOCK(); - LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) { - if (iap->ia_ifp == ifp && - iap->ia_addr.sin_addr.s_addr == dst.s_addr) { - if (td == NULL || prison_check_ip4(td->td_ucred, - &dst) == 0) - ia = iap; + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = (struct in_ifaddr *)ifa; + if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; - } } - if (ia != NULL) - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); - if (ia == NULL) { - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - iap = ifatoia(ifa); - if (iap->ia_addr.sin_family == AF_INET) { - if (td != NULL && - prison_check_ip4(td->td_ucred, - &iap->ia_addr.sin_addr) != 0) - continue; - ia = iap; - break; + if (ifa == NULL) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) + if (ifa->ifa_addr->sa_family == AF_INET) { + ia = (struct in_ifaddr *)ifa; + if (prison_check_ip4(td->td_ucred, + &ia->ia_addr.sin_addr) == 0) + break; } - } - if (ia != NULL) - ifa_ref(&ia->ia_ifa); + + if (ifa == NULL) { IF_ADDR_RUNLOCK(ifp); + return (EADDRNOTAVAIL); } - if (ia == NULL) - iaIsFirst = 1; error = 0; switch (cmd) { - case SIOCAIFADDR: - case SIOCDIFADDR: - if (ifra->ifra_addr.sin_family == AF_INET) { - struct in_ifaddr *oia; - - IN_IFADDR_RLOCK(); - for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) { - if (ia->ia_ifp == ifp && - ia->ia_addr.sin_addr.s_addr == - ifra->ifra_addr.sin_addr.s_addr) - break; - } - if (ia != NULL && ia != oia) - ifa_ref(&ia->ia_ifa); - if (oia != NULL && ia != oia) - ifa_free(&oia->ia_ifa); - IN_IFADDR_RUNLOCK(); - if ((ifp->if_flags & IFF_POINTOPOINT) - && (cmd == SIOCAIFADDR) - && (ifra->ifra_dstaddr.sin_addr.s_addr - == INADDR_ANY)) { - error = EDESTADDRREQ; - goto out; - } - } - if (cmd == SIOCDIFADDR && ia == NULL) { - error = EADDRNOTAVAIL; - goto out; - } - /* FALLTHROUGH */ - case SIOCSIFADDR: - case SIOCSIFNETMASK: - case SIOCSIFDSTADDR: - if (ia == NULL) { - ia = (struct in_ifaddr *) - malloc(sizeof *ia, M_IFADDR, M_NOWAIT | - M_ZERO); - if (ia == NULL) { - error = ENOBUFS; - goto out; - } - - ifa = &ia->ia_ifa; - ifa_init(ifa); - ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; - ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; - ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; - - ia->ia_sockmask.sin_len = 8; - ia->ia_sockmask.sin_family = AF_INET; - if (ifp->if_flags & IFF_BROADCAST) { - ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); - ia->ia_broadaddr.sin_family = AF_INET; - } - ia->ia_ifp = ifp; - - ifa_ref(ifa); /* if_addrhead */ - IF_ADDR_WLOCK(ifp); - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); - IF_ADDR_WUNLOCK(ifp); - ifa_ref(ifa); /* in_ifaddrhead */ - IN_IFADDR_WLOCK(); - TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); - IN_IFADDR_WUNLOCK(); - iaIsNew = 1; - } - break; - - case SIOCSIFBRDADDR: case SIOCGIFADDR: - case SIOCGIFNETMASK: - case SIOCGIFDSTADDR: - case SIOCGIFBRDADDR: - if (ia == NULL) { - error = EADDRNOTAVAIL; - goto out; - } + *addr = ia->ia_addr; break; - } - - /* - * Most paths in this switch return directly or via out. Only paths - * that remove the address break in order to hit common removal code. 
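For orientation, the read-only branch retained above is what a plain ioctl(SIOCGIFADDR) exercises from userland. A small consumer, assuming an interface named em0 exists (the name is only an example):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct ifreq ifr;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		return (1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));	/* example */
	if (ioctl(s, SIOCGIFADDR, &ifr) == 0)
		printf("%s\n", inet_ntoa(
		    ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr));
	close(s);
	return (0);
}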
- */ - switch (cmd) { - case SIOCGIFADDR: - *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; - goto out; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; - goto out; + break; } - *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; - goto out; + *addr = ia->ia_broadaddr; + break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; - goto out; + break; } - *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; - goto out; + *addr = ia->ia_dstaddr; + break; case SIOCGIFNETMASK: - *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; - goto out; - - case SIOCSIFDSTADDR: - if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { - error = EINVAL; - goto out; - } - oldaddr = ia->ia_dstaddr; - ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; - if (ifp->if_ioctl != NULL) { - error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR, - (caddr_t)ia); - if (error) { - ia->ia_dstaddr = oldaddr; - goto out; - } - } - if (ia->ia_flags & IFA_ROUTE) { - ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; - rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); - ia->ia_ifa.ifa_dstaddr = - (struct sockaddr *)&ia->ia_dstaddr; - rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); - } - goto out; + *addr = ia->ia_sockmask; + break; + } - case SIOCSIFBRDADDR: - if ((ifp->if_flags & IFF_BROADCAST) == 0) { - error = EINVAL; - goto out; - } - ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; - goto out; + IF_ADDR_RUNLOCK(ifp); - case SIOCSIFADDR: - error = in_ifinit(ifp, ia, - (struct sockaddr_in *) &ifr->ifr_addr, 1); - if (error != 0 && iaIsNew) - break; - if (error == 0) { - ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - if (iaIsFirst && - (ifp->if_flags & IFF_MULTICAST) != 0) { - error = in_joingroup(ifp, &allhosts_addr, - NULL, &ii->ii_allhosts); - } - EVENTHANDLER_INVOKE(ifaddr_event, ifp); - } - error = 0; - goto out; + return (error); +} - case SIOCSIFNETMASK: - ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; - ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); - goto out; +static int +in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) +{ + const struct in_aliasreq *ifra = (struct in_aliasreq *)data; + const struct sockaddr_in *addr = &ifra->ifra_addr; + const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; + const struct sockaddr_in *mask = &ifra->ifra_mask; + const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; + const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; + struct ifaddr *ifa; + struct in_ifaddr *ia; + bool iaIsFirst; + int error = 0; - case SIOCAIFADDR: - maskIsNew = 0; - hostIsNew = 1; - error = 0; - if (ia->ia_addr.sin_family == AF_INET) { - if (ifra->ifra_addr.sin_len == 0) { - ifra->ifra_addr = ia->ia_addr; - hostIsNew = 0; - } else if (ifra->ifra_addr.sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) - hostIsNew = 0; - } - if (ifra->ifra_mask.sin_len) { - /* - * QL: XXX - * Need to scrub the prefix here in case - * the issued command is SIOCAIFADDR with - * the same address, but with a different - * prefix length. And if the prefix length - * is the same as before, then the call is - * un-necessarily executed here. 
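in_aifaddr_ioctl() front-loads its argument checking: ifra_addr is mandatory and must be a well-formed sockaddr_in, while the mask and broadcast arguments are optional but rejected if malformed. Factored into a predicate for illustration (the helper itself is hypothetical, not in the patch):

#include <sys/types.h>
#include <netinet/in.h>
#include <stdbool.h>

static bool
sin_valid(const struct sockaddr_in *sin, bool optional)
{

	if (sin->sin_len == 0)		/* "not supplied" */
		return (optional);
	return (sin->sin_len == sizeof(struct sockaddr_in) &&
	    sin->sin_family == AF_INET);
}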
- */ - in_ifscrub(ifp, ia, LLE_STATIC); - ia->ia_sockmask = ifra->ifra_mask; - ia->ia_sockmask.sin_family = AF_INET; - ia->ia_subnetmask = - ntohl(ia->ia_sockmask.sin_addr.s_addr); - maskIsNew = 1; - } - if ((ifp->if_flags & IFF_POINTOPOINT) && - (ifra->ifra_dstaddr.sin_family == AF_INET)) { - in_ifscrub(ifp, ia, LLE_STATIC); - ia->ia_dstaddr = ifra->ifra_dstaddr; - maskIsNew = 1; /* We lie; but the effect's the same */ - } - if (ifra->ifra_addr.sin_family == AF_INET && - (hostIsNew || maskIsNew)) - error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); - if (error != 0 && iaIsNew) - break; + error = priv_check(td, PRIV_NET_ADDIFADDR); + if (error) + return (error); - if ((ifp->if_flags & IFF_BROADCAST) && - (ifra->ifra_broadaddr.sin_family == AF_INET)) - ia->ia_broadaddr = ifra->ifra_broadaddr; - if (error == 0) { - ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - if (iaIsFirst && - (ifp->if_flags & IFF_MULTICAST) != 0) { - error = in_joingroup(ifp, &allhosts_addr, - NULL, &ii->ii_allhosts); - } - EVENTHANDLER_INVOKE(ifaddr_event, ifp); - } - goto out; + /* + * ifra_addr must be present and be of INET family. + * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. + */ + if (addr->sin_len != sizeof(struct sockaddr_in) || + addr->sin_family != AF_INET) + return (EINVAL); + if (broadaddr->sin_len != 0 && + (broadaddr->sin_len != sizeof(struct sockaddr_in) || + broadaddr->sin_family != AF_INET)) + return (EINVAL); + if (mask->sin_len != 0 && + (mask->sin_len != sizeof(struct sockaddr_in) || + mask->sin_family != AF_INET)) + return (EINVAL); + if ((ifp->if_flags & IFF_POINTOPOINT) && + (dstaddr->sin_len != sizeof(struct sockaddr_in) || + dstaddr->sin_addr.s_addr == INADDR_ANY)) + return (EDESTADDRREQ); + if (vhid > 0 && carp_attach_p == NULL) + return (EPROTONOSUPPORT); - case SIOCDIFADDR: - /* - * in_ifscrub kills the interface route. - */ - in_ifscrub(ifp, ia, LLE_STATIC); + /* + * See whether address already exist. + */ + iaIsFirst = true; + ia = NULL; + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + struct in_ifaddr *it; - /* - * in_ifadown gets rid of all the rest of - * the routes. This is not quite the right - * thing to do, but at least if we are running - * a routing process they will come back. - */ - in_ifadown(&ia->ia_ifa, 1); - EVENTHANDLER_INVOKE(ifaddr_event, ifp); - error = 0; - break; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; - default: - panic("in_control: unsupported ioctl"); + it = (struct in_ifaddr *)ifa; + iaIsFirst = false; + if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && + prison_check_ip4(td->td_ucred, &addr->sin_addr) == 0) + ia = it; } + IF_ADDR_RUNLOCK(ifp); + + if (ia != NULL) + (void )in_difaddr_ioctl(data, ifp, td); + + ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); + ia = (struct in_ifaddr *)ifa; + ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; + ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; + ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; + + ia->ia_ifp = ifp; + ia->ia_addr = *addr; + if (mask->sin_len != 0) { + ia->ia_sockmask = *mask; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); + } else { + in_addr_t i = ntohl(addr->sin_addr.s_addr); - IF_ADDR_WLOCK(ifp); - /* Re-check that ia is still part of the list. 
*/ - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa == &ia->ia_ifa) - break; - } - if (ifa == NULL) { /* - * If we lost the race with another thread, there is no need to - * try it again for the next loop as there is no other exit - * path between here and out. - */ - IF_ADDR_WUNLOCK(ifp); - error = EADDRNOTAVAIL; - goto out; + * Be compatible with network classes, if netmask isn't + * supplied, guess it based on classes. + */ + if (IN_CLASSA(i)) + ia->ia_subnetmask = IN_CLASSA_NET; + else if (IN_CLASSB(i)) + ia->ia_subnetmask = IN_CLASSB_NET; + else + ia->ia_subnetmask = IN_CLASSC_NET; + ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } - TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); - IF_ADDR_WUNLOCK(ifp); - ifa_free(&ia->ia_ifa); /* if_addrhead */ + ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; + in_socktrim(&ia->ia_sockmask); - IN_IFADDR_WLOCK(); - TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); - if (ia->ia_addr.sin_family == AF_INET) { - struct in_ifaddr *if_ia; + if (ifp->if_flags & IFF_BROADCAST) { + if (broadaddr->sin_len != 0) { + ia->ia_broadaddr = *broadaddr; + } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { + ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; + ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); + ia->ia_broadaddr.sin_family = AF_INET; + } else { + ia->ia_broadaddr.sin_addr.s_addr = + htonl(ia->ia_subnet | ~ia->ia_subnetmask); + ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); + ia->ia_broadaddr.sin_family = AF_INET; + } + } - LIST_REMOVE(ia, ia_hash); - IN_IFADDR_WUNLOCK(); - /* - * If this is the last IPv4 address configured on this - * interface, leave the all-hosts group. - * No state-change report need be transmitted. - */ - if_ia = NULL; - IFP_TO_IA(ifp, if_ia); - if (if_ia == NULL) { - ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); - IN_MULTI_LOCK(); - if (ii->ii_allhosts) { - (void)in_leavegroup_locked(ii->ii_allhosts, - NULL); - ii->ii_allhosts = NULL; - } - IN_MULTI_UNLOCK(); - } else - ifa_free(&if_ia->ia_ifa); - } else - IN_IFADDR_WUNLOCK(); - ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ -out: - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return (error); -} + if (ifp->if_flags & IFF_POINTOPOINT) + ia->ia_dstaddr = *dstaddr; -/* - * SIOC[GAD]LIFADDR. - * SIOCGLIFADDR: get first address. (?!?) - * SIOCGLIFADDR with IFLR_PREFIX: - * get first address that matches the specified prefix. - * SIOCALIFADDR: add the specified address. - * SIOCALIFADDR with IFLR_PREFIX: - * EINVAL since we can't deduce hostid part of the address. - * SIOCDLIFADDR: delete the specified address. - * SIOCDLIFADDR with IFLR_PREFIX: - * delete the first address that matches the specified prefix. - * return values: - * EINVAL on invalid parameters - * EADDRNOTAVAIL on prefix match failed/specified address not found - * other values may be returned from in_ioctl() - */ -static int -in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, - struct ifnet *ifp, struct thread *td) -{ - struct if_laddrreq *iflr = (struct if_laddrreq *)data; - struct ifaddr *ifa; + /* XXXGL: rtinit() needs this strange assignment. 
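When no netmask is supplied, the code above falls back to the historical class-based guess and then derives the directed broadcast as subnet | ~mask. A worked example of both computations, using an address from the documentation range:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

int
main(void)
{
	in_addr_t i = ntohl(inet_addr("192.0.2.10"));	/* example address */
	in_addr_t mask, subnet, bcast;

	if (IN_CLASSA(i))
		mask = IN_CLASSA_NET;
	else if (IN_CLASSB(i))
		mask = IN_CLASSB_NET;
	else
		mask = IN_CLASSC_NET;	/* 192.0.2.10 is in class C space */
	subnet = i & mask;
	bcast = subnet | ~mask;
	/* prints: mask ffffff00 subnet c0000200 bcast c00002ff */
	printf("mask %08x subnet %08x bcast %08x\n",
	    (unsigned)mask, (unsigned)subnet, (unsigned)bcast);
	return (0);
}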
*/ + if (ifp->if_flags & IFF_LOOPBACK) + ia->ia_dstaddr = ia->ia_addr; - /* sanity checks */ - if (data == NULL || ifp == NULL) { - panic("invalid argument to in_lifaddr_ioctl"); - /*NOTRECHED*/ + if (vhid != 0) { + error = (*carp_attach_p)(&ia->ia_ifa, vhid); + if (error) + return (error); } - switch (cmd) { - case SIOCGLIFADDR: - /* address must be specified on GET with IFLR_PREFIX */ - if ((iflr->flags & IFLR_PREFIX) == 0) - break; - /*FALLTHROUGH*/ - case SIOCALIFADDR: - case SIOCDLIFADDR: - /* address must be specified on ADD and DELETE */ - if (iflr->addr.ss_family != AF_INET) - return (EINVAL); - if (iflr->addr.ss_len != sizeof(struct sockaddr_in)) - return (EINVAL); - /* XXX need improvement */ - if (iflr->dstaddr.ss_family - && iflr->dstaddr.ss_family != AF_INET) - return (EINVAL); - if (iflr->dstaddr.ss_family - && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in)) - return (EINVAL); - break; - default: /*shouldn't happen*/ - return (EOPNOTSUPP); - } - if (sizeof(struct in_addr) * 8 < iflr->prefixlen) - return (EINVAL); + /* if_addrhead is already referenced by ifa_alloc() */ + IF_ADDR_WLOCK(ifp); + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); - switch (cmd) { - case SIOCALIFADDR: - { - struct in_aliasreq ifra; + ifa_ref(ifa); /* in_ifaddrhead */ + IN_IFADDR_WLOCK(); + TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); + LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); + IN_IFADDR_WUNLOCK(); - if (iflr->flags & IFLR_PREFIX) - return (EINVAL); + /* + * Give the interface a chance to initialize + * if this is its first address, + * and to validate the address if necessary. + */ + if (ifp->if_ioctl != NULL) { + error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); + if (error) + goto fail1; + } - /* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). */ - bzero(&ifra, sizeof(ifra)); - bcopy(iflr->iflr_name, ifra.ifra_name, - sizeof(ifra.ifra_name)); + /* + * Add route for the network. + */ + if (vhid == 0) { + int flags = RTF_UP; - bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len); + if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) + flags |= RTF_HOST; - if (iflr->dstaddr.ss_family) { /*XXX*/ - bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr, - iflr->dstaddr.ss_len); - } + error = in_addprefix(ia, flags); + if (error) + goto fail1; + } - ifra.ifra_mask.sin_family = AF_INET; - ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in); - in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen); - - return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td)); - } - case SIOCGLIFADDR: - case SIOCDLIFADDR: - { - struct in_ifaddr *ia; - struct in_addr mask, candidate, match; - struct sockaddr_in *sin; - - bzero(&mask, sizeof(mask)); - bzero(&match, sizeof(match)); - if (iflr->flags & IFLR_PREFIX) { - /* lookup a prefix rather than address. */ - in_len2mask(&mask, iflr->prefixlen); - - sin = (struct sockaddr_in *)&iflr->addr; - match.s_addr = sin->sin_addr.s_addr; - match.s_addr &= mask.s_addr; - - /* if you set extra bits, that's wrong */ - if (match.s_addr != sin->sin_addr.s_addr) - return (EINVAL); + /* + * Add a loopback route to self. 
+ */ + if (vhid == 0 && (ifp->if_flags & IFF_LOOPBACK) == 0 && + ia->ia_addr.sin_addr.s_addr != INADDR_ANY && + !((ifp->if_flags & IFF_POINTOPOINT) && + ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)) { + struct in_ifaddr *eia; - } else { - /* on getting an address, take the 1st match */ - /* on deleting an address, do exact match */ - if (cmd != SIOCGLIFADDR) { - in_len2mask(&mask, 32); - sin = (struct sockaddr_in *)&iflr->addr; - match.s_addr = sin->sin_addr.s_addr; - } - } + eia = in_localip_more(ia); - IF_ADDR_RLOCK(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) - continue; - if (match.s_addr == 0) - break; - candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr; - candidate.s_addr &= mask.s_addr; - if (candidate.s_addr == match.s_addr) - break; - } - if (ifa != NULL) - ifa_ref(ifa); - IF_ADDR_RUNLOCK(ifp); - if (ifa == NULL) - return (EADDRNOTAVAIL); - ia = (struct in_ifaddr *)ifa; + if (eia == NULL) { + error = ifa_add_loopback_route((struct ifaddr *)ia, + (struct sockaddr *)&ia->ia_addr); + if (error) + goto fail2; + } else + ifa_free(&eia->ia_ifa); + } - if (cmd == SIOCGLIFADDR) { - /* fill in the if_laddrreq structure */ - bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); + if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { + struct in_addr allhosts_addr; + struct in_ifinfo *ii; - if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { - bcopy(&ia->ia_dstaddr, &iflr->dstaddr, - ia->ia_dstaddr.sin_len); - } else - bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + + error = in_joingroup(ifp, &allhosts_addr, NULL, + &ii->ii_allhosts); + } - iflr->prefixlen = - in_mask2len(&ia->ia_sockmask.sin_addr); + EVENTHANDLER_INVOKE(ifaddr_event, ifp); - iflr->flags = 0; /*XXX*/ - ifa_free(ifa); + return (error); - return (0); - } else { - struct in_aliasreq ifra; - - /* fill in_aliasreq and do ioctl(SIOCDIFADDR) */ - bzero(&ifra, sizeof(ifra)); - bcopy(iflr->iflr_name, ifra.ifra_name, - sizeof(ifra.ifra_name)); - - bcopy(&ia->ia_addr, &ifra.ifra_addr, - ia->ia_addr.sin_len); - if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { - bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr, - ia->ia_dstaddr.sin_len); - } - bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, - ia->ia_sockmask.sin_len); - ifa_free(ifa); +fail2: + if (vhid == 0) + (void )in_scrubprefix(ia, LLE_STATIC); - return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra, - ifp, td)); - } - } - } +fail1: + if (ia->ia_ifa.ifa_carp) + (*carp_detach_p)(&ia->ia_ifa); - return (EOPNOTSUPP); /*just for safety*/ -} + IF_ADDR_WLOCK(ifp); + TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); + ifa_free(&ia->ia_ifa); /* if_addrhead */ -/* - * Delete any existing route for an interface. - */ -void -in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia, u_int flags) -{ + IN_IFADDR_WLOCK(); + TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); + LIST_REMOVE(ia, ia_hash); + IN_IFADDR_WUNLOCK(); + ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ - in_scrubprefix(ia, flags); + return (error); } -/* - * Initialize an interface's internet address - * and routing table entry. 
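The fail1/fail2 labels above implement the usual C unwinding idiom for multi-step setup: each label undoes exactly the steps completed before the failure, in reverse order. The skeleton of the idiom (step and undo functions are placeholders):

static int step_a(void) { return (0); }		/* placeholders */
static int step_b(void) { return (0); }
static int step_c(void) { return (0); }
static void undo_a(void) { }
static void undo_b(void) { }

static int
setup(void)
{
	int error;

	if ((error = step_a()) != 0)
		return (error);
	if ((error = step_b()) != 0)
		goto fail1;
	if ((error = step_c()) != 0)
		goto fail2;
	return (0);
fail2:
	undo_b();			/* undo in reverse order */
fail1:
	undo_a();
	return (error);
}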
- */ static int -in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, - int scrub) +in_difaddr_ioctl(caddr_t data, struct ifnet *ifp, struct thread *td) { - register u_long i = ntohl(sin->sin_addr.s_addr); - struct sockaddr_in oldaddr; - int s = splimp(), flags = RTF_UP, error = 0; - - oldaddr = ia->ia_addr; - if (oldaddr.sin_family == AF_INET) - LIST_REMOVE(ia, ia_hash); - ia->ia_addr = *sin; - if (ia->ia_addr.sin_family == AF_INET) { - IN_IFADDR_WLOCK(); - LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), - ia, ia_hash); - IN_IFADDR_WUNLOCK(); - } - /* - * Give the interface a chance to initialize - * if this is its first address, - * and to validate the address if necessary. - */ - if (ifp->if_ioctl != NULL) { - error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); - if (error) { - splx(s); - /* LIST_REMOVE(ia, ia_hash) is done in in_control */ - ia->ia_addr = oldaddr; - IN_IFADDR_WLOCK(); - if (ia->ia_addr.sin_family == AF_INET) - LIST_INSERT_HEAD(INADDR_HASH( - ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - else - /* - * If oldaddr family is not AF_INET (e.g. - * interface has been just created) in_control - * does not call LIST_REMOVE, and we end up - * with bogus ia entries in hash - */ - LIST_REMOVE(ia, ia_hash); - IN_IFADDR_WUNLOCK(); + const struct ifreq *ifr = (struct ifreq *)data; + const struct sockaddr_in *addr = (const struct sockaddr_in *) + &ifr->ifr_addr; + struct ifaddr *ifa; + struct in_ifaddr *ia; + bool deleteAny, iaIsLast; + int error; + + if (td != NULL) { + error = priv_check(td, PRIV_NET_DELIFADDR); + if (error) return (error); - } } - splx(s); - if (scrub) { - ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; - in_ifscrub(ifp, ia, LLE_STATIC); - ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; + + if (addr->sin_len != sizeof(struct sockaddr_in) || + addr->sin_family != AF_INET) + deleteAny = true; + else + deleteAny = false; + + iaIsLast = true; + ia = NULL; + IF_ADDR_WLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + struct in_ifaddr *it; + + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + + it = (struct in_ifaddr *)ifa; + if (deleteAny && ia == NULL && (td == NULL || + prison_check_ip4(td->td_ucred, &it->ia_addr.sin_addr) == 0)) + ia = it; + + if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && + (td == NULL || prison_check_ip4(td->td_ucred, + &addr->sin_addr) == 0)) + ia = it; + + if (it != ia) + iaIsLast = false; } - /* - * Be compatible with network classes, if netmask isn't supplied, - * guess it based on classes. - */ - if (ia->ia_subnetmask == 0) { - if (IN_CLASSA(i)) - ia->ia_subnetmask = IN_CLASSA_NET; - else if (IN_CLASSB(i)) - ia->ia_subnetmask = IN_CLASSB_NET; - else - ia->ia_subnetmask = IN_CLASSC_NET; - ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); + + if (ia == NULL) { + IF_ADDR_WUNLOCK(ifp); + return (EADDRNOTAVAIL); } - ia->ia_subnet = i & ia->ia_subnetmask; - in_socktrim(&ia->ia_sockmask); + + TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + IF_ADDR_WUNLOCK(ifp); + ifa_free(&ia->ia_ifa); /* if_addrhead */ + + IN_IFADDR_WLOCK(); + TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); + LIST_REMOVE(ia, ia_hash); + IN_IFADDR_WUNLOCK(); + /* - * XXX: carp(4) does not have interface route + * in_scrubprefix() kills the interface route. */ - if (ifp->if_type == IFT_CARP) - return (0); + in_scrubprefix(ia, LLE_STATIC); + /* - * Add route for the network. + * in_ifadown gets rid of all the rest of + * the routes. 
This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. */ - ia->ia_ifa.ifa_metric = ifp->if_metric; - if (ifp->if_flags & IFF_BROADCAST) { - if (ia->ia_subnetmask == IN_RFC3021_MASK) - ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; - else - ia->ia_broadaddr.sin_addr.s_addr = - htonl(ia->ia_subnet | ~ia->ia_subnetmask); - } else if (ifp->if_flags & IFF_LOOPBACK) { - ia->ia_dstaddr = ia->ia_addr; - flags |= RTF_HOST; - } else if (ifp->if_flags & IFF_POINTOPOINT) { - if (ia->ia_dstaddr.sin_family != AF_INET) - return (0); - flags |= RTF_HOST; - } - if ((error = in_addprefix(ia, flags)) != 0) - return (error); - - if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY) - return (0); - - if (ifp->if_flags & IFF_POINTOPOINT) { - if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) - return (0); - } + in_ifadown(&ia->ia_ifa, 1); + if (ia->ia_ifa.ifa_carp) + (*carp_detach_p)(&ia->ia_ifa); /* - * add a loopback route to self + * If this is the last IPv4 address configured on this + * interface, leave the all-hosts group. + * No state-change report need be transmitted. */ - if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) { - struct route ia_ro; - - bzero(&ia_ro, sizeof(ia_ro)); - *((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr; - rtalloc_ign_fib(&ia_ro, 0, RT_DEFAULT_FIB); - if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) && - (ia_ro.ro_rt->rt_ifp == V_loif)) { - RT_LOCK(ia_ro.ro_rt); - RT_ADDREF(ia_ro.ro_rt); - RTFREE_LOCKED(ia_ro.ro_rt); - } else - error = ifa_add_loopback_route((struct ifaddr *)ia, - (struct sockaddr *)&ia->ia_addr); - if (error == 0) - ia->ia_flags |= IFA_RTSELF; - if (ia_ro.ro_rt != NULL) - RTFREE(ia_ro.ro_rt); + if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { + struct in_ifinfo *ii; + + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + IN_MULTI_LOCK(); + if (ii->ii_allhosts) { + (void)in_leavegroup_locked(ii->ii_allhosts, NULL); + ii->ii_allhosts = NULL; + } + IN_MULTI_UNLOCK(); } - return (error); + EVENTHANDLER_INVOKE(ifaddr_event, ifp); + ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ + + return (0); } #define rtinitflags(x) \ @@ -965,9 +652,10 @@ in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin, /* * Check if we have a route for the given prefix already or add one accordingly. */ -static int +int in_addprefix(struct in_ifaddr *target, int flags) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error; @@ -981,7 +669,7 @@ in_addprefix(struct in_ifaddr *target, int flags) prefix.s_addr &= mask.s_addr; } - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); /* Look for an existing address with the same prefix, mask, and fib */ TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (rtinitflags(ia)) { @@ -1009,28 +697,26 @@ in_addprefix(struct in_ifaddr *target, int flags) #ifdef RADIX_MPATH if (ia->ia_addr.sin_addr.s_addr == target->ia_addr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else break; #endif - if (V_sameprefixcarponly && - target->ia_ifp->if_type != IFT_CARP && - ia->ia_ifp->if_type != IFT_CARP) { - IN_IFADDR_RUNLOCK(); + if (V_nosameprefix) { + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (EEXIST); } else { int fibnum; - fibnum = rt_add_addr_allfibs ? RT_ALL_FIBS : + fibnum = V_rt_add_addr_allfibs ? 
RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_ADD, &target->ia_ifa, fibnum); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return (0); } } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * No-one seem to have this prefix route, so we try to insert it. @@ -1041,68 +727,87 @@ in_addprefix(struct in_ifaddr *target, int flags) return (error); } -extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr); +/* + * Removes either all lle entries for given @ia, or lle + * corresponding to @ia address. + */ +static void +in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) +{ + struct sockaddr_in addr, mask; + struct sockaddr *saddr, *smask; + struct ifnet *ifp; + + saddr = (struct sockaddr *)&addr; + bzero(&addr, sizeof(addr)); + addr.sin_len = sizeof(addr); + addr.sin_family = AF_INET; + smask = (struct sockaddr *)&mask; + bzero(&mask, sizeof(mask)); + mask.sin_len = sizeof(mask); + mask.sin_family = AF_INET; + mask.sin_addr.s_addr = ia->ia_subnetmask; + ifp = ia->ia_ifp; + + if (all) { + + /* + * Remove all L2 entries matching given prefix. + * Convert address to host representation to avoid + * doing this on every callback. ia_subnetmask is already + * stored in host representation. + */ + addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); + lltable_prefix_free(AF_INET, saddr, smask, flags); + } else { + /* Remove interface address only */ + addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; + lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); + } +} /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ -static int +int in_scrubprefix(struct in_ifaddr *target, u_int flags) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; - struct in_addr prefix, mask, p; + struct in_addr prefix, mask, p, m; int error = 0; - struct sockaddr_in prefix0, mask0; /* * Remove the loopback route to the interface address. - * The "useloopback" setting is not consulted because if the - * user configures an interface address, turns off this - * setting, and then tries to delete that interface address, - * checking the current setting of "useloopback" would leave - * that interface address loopback route untouched, which - * would be wrong. Therefore the interface address loopback route - * deletion is unconditional. */ if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) && !(target->ia_ifp->if_flags & IFF_LOOPBACK) && - (target->ia_flags & IFA_RTSELF)) { - struct route ia_ro; - int freeit = 0; - int fibnum; + (flags & LLE_STATIC)) { + struct in_ifaddr *eia; - bzero(&ia_ro, sizeof(ia_ro)); - *((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr; - fibnum = target->ia_ifp->if_fib; - rtalloc_ign_fib(&ia_ro, 0, fibnum); - if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) && - (ia_ro.ro_rt->rt_ifp == V_loif)) { - RT_LOCK(ia_ro.ro_rt); - if (ia_ro.ro_rt->rt_refcnt <= 1) - freeit = 1; - else if (flags & LLE_STATIC) { - RT_REMREF(ia_ro.ro_rt); - target->ia_flags &= ~IFA_RTSELF; - } - RTFREE_LOCKED(ia_ro.ro_rt); - } - if (freeit && (flags & LLE_STATIC)) { + /* + * XXXME: add fib-aware in_localip. + * We definitely don't want to switch between + * prefixes in different fibs. 
+ */ + eia = in_localip_more(target); + + if (eia != NULL) { + error = ifa_switch_loopback_route((struct ifaddr *)eia, + (struct sockaddr *)&target->ia_addr); + ifa_free(&eia->ia_ifa); + } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); - if (error == 0) - target->ia_flags &= ~IFA_RTSELF; } - if ((flags & LLE_STATIC) && - !(target->ia_ifp->if_flags & IFF_NOARP)) - /* remove arp cache */ - arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr); } - if (rtinitflags(target)) + if (rtinitflags(target)) { prefix = target->ia_dstaddr.sin_addr; - else { + mask.s_addr = 0; + } else { prefix = target->ia_addr.sin_addr; mask = target->ia_sockmask.sin_addr; prefix.s_addr &= mask.s_addr; @@ -1111,38 +816,48 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) if ((target->ia_flags & IFA_ROUTE) == 0) { int fibnum; - fibnum = rt_add_addr_allfibs ? RT_ALL_FIBS : + fibnum = V_rt_add_addr_allfibs ? RT_ALL_FIBS : target->ia_ifp->if_fib; rt_addrmsg(RTM_DELETE, &target->ia_ifa, fibnum); + + /* + * Removing address from !IFF_UP interface or + * prefix which exists on other interface (along with route). + * No entries should exist here except target addr. + * Given that, delete this entry only. + */ + in_scrubprefixlle(target, 0, flags); return (0); } - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - if (rtinitflags(ia)) + if (rtinitflags(ia)) { p = ia->ia_dstaddr.sin_addr; - else { + + if (prefix.s_addr != p.s_addr) + continue; + } else { p = ia->ia_addr.sin_addr; - p.s_addr &= ia->ia_sockmask.sin_addr.s_addr; + m = ia->ia_sockmask.sin_addr; + p.s_addr &= m.s_addr; + + if (prefix.s_addr != p.s_addr || + mask.s_addr != m.s_addr) + continue; } - if ((prefix.s_addr != p.s_addr) || - !(ia->ia_ifp->if_flags & IFF_UP)) + if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. - * - * XXX: a special case for carp(4) interface - this should - * be more generally specified as an interface that - * doesn't support such action. */ - if ((ia->ia_flags & IFA_ROUTE) == 0 - && (ia->ia_ifp->if_type != IFT_CARP)) { + if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); error = rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target)); if (error == 0) @@ -1150,6 +865,9 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); + /* Scrub all entries IFF interface is different */ + in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, + flags); error = rtinit(&ia->ia_ifa, (int)RTM_ADD, rtinitflags(ia) | RTF_UP); if (error == 0) @@ -1161,21 +879,12 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) return (error); } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * remove all L2 entries on the given prefix */ - bzero(&prefix0, sizeof(prefix0)); - prefix0.sin_len = sizeof(prefix0); - prefix0.sin_family = AF_INET; - prefix0.sin_addr.s_addr = target->ia_subnet; - bzero(&mask0, sizeof(mask0)); - mask0.sin_len = sizeof(mask0); - mask0.sin_family = AF_INET; - mask0.sin_addr.s_addr = target->ia_subnetmask; - lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0, - (struct sockaddr *)&mask0, flags); + in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. 
@@ -1190,6 +899,58 @@ in_scrubprefix(struct in_ifaddr *target, u_int flags) #undef rtinitflags +void +in_ifscrub_all(void) +{ + struct ifnet *ifp; + struct ifaddr *ifa, *nifa; + struct ifaliasreq ifr; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) { + /* Cannot lock here - lock recursion. */ + /* IF_ADDR_RLOCK(ifp); */ + TAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + + /* + * This is ugly but the only way for legacy IP to + * cleanly remove addresses and everything attached. + */ + bzero(&ifr, sizeof(ifr)); + ifr.ifra_addr = *ifa->ifa_addr; + if (ifa->ifa_dstaddr) + ifr.ifra_broadaddr = *ifa->ifa_dstaddr; + (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, + ifp, NULL); + } + /* IF_ADDR_RUNLOCK(ifp); */ + in_purgemaddrs(ifp); + igmp_domifdetach(ifp); + } + IFNET_RUNLOCK(); +} + +int +in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) +{ + + return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || + /* + * Check for old-style (host 0) broadcast, but + * taking into account that RFC 3021 obsoletes it. + */ + (ia->ia_subnetmask != IN_RFC3021_MASK && + ntohl(in.s_addr) == ia->ia_subnet)) && + /* + * Check for an all one subnetmask. These + * only exist when an interface gets a secondary + * address. + */ + ia->ia_subnetmask != (u_long)0xffffffff); +} + /* * Return 1 if the address might be a local broadcast address. */ @@ -1197,37 +958,27 @@ int in_broadcast(struct in_addr in, struct ifnet *ifp) { register struct ifaddr *ifa; - u_long t; + int found; if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (0); - t = ntohl(in.s_addr); + found = 0; /* * Look through the list of addresses for a match * with a broadcast address. */ -#define ia ((struct in_ifaddr *)ifa) + IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && - (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || - /* - * Check for old-style (host 0) broadcast, but - * taking into account that RFC 3021 obsoletes it. - */ - (ia->ia_subnetmask != IN_RFC3021_MASK && - t == ia->ia_subnet)) && - /* - * Check for an all one subnetmask. These - * only exist when an interface gets a secondary - * address. - */ - ia->ia_subnetmask != (u_long)0xffffffff) - return (1); - return (0); -#undef ia + in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) { + found = 1; + break; + } + IF_ADDR_RUNLOCK(ifp); + return (found); } /* @@ -1239,6 +990,7 @@ in_ifdetach(struct ifnet *ifp) in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); + in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); } @@ -1288,34 +1040,44 @@ in_purgemaddrs(struct ifnet *ifp) IN_MULTI_UNLOCK(); } -#include -#include - struct in_llentry { struct llentry base; - struct sockaddr_in l3_addr4; }; +#define IN_LLTBL_DEFAULT_HSIZE 32 +#define IN_LLTBL_HASH(k, h) \ + (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) + /* - * Deletes an address from the address table. - * This function is called by the timer functions - * such as arptimer() and nd6_llinfo_timer(), and - * the caller does the locking. + * Do actual deallocation of @lle. */ static void -in_lltable_free(struct lltable *llt, struct llentry *lle) +in_lltable_destroy_lle_unlocked(struct llentry *lle) { - LLE_WUNLOCK(lle); + LLE_LOCK_DESTROY(lle); + LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } +/* + * Called by LLE_FREE_LOCKED when number of references + * drops to zero. 
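The new in_ifaddr_broadcast() centralizes three broadcast rules in one predicate: the configured broadcast matches; the old host-0 form matches unless the mask is the RFC 3021 /31; and a /32 (all-ones mask) alias never yields a broadcast. The same logic in stand-alone form, with everything in host byte order (the struct and names are simplified for illustration):

#include <stdbool.h>
#include <stdint.h>

#define EX_RFC3021_MASK	0xfffffffeU	/* /31: RFC 3021, no broadcast */

struct ex_addr {
	uint32_t bcast;		/* configured broadcast address */
	uint32_t subnet;	/* subnet number */
	uint32_t mask;		/* subnet mask */
};

static bool
ex_is_broadcast(uint32_t in, const struct ex_addr *a)
{

	return ((in == a->bcast ||
	    /* old-style host-0 broadcast, unless the mask is a /31 */
	    (a->mask != EX_RFC3021_MASK && in == a->subnet)) &&
	    /* a /32 (all-ones mask) alias never has a broadcast */
	    a->mask != 0xffffffffU);
}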
+ */ +static void +in_lltable_destroy_lle(struct llentry *lle) +{ + + LLE_WUNLOCK(lle); + in_lltable_destroy_lle_unlocked(lle); +} + static struct llentry * -in_lltable_new(const struct sockaddr *l3addr, u_int flags) +in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; - lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO); + lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; @@ -1324,82 +1086,123 @@ in_lltable_new(const struct sockaddr *l3addr, u_int flags) * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ - lle->l3_addr4 = *(const struct sockaddr_in *)l3addr; + lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; - lle->base.lle_free = in_lltable_free; + lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); - callout_init_rw(&lle->base.la_timer, &lle->base.lle_lock, - CALLOUT_RETURNUNLOCKED); + LLE_REQ_INIT(&lle->base); + callout_init(&lle->base.lle_timer, 1); return (&lle->base); } -#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ - (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) +#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ + ((((d).s_addr ^ (a).s_addr) & (m).s_addr)) == 0 ) + +static int +in_lltable_match_prefix(const struct sockaddr *saddr, + const struct sockaddr *smask, u_int flags, struct llentry *lle) +{ + struct in_addr addr, mask, lle_addr; + + addr = ((const struct sockaddr_in *)saddr)->sin_addr; + mask = ((const struct sockaddr_in *)smask)->sin_addr; + lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); + + if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) + return (0); + + if (lle->la_flags & LLE_IFADDR) { + + /* + * Delete LLE_IFADDR records IFF address & flag matches. + * Note that addr is the interface address within prefix + * being matched. + * Note also we should handle 'ifdown' cases without removing + * ifaddr macs. + */ + if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) + return (1); + return (0); + } + + /* flags & LLE_STATIC means deleting both dynamic and static entries */ + if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) + return (1); + + return (0); +} static void -in_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, - const struct sockaddr *mask, u_int flags) +in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { - const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; - const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; - struct llentry *lle, *next; - int i; + struct ifnet *ifp; size_t pkts_dropped; - IF_AFDATA_WLOCK(llt->llt_ifp); - for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { - LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { - /* - * (flags & LLE_STATIC) means deleting all entries - * including static ARP entries. 
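IN_LLTBL_HASH() folds all four bytes of the key into the bucket index by repeated shift-and-xor, and the final mask assumes the table size is a power of two (IN_LLTBL_DEFAULT_HSIZE is 32). A quick exercise of the same macro outside the kernel (address and table size are example values):

#include <stdint.h>
#include <stdio.h>

#define EX_LLTBL_HASH(k, h) \
    ((((((((k) >> 8) ^ (k)) >> 8) ^ (k)) >> 8) ^ (k)) & ((h) - 1))

int
main(void)
{
	uint32_t addr = 0xc000020a;	/* 192.0.2.10, host order here */
	uint32_t hsize = 32;		/* IN_LLTBL_DEFAULT_HSIZE */

	/* every byte of the key reaches the low bits before masking */
	printf("bucket %u of %u\n",
	    (unsigned)EX_LLTBL_HASH(addr, hsize), (unsigned)hsize);
	return (0);
}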
- */ - if (IN_ARE_MASKED_ADDR_EQUAL(satosin(L3_ADDR(lle)), - pfx, msk) && ((flags & LLE_STATIC) || - !(lle->la_flags & LLE_STATIC))) { - LLE_WLOCK(lle); - if (callout_stop(&lle->la_timer)) - LLE_REMREF(lle); - pkts_dropped = llentry_free(lle); - ARPSTAT_ADD(dropped, pkts_dropped); - } - } + LLE_WLOCK_ASSERT(lle); + KASSERT(llt != NULL, ("lltable is NULL")); + + /* Unlink entry from table if not already */ + if ((lle->la_flags & LLE_LINKED) != 0) { + ifp = llt->llt_ifp; + IF_AFDATA_WLOCK_ASSERT(ifp); + lltable_unlink_entry(llt, lle); } - IF_AFDATA_WUNLOCK(llt->llt_ifp); -} + /* cancel timer */ + if (callout_stop(&lle->lle_timer) > 0) + LLE_REMREF(lle); + + /* Drop hold queue */ + pkts_dropped = llentry_free(lle); + ARPSTAT_ADD(dropped, pkts_dropped); +} static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { - struct rtentry *rt; + struct rt_addrinfo info; + struct sockaddr_in rt_key, rt_mask; + struct sockaddr rt_gateway; + int rt_flags; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); - /* XXX rtalloc1_fib should take a const param */ - rt = rtalloc1_fib(__DECONST(struct sockaddr *, l3addr), 0, 0, - ifp->if_fib); + bzero(&rt_key, sizeof(rt_key)); + rt_key.sin_len = sizeof(rt_key); + bzero(&rt_mask, sizeof(rt_mask)); + rt_mask.sin_len = sizeof(rt_mask); + bzero(&rt_gateway, sizeof(rt_gateway)); + rt_gateway.sa_len = sizeof(rt_gateway); + + bzero(&info, sizeof(info)); + info.rti_info[RTAX_DST] = (struct sockaddr *)&rt_key; + info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&rt_mask; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&rt_gateway; - if (rt == NULL) + if (rib_lookup_info(ifp->if_fib, l3addr, NHR_REF, 0, &info) != 0) return (EINVAL); + rt_flags = info.rti_flags; + /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ - if (rt->rt_flags & RTF_GATEWAY) { - if (!(rt->rt_flags & RTF_HOST) || !rt->rt_ifp || - rt->rt_ifp->if_type != IFT_ETHER || - (rt->rt_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || - memcmp(rt->rt_gateway->sa_data, l3addr->sa_data, + if (rt_flags & RTF_GATEWAY) { + if (!(rt_flags & RTF_HOST) || !info.rti_ifp || + info.rti_ifp->if_type != IFT_ETHER || + (info.rti_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || + memcmp(rt_gateway.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { - RTFREE_LOCKED(rt); + rib_free_info(&info); return (EINVAL); } } + rib_free_info(&info); /* * Make sure that at least the destination address is covered @@ -1408,21 +1211,19 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr * on one interface and the corresponding outgoing packet leaves * another interface. */ - if (!(rt->rt_flags & RTF_HOST) && rt->rt_ifp != ifp) { + if (!(rt_flags & RTF_HOST) && info.rti_ifp != ifp) { const char *sa, *mask, *addr, *lim; int len; - mask = (const char *)rt_mask(rt); + mask = (const char *)&rt_mask; /* * Just being extra cautious to avoid some custom * code getting into trouble. 
*/ - if (mask == NULL) { - RTFREE_LOCKED(rt); + if ((info.rti_addrs & RTA_NETMASK) == 0) return (EINVAL); - } - sa = (const char *)rt_key(rt); + sa = (const char *)&rt_key; addr = (const char *)l3addr; len = ((const struct sockaddr_in *)l3addr)->sin_len; lim = addr + len; @@ -1433,151 +1234,188 @@ in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); #endif - RTFREE_LOCKED(rt); return (EINVAL); } } } - RTFREE_LOCKED(rt); return (0); } -/* - * Return NULL if not found or marked for deletion. - * If found return lle read locked. - */ -static struct llentry * -in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) +static inline uint32_t +in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) +{ + + return (IN_LLTBL_HASH(dst.s_addr, hsize)); +} + +static uint32_t +in_lltable_hash(const struct llentry *lle, uint32_t hsize) +{ + + return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); +} + +static void +in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)sa; + bzero(sin, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = lle->r_l3addr.addr4; +} + +static inline struct llentry * +in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { - const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; - struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; struct llentries *lleh; - u_int hashkey; - - IF_AFDATA_LOCK_ASSERT(ifp); - KASSERT(l3addr->sa_family == AF_INET, - ("sin_family %d", l3addr->sa_family)); + u_int hashidx; - hashkey = sin->sin_addr.s_addr; - lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; + hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); + lleh = &llt->lle_head[hashidx]; LIST_FOREACH(lle, lleh, lle_next) { - struct sockaddr_in *sa2 = satosin(L3_ADDR(lle)); if (lle->la_flags & LLE_DELETED) continue; - if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr) + if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } - if (lle == NULL) { -#ifdef DIAGNOSTIC - if (flags & LLE_DELETE) - log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); -#endif - if (!(flags & LLE_CREATE)) - return (NULL); - IF_AFDATA_WLOCK_ASSERT(ifp); - /* - * A route that covers the given address must have - * been installed 1st because we are doing a resolution, - * verify this. 
- */ - if (!(flags & LLE_IFADDR) && - in_lltable_rtcheck(ifp, flags, l3addr) != 0) - goto done; - - lle = in_lltable_new(l3addr, flags); - if (lle == NULL) { - log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); - goto done; - } - lle->la_flags = flags & ~LLE_CREATE; - if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { - bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); - lle->la_flags |= (LLE_VALID | LLE_STATIC); - } - lle->lle_tbl = llt; - lle->lle_head = lleh; - lle->la_flags |= LLE_LINKED; - LIST_INSERT_HEAD(lleh, lle, lle_next); - } else if (flags & LLE_DELETE) { - if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { - LLE_WLOCK(lle); - lle->la_flags |= LLE_DELETED; - EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); + return (lle); +} + +static void +in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) +{ + + lle->la_flags |= LLE_DELETED; + EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC - log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); + log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif - if ((lle->la_flags & - (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) - llentry_free(lle); - else - LLE_WUNLOCK(lle); - } - lle = (void *)-1; + llentry_free(lle); +} + +static struct llentry * +in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; + struct ifnet *ifp = llt->llt_ifp; + struct llentry *lle; + char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; + KASSERT(l3addr->sa_family == AF_INET, + ("sin_family %d", l3addr->sa_family)); + + /* + * A route that covers the given address must have + * been installed 1st because we are doing a resolution, + * verify this. + */ + if (!(flags & LLE_IFADDR) && + in_lltable_rtcheck(ifp, flags, l3addr) != 0) + return (NULL); + + lle = in_lltable_new(sin->sin_addr, flags); + if (lle == NULL) { + log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); + return (NULL); } - if (LLE_IS_VALID(lle)) { - if (flags & LLE_EXCLUSIVE) - LLE_WLOCK(lle); - else - LLE_RLOCK(lle); + lle->la_flags = flags; + if (flags & LLE_STATIC) + lle->r_flags |= RLLE_VALID; + if ((flags & LLE_IFADDR) == LLE_IFADDR) { + linkhdrsize = LLE_MAX_LINKHDR; + if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), + linkhdr, &linkhdrsize, &lladdr_off) != 0) { + in_lltable_destroy_lle_unlocked(lle); + return (NULL); + } + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, + lladdr_off); + lle->la_flags |= LLE_STATIC; + lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); } -done: + + return (lle); +} + +/* + * Return NULL if not found or marked for deletion. + * If found return lle read locked. 
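in_lltable_lookup(), defined next, supports an LLE_UNLOCKED mode besides the usual read/write locking. A hypothetical fast-path caller is sketched below; it assumes the r_linkdata/r_hdrlen cached-header fields from net/if_llatbl.h and a caller-provided linkhdr buffer, and the unlocked entry may only be touched while the afdata lock is held:

	struct llentry *lle;

	IF_AFDATA_RLOCK(ifp);
	lle = in_lltable_lookup(llt, LLE_UNLOCKED, l3addr);
	if (lle != NULL && (lle->r_flags & RLLE_VALID) != 0)
		bcopy(lle->r_linkdata, linkhdr, lle->r_hdrlen);
	IF_AFDATA_RUNLOCK(ifp);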
+ */ +static struct llentry * +in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; + struct llentry *lle; + + IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); + KASSERT(l3addr->sa_family == AF_INET, + ("sin_family %d", l3addr->sa_family)); + lle = in_lltable_find_dst(llt, sin->sin_addr); + + if (lle == NULL) + return (NULL); + + KASSERT((flags & (LLE_UNLOCKED|LLE_EXCLUSIVE)) != + (LLE_UNLOCKED|LLE_EXCLUSIVE),("wrong lle request flags: 0x%X", + flags)); + + if (flags & LLE_UNLOCKED) + return (lle); + + if (flags & LLE_EXCLUSIVE) + LLE_WLOCK(lle); + else + LLE_RLOCK(lle); + return (lle); } static int -in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) +in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, + struct sysctl_req *wr) { -#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle)) struct ifnet *ifp = llt->llt_ifp; - struct llentry *lle; /* XXX stack use */ struct { struct rt_msghdr rtm; - struct sockaddr_inarp sin; + struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; - int error, i; - - LLTABLE_LOCK_ASSERT(); - - error = 0; - for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { - LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { - struct sockaddr_dl *sdl; + struct sockaddr_dl *sdl; + int error; + bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) - continue; + return (0); /* Skip if jailed and not a valid IP of the prison. */ - if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) - continue; + lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); + if (prison_if(wr->td->td_ucred, + (struct sockaddr *)&arpc.sin) != 0) + return (0); /* * produce a msg made of: * struct rt_msghdr; - * struct sockaddr_inarp; (IPv4) + * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ - bzero(&arpc, sizeof(arpc)); arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; - arpc.sin.sin_family = AF_INET; - arpc.sin.sin_len = sizeof(arpc.sin); - arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr; /* publish */ - if (lle->la_flags & LLE_PUB) { + if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; - /* proxy only */ - if (lle->la_flags & LLE_PROXY) - arpc.sin.sin_other = SIN_PROXY; - } sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; @@ -1586,7 +1424,7 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; - bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); + bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); @@ -1597,35 +1435,47 @@ in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; + if (lle->la_flags & LLE_IFADDR) + arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); - if (error) - break; - } - } - return error; -#undef SIN + + return (error); +} + +static struct lltable * +in_lltattach(struct ifnet *ifp) +{ + struct lltable *llt; + + llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); + llt->llt_af = AF_INET; + llt->llt_ifp = ifp; + + llt->llt_lookup = in_lltable_lookup; + llt->llt_alloc_entry = in_lltable_alloc; + llt->llt_delete_entry = 
in_lltable_delete_entry; + llt->llt_dump_entry = in_lltable_dump_entry; + llt->llt_hash = in_lltable_hash; + llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; + llt->llt_free_entry = in_lltable_free_entry; + llt->llt_match_prefix = in_lltable_match_prefix; + lltable_link(llt); + + return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; - struct lltable *llt; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); - llt = lltable_init(ifp, AF_INET); - if (llt != NULL) { - llt->llt_prefix_free = in_lltable_prefix_free; - llt->llt_lookup = in_lltable_lookup; - llt->llt_dump = in_lltable_dump; - } - ii->ii_llt = llt; - + ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); - return ii; + return (ii); } void diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h index 06f9b793..b06e3334 100644 --- a/freebsd/sys/netinet/in.h +++ b/freebsd/sys/netinet/in.h @@ -47,8 +47,8 @@ #define IPPROTO_TCP 6 /* tcp */ #define IPPROTO_UDP 17 /* user datagram protocol */ -#define INADDR_ANY (u_int32_t)0x00000000 -#define INADDR_BROADCAST (u_int32_t)0xffffffff /* must be masked */ +#define INADDR_ANY ((in_addr_t)0x00000000) +#define INADDR_BROADCAST ((in_addr_t)0xffffffff) /* must be masked */ #ifndef _UINT8_T_DECLARED typedef __uint8_t uint8_t; @@ -104,7 +104,7 @@ struct sockaddr_in { char sin_zero[8]; }; -#if !defined(_KERNEL) && __BSD_VISIBLE +#if !defined(_KERNEL) && __POSIX_VISIBLE >= 200112 #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED @@ -124,7 +124,7 @@ __END_DECLS #define ntohs(x) __ntohs(x) #endif -#endif /* !_KERNEL && __BSD_VISIBLE */ +#endif /* !_KERNEL && __POSIX_VISIBLE >= 200112 */ #if __POSIX_VISIBLE >= 200112 #define IPPROTO_IPV6 41 /* IP6 header */ @@ -241,12 +241,17 @@ __END_DECLS #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_SCTP 132 /* SCTP */ #define IPPROTO_MH 135 /* IPv6 Mobility Header */ +#define IPPROTO_UDPLITE 136 /* UDP-Lite */ +#define IPPROTO_HIP 139 /* IP6 Host Identity Protocol */ +#define IPPROTO_SHIM6 140 /* IP6 Shim6 Protocol */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_MPLS 137 /* MPLS-in-IP */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ +#define IPPROTO_RESERVED_253 253 /* Reserved */ +#define IPPROTO_RESERVED_254 254 /* Reserved */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion, no longer used */ #define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */ @@ -343,61 +348,61 @@ __END_DECLS * On subnets, the decomposition of addresses to host and net parts * is done according to subnet mask, not the masks here. 
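The class macros below are retyped from u_int32_t to in_addr_t and fully parenthesized; their contract is unchanged, and they still expect the address in host byte order, so callers convert first. A typical guard, wrapped here in a hypothetical helper:

	static int
	ip_dst_is_mcast(const struct mbuf *m)	/* illustrative helper */
	{
		const struct ip *ip = mtod(m, const struct ip *);

		/* Class tests take host order, hence the ntohl(). */
		return (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)));
	}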
*/ -#define IN_CLASSA(i) (((u_int32_t)(i) & 0x80000000) == 0) +#define IN_CLASSA(i) (((in_addr_t)(i) & 0x80000000) == 0) #define IN_CLASSA_NET 0xff000000 #define IN_CLASSA_NSHIFT 24 #define IN_CLASSA_HOST 0x00ffffff #define IN_CLASSA_MAX 128 -#define IN_CLASSB(i) (((u_int32_t)(i) & 0xc0000000) == 0x80000000) +#define IN_CLASSB(i) (((in_addr_t)(i) & 0xc0000000) == 0x80000000) #define IN_CLASSB_NET 0xffff0000 #define IN_CLASSB_NSHIFT 16 #define IN_CLASSB_HOST 0x0000ffff #define IN_CLASSB_MAX 65536 -#define IN_CLASSC(i) (((u_int32_t)(i) & 0xe0000000) == 0xc0000000) +#define IN_CLASSC(i) (((in_addr_t)(i) & 0xe0000000) == 0xc0000000) #define IN_CLASSC_NET 0xffffff00 #define IN_CLASSC_NSHIFT 8 #define IN_CLASSC_HOST 0x000000ff -#define IN_CLASSD(i) (((u_int32_t)(i) & 0xf0000000) == 0xe0000000) +#define IN_CLASSD(i) (((in_addr_t)(i) & 0xf0000000) == 0xe0000000) #define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ #define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ #define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ #define IN_MULTICAST(i) IN_CLASSD(i) -#define IN_EXPERIMENTAL(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) -#define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) +#define IN_EXPERIMENTAL(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) +#define IN_BADCLASS(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) -#define IN_LINKLOCAL(i) (((u_int32_t)(i) & 0xffff0000) == 0xa9fe0000) -#define IN_LOOPBACK(i) (((u_int32_t)(i) & 0xff000000) == 0x7f000000) -#define IN_ZERONET(i) (((u_int32_t)(i) & 0xff000000) == 0) +#define IN_LINKLOCAL(i) (((in_addr_t)(i) & 0xffff0000) == 0xa9fe0000) +#define IN_LOOPBACK(i) (((in_addr_t)(i) & 0xff000000) == 0x7f000000) +#define IN_ZERONET(i) (((in_addr_t)(i) & 0xff000000) == 0) -#define IN_PRIVATE(i) ((((u_int32_t)(i) & 0xff000000) == 0x0a000000) || \ - (((u_int32_t)(i) & 0xfff00000) == 0xac100000) || \ - (((u_int32_t)(i) & 0xffff0000) == 0xc0a80000)) +#define IN_PRIVATE(i) ((((in_addr_t)(i) & 0xff000000) == 0x0a000000) || \ + (((in_addr_t)(i) & 0xfff00000) == 0xac100000) || \ + (((in_addr_t)(i) & 0xffff0000) == 0xc0a80000)) -#define IN_LOCAL_GROUP(i) (((u_int32_t)(i) & 0xffffff00) == 0xe0000000) +#define IN_LOCAL_GROUP(i) (((in_addr_t)(i) & 0xffffff00) == 0xe0000000) #define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i)) -#define INADDR_LOOPBACK (u_int32_t)0x7f000001 +#define INADDR_LOOPBACK ((in_addr_t)0x7f000001) #ifndef _KERNEL -#define INADDR_NONE 0xffffffff /* -1 return */ +#define INADDR_NONE ((in_addr_t)0xffffffff) /* -1 return */ #endif -#define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ -#define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ -#define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ -#define INADDR_ALLRPTS_GROUP (u_int32_t)0xe0000016 /* 224.0.0.22, IGMPv3 */ -#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */ -#define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */ -#define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */ -#define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ +#define INADDR_UNSPEC_GROUP ((in_addr_t)0xe0000000) /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP ((in_addr_t)0xe0000001) /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP ((in_addr_t)0xe0000002) /* 224.0.0.2 */ +#define INADDR_ALLRPTS_GROUP ((in_addr_t)0xe0000016) /* 224.0.0.22, IGMPv3 */ +#define INADDR_CARP_GROUP ((in_addr_t)0xe0000012) /* 224.0.0.18 */ +#define INADDR_PFSYNC_GROUP ((in_addr_t)0xe00000f0) /* 224.0.0.240 
*/ +#define INADDR_ALLMDNS_GROUP ((in_addr_t)0xe00000fb) /* 224.0.0.251 */ +#define INADDR_MAX_LOCAL_GROUP ((in_addr_t)0xe00000ff) /* 224.0.0.255 */ #define IN_LOOPBACKNET 127 /* official! */ -#define IN_RFC3021_MASK (u_int32_t)0xfffffffe +#define IN_RFC3021_MASK ((in_addr_t)0xfffffffe) /* * Options for use with [gs]etsockopt at the IP level. @@ -427,10 +432,11 @@ __END_DECLS #define IP_RECVIF 20 /* bool; receive reception if w/dgram */ /* for IPSEC */ #define IP_IPSEC_POLICY 21 /* int; set/get security policy */ -#define IP_FAITH 22 /* bool; accept FAITH'ed connections */ - + /* unused; was IP_FAITH */ #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_BINDANY 24 /* bool: allow bind to any address */ +#define IP_BINDMULTI 25 /* bool: allow multiple listeners on a tuple */ +#define IP_RSS_LISTEN_BUCKET 26 /* int; set RSS listen bucket */ /* * Options for controlling the firewall and dummynet. @@ -485,6 +491,13 @@ __END_DECLS #define MCAST_BLOCK_SOURCE 84 /* block a source */ #define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ +/* Flow and RSS definitions */ +#define IP_FLOWID 90 /* get flow id for the given socket/inp */ +#define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */ +#define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */ +#define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */ +#define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */ + /* * Defaults and limits for options */ @@ -602,86 +615,7 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ /* - * Definitions for inet sysctl operations. - * - * Third level is protocol number. - * Fourth level is desired variable within that protocol. - */ -#define IPPROTO_MAXID (IPPROTO_AH + 1) /* don't list to IPPROTO_MAX */ - -#define CTL_IPPROTO_NAMES { \ - { "ip", CTLTYPE_NODE }, \ - { "icmp", CTLTYPE_NODE }, \ - { "igmp", CTLTYPE_NODE }, \ - { "ggp", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "tcp", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { "egp", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "pup", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "udp", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "idp", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "ipsec", CTLTYPE_NODE }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { 0, 0 }, \ - { "pim", CTLTYPE_NODE }, \ -} - -/* - * Names for IP sysctl objects + * Identifiers for IP sysctl nodes */ #define IPCTL_FORWARDING 1 /* act as router */ #define IPCTL_SENDREDIRECTS 2 /* may send redirects when 
forwarding */ @@ -689,9 +623,9 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, #ifdef notyet #define IPCTL_DEFMTU 4 /* default MTU */ #endif -#define IPCTL_RTEXPIRE 5 /* cloned route expiration time */ -#define IPCTL_RTMINEXPIRE 6 /* min value for expiration time */ -#define IPCTL_RTMAXCACHE 7 /* trigger level for dynamic expire */ +/* IPCTL_RTEXPIRE 5 deprecated */ +/* IPCTL_RTMINEXPIRE 6 deprecated */ +/* IPCTL_RTMAXCACHE 7 deprecated */ #define IPCTL_SOURCEROUTE 8 /* may perform source routes */ #define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ #define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ @@ -699,38 +633,22 @@ int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, #define IPCTL_STATS 12 /* ipstat structure */ #define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ #define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ -#define IPCTL_KEEPFAITH 15 /* FAITH IPv4->IPv6 translater ctl */ + /* 15, unused, was: IPCTL_KEEPFAITH */ #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ -#define IPCTL_MAXID 17 - -#define IPCTL_NAMES { \ - { 0, 0 }, \ - { "forwarding", CTLTYPE_INT }, \ - { "redirect", CTLTYPE_INT }, \ - { "ttl", CTLTYPE_INT }, \ - { "mtu", CTLTYPE_INT }, \ - { "rtexpire", CTLTYPE_INT }, \ - { "rtminexpire", CTLTYPE_INT }, \ - { "rtmaxcache", CTLTYPE_INT }, \ - { "sourceroute", CTLTYPE_INT }, \ - { "directed-broadcast", CTLTYPE_INT }, \ - { "intr-queue-maxlen", CTLTYPE_INT }, \ - { "intr-queue-drops", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ - { "accept_sourceroute", CTLTYPE_INT }, \ - { "fastforwarding", CTLTYPE_INT }, \ -} #endif /* __BSD_VISIBLE */ #ifdef _KERNEL struct ifnet; struct mbuf; /* forward declarations for Standard C */ +struct in_ifaddr; int in_broadcast(struct in_addr, struct ifnet *); +int in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *); int in_canforward(struct in_addr); int in_localaddr(struct in_addr); int in_localip(struct in_addr); +int in_ifhasaddr(struct ifnet *, struct in_addr); int inet_aton(const char *, struct in_addr *); /* in libkern */ char *inet_ntoa(struct in_addr); /* in libkern */ char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ @@ -745,33 +663,6 @@ void in_ifdetach(struct ifnet *); #define satosin(sa) ((struct sockaddr_in *)(sa)) #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) - -/* - * Historically, BSD keeps ip_len and ip_off in host format - * when doing layer 3 processing, and this often requires - * to translate the format back and forth. - * To make the process explicit, we define a couple of macros - * that also take into account the fact that at some point - * we may want to keep those fields always in net format. 
- */ - -#if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) -#define SET_NET_IPLEN(p) do {} while (0) -#define SET_HOST_IPLEN(p) do {} while (0) -#else -#define SET_NET_IPLEN(p) do { \ - struct ip *h_ip = (p); \ - h_ip->ip_len = htons(h_ip->ip_len); \ - h_ip->ip_off = htons(h_ip->ip_off); \ - } while (0) - -#define SET_HOST_IPLEN(p) do { \ - struct ip *h_ip = (p); \ - h_ip->ip_len = ntohs(h_ip->ip_len); \ - h_ip->ip_off = ntohs(h_ip->ip_off); \ - } while (0) -#endif /* !HAVE_NET_IPLEN */ - #endif /* _KERNEL */ /* INET6 stuff */ diff --git a/freebsd/sys/netinet/in_fib.c b/freebsd/sys/netinet/in_fib.c new file mode 100644 index 00000000..f1edf976 --- /dev/null +++ b/freebsd/sys/netinet/in_fib.c @@ -0,0 +1,235 @@ +#include + +/*- + * Copyright (c) 2015 + * Alexander V. Chernikov + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include 
+__FBSDID("$FreeBSD$");
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#ifdef RADIX_MPATH
+#include 
+#endif
+
+#include 
+#include 
+#include 
+
+#ifdef INET
+static void fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+    uint32_t flags, struct nhop4_basic *pnh4);
+static void fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+    uint32_t flags, struct nhop4_extended *pnh4);
+
+#define	RNTORT(p)	((struct rtentry *)(p))
+
+static void
+fib4_rte_to_nh_basic(struct rtentry *rte, struct in_addr dst,
+    uint32_t flags, struct nhop4_basic *pnh4)
+{
+	struct sockaddr_in *gw;
+
+	if ((flags & NHR_IFAIF) != 0)
+		pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+	else
+		pnh4->nh_ifp = rte->rt_ifp;
+	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw = (struct sockaddr_in *)rte->rt_gateway;
+		pnh4->nh_addr = gw->sin_addr;
+	} else
+		pnh4->nh_addr = dst;
+	/* Set flags */
+	pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
+	gw = (struct sockaddr_in *)rt_key(rte);
+	if (gw->sin_addr.s_addr == 0)
+		pnh4->nh_flags |= NHF_DEFAULT;
+	/* TODO: Handle RTF_BROADCAST here */
+}
+
+static void
+fib4_rte_to_nh_extended(struct rtentry *rte, struct in_addr dst,
+    uint32_t flags, struct nhop4_extended *pnh4)
+{
+	struct sockaddr_in *gw;
+	struct in_ifaddr *ia;
+
+	if ((flags & NHR_IFAIF) != 0)
+		pnh4->nh_ifp = rte->rt_ifa->ifa_ifp;
+	else
+		pnh4->nh_ifp = rte->rt_ifp;
+	pnh4->nh_mtu = min(rte->rt_mtu, rte->rt_ifp->if_mtu);
+	if (rte->rt_flags & RTF_GATEWAY) {
+		gw = (struct sockaddr_in *)rte->rt_gateway;
+		pnh4->nh_addr = gw->sin_addr;
+	} else
+		pnh4->nh_addr = dst;
+	/* Set flags */
+	pnh4->nh_flags = fib_rte_to_nh_flags(rte->rt_flags);
+	gw = (struct sockaddr_in *)rt_key(rte);
+	if (gw->sin_addr.s_addr == 0)
+		pnh4->nh_flags |= NHF_DEFAULT;
+	/* XXX: Set RTF_BROADCAST if GW address is broadcast */
+
+	ia = ifatoia(rte->rt_ifa);
+	pnh4->nh_src = IA_SIN(ia)->sin_addr;
+}
+
+/*
+ * Performs IPv4 route table lookup on @dst. Returns 0 on success.
+ * Stores nexthop info in the provided @pnh4 structure.
+ * Note that
+ * - nh_ifp cannot be safely dereferenced
+ * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if
+ *   looking up address on interface "ix0" pointer to "lo0" interface
+ *   will be returned instead of "ix0")
+ * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
+ * - however mtu from "transmit" interface will be returned.
+ */
+int
+fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+    uint32_t flowid, struct nhop4_basic *pnh4)
+{
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct sockaddr_in sin;
+	struct rtentry *rte;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_basic: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (ENOENT);
+
+	/* Prepare lookup key */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_len = sizeof(struct sockaddr_in);
+	sin.sin_addr = dst;
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rte = RNTORT(rn);
+		/* Ensure route & ifp is UP */
+		if (RT_LINK_IS_UP(rte->rt_ifp)) {
+			fib4_rte_to_nh_basic(rte, dst, flags, pnh4);
+			RIB_RUNLOCK(rh);
+
+			return (0);
+		}
+	}
+	RIB_RUNLOCK(rh);
+
+	return (ENOENT);
+}
+
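A caller-side sketch of the two lookup entry points (the RT_DEFAULT_FIB constant, the documentation address and the printf() are illustrative; inet_ntoa() is the libkern helper declared in the in.h hunk above). With NHR_REF, the extended result must be released via fib4_free_nh_ext():

	struct nhop4_basic nh4;
	struct nhop4_extended nh4e;
	struct in_addr dst;

	dst.s_addr = htonl(0xc6336401);	/* 198.51.100.1 */

	/* Basic form: next-hop address (gateway or on-link dst) and path MTU. */
	if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, dst, 0, 0, &nh4) == 0)
		printf("via %s, mtu %u\n", inet_ntoa(nh4.nh_addr),
		    (unsigned)nh4.nh_mtu);

	/* Extended form additionally yields a default source address. */
	if (fib4_lookup_nh_ext(RT_DEFAULT_FIB, dst, NHR_REF, 0, &nh4e) == 0) {
		/* use nh4e.nh_src here */
		fib4_free_nh_ext(RT_DEFAULT_FIB, &nh4e);
	}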
+/*
+ * Performs IPv4 route table lookup on @dst. Returns 0 on success.
+ * Stores extended nexthop info in the provided @pnh4 structure.
+ * Note that
+ * - nh_ifp cannot be safely dereferenced unless NHR_REF is specified.
+ * - in that case you need to call fib4_free_nh_ext()
+ * - nh_ifp represents logical transmit interface (rt_ifp) (e.g. if
+ *   looking up address on interface "ix0" pointer to "lo0" interface
+ *   will be returned instead of "ix0")
+ * - nh_ifp represents "address" interface if NHR_IFAIF flag is passed
+ * - however mtu from "transmit" interface will be returned.
+ */
+int
+fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags,
+    uint32_t flowid, struct nhop4_extended *pnh4)
+{
+	struct rib_head *rh;
+	struct radix_node *rn;
+	struct sockaddr_in sin;
+	struct rtentry *rte;
+
+	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_nh_ext: bad fibnum"));
+	rh = rt_tables_get_rnh(fibnum, AF_INET);
+	if (rh == NULL)
+		return (ENOENT);
+
+	/* Prepare lookup key */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_len = sizeof(struct sockaddr_in);
+	sin.sin_addr = dst;
+
+	RIB_RLOCK(rh);
+	rn = rh->rnh_matchaddr((void *)&sin, &rh->head);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rte = RNTORT(rn);
+#ifdef RADIX_MPATH
+		rte = rt_mpath_select(rte, flowid);
+		if (rte == NULL) {
+			RIB_RUNLOCK(rh);
+			return (ENOENT);
+		}
+#endif
+		/* Ensure route & ifp is UP */
+		if (RT_LINK_IS_UP(rte->rt_ifp)) {
+			fib4_rte_to_nh_extended(rte, dst, flags, pnh4);
+			if ((flags & NHR_REF) != 0) {
+				/* TODO: lwref on egress ifp's ? */
+			}
+			RIB_RUNLOCK(rh);
+
+			return (0);
+		}
+	}
+	RIB_RUNLOCK(rh);
+
+	return (ENOENT);
+}
+
+void
+fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4)
+{
+
+}
+
+#endif
diff --git a/freebsd/sys/netinet/in_fib.h b/freebsd/sys/netinet/in_fib.h
new file mode 100644
index 00000000..754a2e3c
--- /dev/null
+++ b/freebsd/sys/netinet/in_fib.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2015
+ * 	Alexander V. Chernikov
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_FIB_H_ +#define _NETINET_IN_FIB_H_ + +/* Basic nexthop info used for uRPF/mtu checks */ +struct nhop4_basic { + struct ifnet *nh_ifp; /* Logical egress interface */ + uint16_t nh_mtu; /* nexthop mtu */ + uint16_t nh_flags; /* nhop flags */ + struct in_addr nh_addr; /* GW/DST IPv4 address */ +}; + +/* Extended nexthop info used for control protocols */ +struct nhop4_extended { + struct ifnet *nh_ifp; /* Logical egress interface */ + uint16_t nh_mtu; /* nexthop mtu */ + uint16_t nh_flags; /* nhop flags */ + uint8_t spare[4]; + struct in_addr nh_addr; /* GW/DST IPv4 address */ + struct in_addr nh_src; /* default source IPv4 address */ + uint64_t spare2[2]; +}; + +int fib4_lookup_nh_basic(uint32_t fibnum, struct in_addr dst, uint32_t flags, + uint32_t flowid, struct nhop4_basic *pnh4); +int fib4_lookup_nh_ext(uint32_t fibnum, struct in_addr dst, uint32_t flags, + uint32_t flowid, struct nhop4_extended *pnh4); +void fib4_free_nh_ext(uint32_t fibnum, struct nhop4_extended *pnh4); + +#endif + diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c index 332d7ff4..02e2efd8 100644 --- a/freebsd/sys/netinet/in_gif.c +++ b/freebsd/sys/netinet/in_gif.c @@ -1,7 +1,5 @@ #include -/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ - /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. @@ -29,16 +27,19 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); -#include #include #include #include +#include +#include #include #include #include @@ -50,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -57,162 +59,56 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include +#include #ifdef INET6 #include #endif -#ifdef MROUTING -#include -#endif /* MROUTING */ - -#include +#include -static int gif_validate4(const struct ip *, struct gif_softc *, - struct ifnet *); +static int in_gif_input(struct mbuf **, int *, int); extern struct domain inetdomain; -struct protosw in_gif_protosw = { +static struct protosw in_gif_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, .pr_protocol = 0/* IPPROTO_IPV[46] */, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = in_gif_input, - .pr_output = (pr_output_t*)rip_output, + .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; -VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; +#define GIF_TTL 30 +static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL; #define V_ip_gif_ttl VNET(ip_gif_ttl) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_gif_ttl), 0, ""); int -in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) +in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn) { + GIF_RLOCK_TRACKER; struct gif_softc *sc = ifp->if_softc; - struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst; - struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc; - struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst; - struct ip iphdr; /* capsule IP header, host byte ordered */ - struct etherip_header eiphdr; - int error, len, proto; - u_int8_t tos; - - GIF_LOCK_ASSERT(sc); - - if (sin_src == NULL || sin_dst == NULL || - sin_src->sin_family != 
AF_INET || - sin_dst->sin_family != AF_INET) { - m_freem(m); - return EAFNOSUPPORT; - } - - switch (family) { -#ifdef INET - case AF_INET: - { - struct ip *ip; - - proto = IPPROTO_IPV4; - if (m->m_len < sizeof(*ip)) { - m = m_pullup(m, sizeof(*ip)); - if (!m) - return ENOBUFS; - } - ip = mtod(m, struct ip *); - tos = ip->ip_tos; - break; - } -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - { - struct ip6_hdr *ip6; - proto = IPPROTO_IPV6; - if (m->m_len < sizeof(*ip6)) { - m = m_pullup(m, sizeof(*ip6)); - if (!m) - return ENOBUFS; - } - ip6 = mtod(m, struct ip6_hdr *); - tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; - break; - } -#endif /* INET6 */ - case AF_LINK: - proto = IPPROTO_ETHERIP; - - /* - * GIF_SEND_REVETHIP (disabled by default) intentionally - * sends an EtherIP packet with revered version field in - * the header. This is a knob for backward compatibility - * with FreeBSD 7.2R or prior. - */ - if ((sc->gif_options & GIF_SEND_REVETHIP)) { - eiphdr.eip_ver = 0; - eiphdr.eip_resvl = ETHERIP_VERSION; - eiphdr.eip_resvh = 0; - } else { - eiphdr.eip_ver = ETHERIP_VERSION; - eiphdr.eip_resvl = 0; - eiphdr.eip_resvh = 0; - } - /* prepend Ethernet-in-IP header */ - M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT); - if (m && m->m_len < sizeof(struct etherip_header)) - m = m_pullup(m, sizeof(struct etherip_header)); - if (m == NULL) - return ENOBUFS; - bcopy(&eiphdr, mtod(m, struct etherip_header *), - sizeof(struct etherip_header)); - break; - - default: -#ifdef DEBUG - printf("in_gif_output: warning: unknown family %d passed\n", - family); -#endif - m_freem(m); - return EAFNOSUPPORT; - } - - bzero(&iphdr, sizeof(iphdr)); - iphdr.ip_src = sin_src->sin_addr; - /* bidirectional configured tunnel mode */ - if (sin_dst->sin_addr.s_addr != INADDR_ANY) - iphdr.ip_dst = sin_dst->sin_addr; - else { - m_freem(m); - return ENETUNREACH; - } - iphdr.ip_p = proto; - /* version will be set in ip_output() */ - iphdr.ip_ttl = V_ip_gif_ttl; - iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); - ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? 
ECN_ALLOWED : ECN_NOCARE, - &iphdr.ip_tos, &tos); + struct ip *ip; + int len; /* prepend new IP header */ len = sizeof(struct ip); #ifndef __NO_STRICT_ALIGNMENT - if (family == AF_LINK) + if (proto == IPPROTO_ETHERIP) len += ETHERIP_ALIGN; #endif - M_PREPEND(m, len, M_DONTWAIT); - if (m != NULL && m->m_len < len) - m = m_pullup(m, len); - if (m == NULL) { - printf("ENOBUFS in in_gif_output %d\n", __LINE__); - return ENOBUFS; - } + M_PREPEND(m, len, M_NOWAIT); + if (m == NULL) + return (ENOBUFS); #ifndef __NO_STRICT_ALIGNMENT - if (family == AF_LINK) { + if (proto == IPPROTO_ETHERIP) { len = mtod(m, vm_offset_t) & 3; KASSERT(len == 0 || len == ETHERIP_ALIGN, ("in_gif_output: unexpected misalignment")); @@ -220,212 +116,51 @@ in_gif_output(struct ifnet *ifp, int family, struct mbuf *m) m->m_len -= ETHERIP_ALIGN; } #endif - bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); - - M_SETFIB(m, sc->gif_fibnum); - - if (dst->sin_family != sin_dst->sin_family || - dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { - /* cache route doesn't match */ - bzero(dst, sizeof(*dst)); - dst->sin_family = sin_dst->sin_family; - dst->sin_len = sizeof(struct sockaddr_in); - dst->sin_addr = sin_dst->sin_addr; - if (sc->gif_ro.ro_rt) { - RTFREE(sc->gif_ro.ro_rt); - sc->gif_ro.ro_rt = NULL; - } -#if 0 - GIF2IFP(sc)->if_mtu = GIF_MTU; -#endif - } - - if (sc->gif_ro.ro_rt == NULL) { - in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum); - if (sc->gif_ro.ro_rt == NULL) { - m_freem(m); - return ENETUNREACH; - } - - /* if it constitutes infinite encapsulation, punt. */ - if (sc->gif_ro.ro_rt->rt_ifp == ifp) { - m_freem(m); - return ENETUNREACH; /* XXX */ - } -#if 0 - ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu - - sizeof(struct ip); -#endif + ip = mtod(m, struct ip *); + GIF_RLOCK(sc); + if (sc->gif_family != AF_INET) { + m_freem(m); + GIF_RUNLOCK(sc); + return (ENETDOWN); } + bcopy(sc->gif_iphdr, ip, sizeof(struct ip)); + GIF_RUNLOCK(sc); - m_addr_changed(m); - - error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); - - if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) && - sc->gif_ro.ro_rt != NULL) { - RTFREE(sc->gif_ro.ro_rt); - sc->gif_ro.ro_rt = NULL; - } + ip->ip_p = proto; + /* version will be set in ip_output() */ + ip->ip_ttl = V_ip_gif_ttl; + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_tos = ecn; - return (error); + return (ip_output(m, NULL, NULL, 0, NULL, NULL)); } -void -in_gif_input(struct mbuf *m, int off) +static int +in_gif_input(struct mbuf **mp, int *offp, int proto) { - struct ifnet *gifp = NULL; + struct mbuf *m = *mp; struct gif_softc *sc; + struct ifnet *gifp; struct ip *ip; - int af; - u_int8_t otos; - int proto; + uint8_t ecn; - ip = mtod(m, struct ip *); - proto = ip->ip_p; - - sc = (struct gif_softc *)encap_getarg(m); + sc = encap_getarg(m); if (sc == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); - return; + return (IPPROTO_DONE); } - gifp = GIF2IFP(sc); - if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { - m_freem(m); - KMOD_IPSTAT_INC(ips_nogif); - return; - } - - otos = ip->ip_tos; - m_adj(m, off); - - switch (proto) { -#ifdef INET - case IPPROTO_IPV4: - { - struct ip *ip; - af = AF_INET; - if (m->m_len < sizeof(*ip)) { - m = m_pullup(m, sizeof(*ip)); - if (!m) - return; - } + if ((gifp->if_flags & IFF_UP) != 0) { ip = mtod(m, struct ip *); - if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? 
- ECN_ALLOWED : ECN_NOCARE, - &otos, &ip->ip_tos) == 0) { - m_freem(m); - return; - } - break; - } -#endif -#ifdef INET6 - case IPPROTO_IPV6: - { - struct ip6_hdr *ip6; - u_int8_t itos, oitos; - - af = AF_INET6; - if (m->m_len < sizeof(*ip6)) { - m = m_pullup(m, sizeof(*ip6)); - if (!m) - return; - } - ip6 = mtod(m, struct ip6_hdr *); - itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; - if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? - ECN_ALLOWED : ECN_NOCARE, - &otos, &itos) == 0) { - m_freem(m); - return; - } - if (itos != oitos) { - ip6->ip6_flow &= ~htonl(0xff << 20); - ip6->ip6_flow |= htonl((u_int32_t)itos << 20); - } - break; - } -#endif /* INET6 */ - case IPPROTO_ETHERIP: - af = AF_LINK; - break; - - default: - KMOD_IPSTAT_INC(ips_nogif); + ecn = ip->ip_tos; + m_adj(m, *offp); + gif_input(m, gifp, proto, ecn); + } else { m_freem(m); - return; - } - gif_input(m, af, gifp); - return; -} - -/* - * validate outer address. - */ -static int -gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) -{ - struct sockaddr_in *src, *dst; - struct in_ifaddr *ia4; - - src = (struct sockaddr_in *)sc->gif_psrc; - dst = (struct sockaddr_in *)sc->gif_pdst; - - /* check for address match */ - if (src->sin_addr.s_addr != ip->ip_dst.s_addr || - dst->sin_addr.s_addr != ip->ip_src.s_addr) - return 0; - - /* martian filters on outer source - NOT done in ip_input! */ - if (IN_MULTICAST(ntohl(ip->ip_src.s_addr))) - return 0; - switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) { - case 0: case 127: case 255: - return 0; - } - - /* reject packets with broadcast on source */ - /* XXXRW: should use hash lists? */ - IN_IFADDR_RLOCK(); - TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { - if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) - continue; - if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { - IN_IFADDR_RUNLOCK(); - return 0; - } - } - IN_IFADDR_RUNLOCK(); - - /* ingress filters on outer source */ - if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { - struct sockaddr_in sin; - struct rtentry *rt; - - bzero(&sin, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(struct sockaddr_in); - sin.sin_addr = ip->ip_src; - /* XXX MRT check for the interface we would use on output */ - rt = in_rtalloc1((struct sockaddr *)&sin, 0, - 0UL, sc->gif_fibnum); - if (!rt || rt->rt_ifp != ifp) { -#if 0 - log(LOG_WARNING, "%s: packet from 0x%x dropped " - "due to ingress filter\n", if_name(GIF2IFP(sc)), - (u_int32_t)ntohl(sin.sin_addr.s_addr)); -#endif - if (rt) - RTFREE_LOCKED(rt); - return 0; - } - RTFREE_LOCKED(rt); + KMOD_IPSTAT_INC(ips_nogif); } - - return 32 * 2; + return (IPPROTO_DONE); } /* @@ -433,39 +168,51 @@ gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) * matched the physical addr family. see gif_encapcheck(). */ int -gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg) +in_gif_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { - struct ip ip; + const struct ip *ip; struct gif_softc *sc; - struct ifnet *ifp; + int ret; /* sanity check done in caller */ sc = (struct gif_softc *)arg; + GIF_RLOCK_ASSERT(sc); - /* LINTED const cast */ - m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); - ifp = ((m->m_flags & M_PKTHDR) != 0) ? 
m->m_pkthdr.rcvif : NULL; + /* check for address match */ + ip = mtod(m, const struct ip *); + if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr) + return (0); + ret = 32; + if (sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) { + if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0) + return (0); + } else + ret += 32; - return gif_validate4(&ip, sc, ifp); -} + /* ingress filters on outer source */ + if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) { + struct nhop4_basic nh4; + struct in_addr dst; -int -in_gif_attach(struct gif_softc *sc) -{ - sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck, - &in_gif_protosw, sc); - if (sc->encap_cookie4 == NULL) - return EEXIST; - return 0; + dst = ip->ip_src; + + if (fib4_lookup_nh_basic(sc->gif_fibnum, dst, 0, 0, &nh4) != 0) + return (0); + + if (nh4.nh_ifp != m->m_pkthdr.rcvif) + return (0); + } + return (ret); } int -in_gif_detach(struct gif_softc *sc) +in_gif_attach(struct gif_softc *sc) { - int error; - error = encap_detach(sc->encap_cookie4); - if (error == 0) - sc->encap_cookie4 = NULL; - return error; + KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL")); + sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck, + &in_gif_protosw, sc); + if (sc->gif_ecookie == NULL) + return (EEXIST); + return (0); } diff --git a/freebsd/sys/netinet/in_kdtrace.h b/freebsd/sys/netinet/in_kdtrace.h new file mode 100644 index 00000000..a36991ef --- /dev/null +++ b/freebsd/sys/netinet/in_kdtrace.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2013 Mark Johnston + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
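The in_kdtrace.h header added next centralizes the SDT providers, so a probe declared here as SDT_PROBE_DECLARE(tcp, , , receive) surfaces to DTrace as tcp:::receive. A call-site sketch in the shape used by tcp_input(), where tp, m and th are the usual tcpcb, mbuf and tcphdr pointers (illustrative only):

	/* Compiles to nothing unless the kernel is built with KDTRACE_HOOKS. */
	TCP_PROBE5(receive, NULL, tp, m, tp, th);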
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_IN_KDTRACE_H_ +#define _SYS_IN_KDTRACE_H_ + +#define IP_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ + SDT_PROBE6(ip, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) +#define UDP_PROBE(probe, arg0, arg1, arg2, arg3, arg4) \ + SDT_PROBE5(udp, , , probe, arg0, arg1, arg2, arg3, arg4) +#define TCP_PROBE1(probe, arg0) \ + SDT_PROBE1(tcp, , , probe, arg0) +#define TCP_PROBE2(probe, arg0, arg1) \ + SDT_PROBE2(tcp, , , probe, arg0, arg1) +#define TCP_PROBE3(probe, arg0, arg1, arg2) \ + SDT_PROBE3(tcp, , , probe, arg0, arg1, arg2) +#define TCP_PROBE4(probe, arg0, arg1, arg2, arg3) \ + SDT_PROBE4(tcp, , , probe, arg0, arg1, arg2, arg3) +#define TCP_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ + SDT_PROBE5(tcp, , , probe, arg0, arg1, arg2, arg3, arg4) +#define TCP_PROBE6(probe, arg0, arg1, arg2, arg3, arg4, arg5) \ + SDT_PROBE6(tcp, , , probe, arg0, arg1, arg2, arg3, arg4, arg5) + +SDT_PROVIDER_DECLARE(ip); +SDT_PROVIDER_DECLARE(tcp); +SDT_PROVIDER_DECLARE(udp); + +SDT_PROBE_DECLARE(ip, , , receive); +SDT_PROBE_DECLARE(ip, , , send); + +SDT_PROBE_DECLARE(tcp, , , accept__established); +SDT_PROBE_DECLARE(tcp, , , accept__refused); +SDT_PROBE_DECLARE(tcp, , , connect__established); +SDT_PROBE_DECLARE(tcp, , , connect__refused); +SDT_PROBE_DECLARE(tcp, , , connect__request); +SDT_PROBE_DECLARE(tcp, , , receive); +SDT_PROBE_DECLARE(tcp, , , send); +SDT_PROBE_DECLARE(tcp, , , siftr); +SDT_PROBE_DECLARE(tcp, , , state__change); +SDT_PROBE_DECLARE(tcp, , , debug__input); +SDT_PROBE_DECLARE(tcp, , , debug__output); +SDT_PROBE_DECLARE(tcp, , , debug__user); +SDT_PROBE_DECLARE(tcp, , , debug__drop); + +SDT_PROBE_DECLARE(udp, , , receive); +SDT_PROBE_DECLARE(udp, , , send); + +#endif diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c index 4112046c..3d68718e 100644 --- a/freebsd/sys/netinet/in_mcast.c +++ b/freebsd/sys/netinet/in_mcast.c @@ -40,23 +40,28 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -148,6 +153,8 @@ static void inm_purge(struct in_multi *); static void inm_reap(struct in_multi *); static struct ip_moptions * inp_findmoptions(struct inpcb *); +static void inp_freemoptions_internal(struct ip_moptions *); +static void inp_gcmoptions(void *, int); static int inp_get_source_filters(struct inpcb *, struct sockopt *); static int inp_join_group(struct inpcb *, struct sockopt *); static int inp_leave_group(struct inpcb *, struct sockopt *); @@ -164,25 +171,26 @@ static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, - CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxgrpsrc, 0, + CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0, "Max source filters per group"); -TUNABLE_ULONG("net.inet.ip.mcast.maxgrpsrc", &in_mcast_maxgrpsrc); static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, - CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxsocksrc, 0, + CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0, "Max source filters per socket"); -TUNABLE_ULONG("net.inet.ip.mcast.maxsocksrc", &in_mcast_maxsocksrc); int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; -SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN, +SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, 
CTLFLAG_RWTUN, &in_mcast_loop, 0, "Loopback multicast datagrams by default"); -TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop); static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, "Per-interface stack-wide source filters"); +static STAILQ_HEAD(, ip_moptions) imo_gc_list = + STAILQ_HEAD_INITIALIZER(imo_gc_list); +static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL); + #ifdef KTR /* * Inline function which wraps assertions for a valid ifp. @@ -221,6 +229,49 @@ imf_init(struct in_mfilter *imf, const int st0, const int st1) imf->imf_st[1] = st1; } +/* + * Function for looking up an in_multi record for an IPv4 multicast address + * on a given interface. ifp must be valid. If no record found, return NULL. + * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. + */ +struct in_multi * +inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) +{ + struct ifmultiaddr *ifma; + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_LOCK_ASSERT(ifp); + + inm = NULL; + TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { + if (ifma->ifma_addr->sa_family == AF_INET) { + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_addr.s_addr == ina.s_addr) + break; + inm = NULL; + } + } + return (inm); +} + +/* + * Wrapper for inm_lookup_locked(). + * The IF_ADDR_LOCK will be taken on ifp and released on return. + */ +struct in_multi * +inm_lookup(struct ifnet *ifp, const struct in_addr ina) +{ + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_RLOCK(ifp); + inm = inm_lookup_locked(ifp, ina); + IF_ADDR_RUNLOCK(ifp); + + return (inm); +} + /* * Resize the ip_moptions vector to the next power-of-two minus 1. * May be called with locks held; do not sleep. @@ -467,8 +518,8 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, */ inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); if (inm == NULL) { - if_delmulti_ifma(ifma); IF_ADDR_WUNLOCK(ifp); + if_delmulti_ifma(ifma); return (ENOMEM); } inm->inm_addr = *group; @@ -477,12 +528,7 @@ in_getmulti(struct ifnet *ifp, const struct in_addr *group, inm->inm_ifma = ifma; inm->inm_refcount = 1; inm->inm_state = IGMP_NOT_MEMBER; - - /* - * Pending state-changes per group are subject to a bounds check. - */ - IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); - + mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; RB_INIT(&inm->inm_srcs); @@ -575,7 +621,7 @@ inm_clear_recorded(struct in_multi *inm) * * Return 0 if the source didn't exist or was already marked as recorded. * Return 1 if the source was marked as recorded by this function. - * Return <0 if any error occured (negated errno code). + * Return <0 if any error occurred (negated errno code). 
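inm_lookup() and inm_lookup_locked() above give IGMP and the socket layer a single way to map an (ifp, group) pair to its in_multi. A sketch honoring the documented locking contract, using the INADDR_ALLHOSTS_GROUP constant from the in.h hunk earlier in this patch:

	struct in_multi *inm;
	struct in_addr grp;

	grp.s_addr = htonl(INADDR_ALLHOSTS_GROUP);	/* 224.0.0.1 */
	IN_MULTI_LOCK();
	inm = inm_lookup(ifp, grp);	/* takes and drops the IF_ADDR lock */
	IN_MULTI_UNLOCK();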
*/ int inm_record_source(struct in_multi *inm, const in_addr_t naddr) @@ -1177,11 +1223,8 @@ out_inm_release: int in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) { - struct ifnet *ifp; int error; - ifp = inm->inm_ifp; - IN_MULTI_LOCK(); error = in_leavegroup_locked(inm, imf); IN_MULTI_UNLOCK(); @@ -1238,7 +1281,9 @@ in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + CURVNET_SET(inm->inm_ifp->if_vnet); error = igmp_change_state(inm); + CURVNET_RESTORE(); if (error) CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); @@ -1526,17 +1571,29 @@ inp_findmoptions(struct inpcb *inp) } /* - * Discard the IP multicast options (and source filters). + * Discard the IP multicast options (and source filters). To minimize + * the amount of work done while holding locks such as the INP's + * pcbinfo lock (which is used in the receive path), the free + * operation is performed asynchronously in a separate task. * * SMPng: NOTE: assumes INP write lock is held. */ void inp_freemoptions(struct ip_moptions *imo) { - struct in_mfilter *imf; - size_t idx, nmships; KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); + IN_MULTI_LOCK(); + STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link); + IN_MULTI_UNLOCK(); + taskqueue_enqueue(taskqueue_thread, &imo_gc_task); +} + +static void +inp_freemoptions_internal(struct ip_moptions *imo) +{ + struct in_mfilter *imf; + size_t idx, nmships; nmships = imo->imo_num_memberships; for (idx = 0; idx < nmships; ++idx) { @@ -1554,6 +1611,22 @@ inp_freemoptions(struct ip_moptions *imo) free(imo, M_IPMOPTS); } +static void +inp_gcmoptions(void *context, int pending) +{ + struct ip_moptions *imo; + + IN_MULTI_LOCK(); + while (!STAILQ_EMPTY(&imo_gc_list)) { + imo = STAILQ_FIRST(&imo_gc_list); + STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link); + IN_MULTI_UNLOCK(); + inp_freemoptions_internal(imo); + IN_MULTI_LOCK(); + } + IN_MULTI_UNLOCK(); +} + /* * Atomically get source filters on a socket for an IPv4 multicast group. * Called with INP lock held; returns with lock released. 
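inp_get_source_filters(), whose hunk follows, services the userland getsourcefilter() API declared in the in.h changes above. The usual two-pass call pattern from an application, where s, ifindex and grp are assumed to be an open socket, an interface index and the group's sockaddr_in:

	uint32_t fmode, nsrcs = 0;

	/* First pass with nsrcs == 0 only reports the mode and source count. */
	if (getsourcefilter(s, ifindex, (struct sockaddr *)&grp, sizeof(grp),
	    &fmode, &nsrcs, NULL) == 0 && nsrcs > 0) {
		/* allocate nsrcs struct sockaddr_storage slots and call again */
	}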
@@ -1680,6 +1753,7 @@ inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) int inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) { + struct rm_priotracker in_ifa_tracker; struct ip_mreqn mreqn; struct ip_moptions *imo; struct ifnet *ifp; @@ -1719,7 +1793,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) mreqn.imr_address = imo->imo_multicast_addr; } else if (ifp != NULL) { mreqn.imr_ifindex = ifp->if_index; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); if (ia != NULL) { mreqn.imr_address = IA_SIN(ia)->sin_addr; @@ -1738,7 +1812,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) break; case IP_MULTICAST_TTL: - if (imo == 0) + if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_TTL; else optval = coptval = imo->imo_multicast_ttl; @@ -1750,7 +1824,7 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) break; case IP_MULTICAST_LOOP: - if (imo == 0) + if (imo == NULL) optval = coptval = IP_DEFAULT_MULTICAST_LOOP; else optval = coptval = imo->imo_multicast_loop; @@ -1810,7 +1884,10 @@ static struct ifnet * inp_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in *gsin, const struct in_addr ina) { + struct rm_priotracker in_ifa_tracker; struct ifnet *ifp; + struct nhop4_basic nh4; + uint32_t fibnum; KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), @@ -1820,21 +1897,15 @@ inp_lookup_mcast_ifp(const struct inpcb *inp, if (!in_nullhost(ina)) { INADDR_TO_IFP(ina, ifp); } else { - struct route ro; - - ro.ro_rt = NULL; - memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); - in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - KASSERT(ifp != NULL, ("%s: null ifp", __func__)); - RTFREE(ro.ro_rt); - } else { + fibnum = inp ? 
inp->inp_inc.inc_fibnum : 0; + if (fib4_lookup_nh_basic(fibnum, gsin->sin_addr, 0, 0, &nh4)==0) + ifp = nh4.nh_ifp; + else { struct in_ifaddr *ia; struct ifnet *mifp; mifp = NULL; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { mifp = ia->ia_ifp; if (!(mifp->if_flags & IFF_LOOPBACK) && @@ -1843,7 +1914,7 @@ inp_lookup_mcast_ifp(const struct inpcb *inp, break; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } @@ -2855,7 +2926,7 @@ sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) return (retval); } -#ifdef KTR +#if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3) static const char *inm_modestrs[] = { "un", "in", "ex" }; @@ -2910,7 +2981,7 @@ inm_print(const struct in_multi *inm) inm->inm_timer, inm_state_str(inm->inm_state), inm->inm_refcount, - inm->inm_scq.ifq_len); + inm->inm_scq.mq_len); printf("igi %p nsrc %lu sctimer %u scrv %u\n", inm->inm_igi, inm->inm_nsrc, @@ -2927,7 +2998,7 @@ inm_print(const struct in_multi *inm) printf("%s: --- end inm %p ---\n", __func__, inm); } -#else /* !KTR */ +#else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */ void inm_print(const struct in_multi *inm) @@ -2935,6 +3006,6 @@ inm_print(const struct in_multi *inm) } -#endif /* KTR */ +#endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */ RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c index b93abadf..f8790938 100644 --- a/freebsd/sys/netinet/in_pcb.c +++ b/freebsd/sys/netinet/in_pcb.c @@ -49,14 +49,18 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#include #include #include #include +#include #include #include +#include #include #include #include @@ -73,8 +77,11 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include +#include #include +#include #include #if defined(INET) || defined(INET6) @@ -150,11 +157,7 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; -#ifdef VIMAGE - error = vnet_sysctl_handle_int(oidp, arg1, arg2, req); -#else error = sysctl_handle_int(oidp, arg1, arg2, req); -#endif if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); @@ -171,38 +174,42 @@ sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, - &sysctl_net_ipport_check, "I", ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, - CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 
+SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, + CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, + &VNET_NAME(ipport_reservedhigh), 0, ""); +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequential one"); -SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequential port " "allocation before switching to a random one"); -#endif +#endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. @@ -225,6 +232,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument?
*/ + INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif @@ -242,6 +250,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, NULL, NULL, inpcbzone_init, inpcbzone_fini, UMA_ALIGN_PTR, inpcbzone_flags); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); + uma_zone_set_warning(pcbinfo->ipi_zone, + "kern.ipc.maxsockets limit reached"); } /* @@ -261,6 +271,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); + INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -275,7 +286,14 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) struct inpcb *inp; int error; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) @@ -307,6 +325,8 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif + INP_WLOCK(inp); + INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; @@ -314,9 +334,9 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ + INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -338,8 +358,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); - anonport = inp->inp_lport == 0 && (nam == NULL || - ((struct sockaddr_in *)nam)->sin_port == 0); + anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) @@ -355,6 +374,9 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) } #endif +/* + * Select a local port (number) to use. + */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, @@ -395,13 +417,14 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, lastport = &pcbinfo->ipi_lastport; } /* - * For UDP, use random port allocation as long as the user + * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && - (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) + (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || + pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; @@ -411,8 +434,8 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, */ if (first == last) dorandom = 0; - /* Make sure to not include UDP packets in the count. */ - if (pcbinfo != &V_udbinfo) + /* Make sure to not include UDP(-Lite) packets in the count. 
*/ + if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down @@ -467,7 +490,7 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; -#endif +#endif *lportp = lport; return (0); @@ -491,6 +514,38 @@ inp_so_options(const struct inpcb *inp) } #endif /* INET || INET6 */ +/* + * Check if a new BINDMULTI socket is allowed to be created. + * + * ni points to the new inp. + * oi points to the existing inp. + * + * This checks whether the existing inp also has BINDMULTI and + * whether the credentials match. + */ +int +in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) +{ + /* Check permissions match */ +#ifndef __rtems__ + if ((ni->inp_flags2 & INP_BINDMULTI) && + (ni->inp_cred->cr_uid != + oi->inp_cred->cr_uid)) + return (0); +#endif /* __rtems__ */ + + /* Check the existing inp has BINDMULTI set */ + if ((ni->inp_flags2 & INP_BINDMULTI) && + ((oi->inp_flags2 & INP_BINDMULTI) == 0)) + return (0); + + /* + * We're okay - either INP_BINDMULTI isn't set on ni, or + * it is and it matches the checks. + */ + return (1); +} + #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation @@ -594,6 +649,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, * This entire block sorely needs a rewrite. */ if (t && + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && @@ -607,6 +663,15 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 0) #endif /* __rtems__ */ return (EADDRINUSE); + + /* + * If the socket is a BINDMULTI socket, then + * the credentials need to match and the + * original socket also has to have been bound + * with BINDMULTI. + */ + if (t && (! in_pcbbind_check_bindmulti(inp, t))) + return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); @@ -621,7 +686,9 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, if (tw == NULL || (reuseport & tw->tw_so_options) == 0) return (EADDRINUSE); - } else if (t && (reuseport & inp_so_options(t)) == 0) { + } else if (t && + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -631,6 +698,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); + if (t && (! in_pcbbind_check_bindmulti(inp, t))) + return (EADDRINUSE); } } } @@ -706,7 +775,7 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well.
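/*
 * A standalone userspace sketch (stub types, not the kernel's inpcbinfo)
 * of the UDP(-Lite) exclusion in in_pcb_lport() above: for two distinct
 * tables udb and ulitecb, "p != &udb || p != &ulitecb" is true for every
 * p, so only the conjunction actually skips both UDP tables when counting
 * TCP-style port allocations.
 */
#include <assert.h>
#include <stdio.h>

struct pcbinfo_stub { int id; };

int
main(void)
{
	struct pcbinfo_stub udb = { 1 }, ulitecb = { 2 }, tcb = { 3 };
	struct pcbinfo_stub *tables[] = { &udb, &ulitecb, &tcb };
	unsigned tcpallocs = 0;

	for (int i = 0; i < 3; i++) {
		struct pcbinfo_stub *p = tables[i];

		/* A disjunction of two inequality tests never fails... */
		assert(p != &udb || p != &ulitecb);

		/* ...while the conjunction counts only the TCP table. */
		if (p != &udb && p != &ulitecb)
			tcpallocs++;
	}
	printf("counted %u of 3 tables\n", tcpallocs);	/* prints 1 */
	return (0);
}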
*/ -static int +int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { @@ -754,9 +823,11 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct in_ifaddr *ia; struct ifnet *ifp; - ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); + ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, + inp->inp_socket->so_fibnum)); if (ia == NULL) - ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0)); + ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, + inp->inp_socket->so_fibnum)); if (ia == NULL) { error = ENETUNREACH; goto done; @@ -871,9 +942,11 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; - ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); + ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), + inp->inp_socket->so_fibnum)); if (ia == NULL) - ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0)); + ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, + inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); @@ -946,6 +1019,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { + struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; @@ -982,20 +1056,20 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&TAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { @@ -1013,7 +1087,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || @@ -1027,7 +1101,7 @@ in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, laddr = ia->ia_addr.sin_addr; error = 0; } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) @@ -1064,7 +1138,7 @@ in_pcbdisconnect(struct inpcb *inp) inp->inp_fport = 0; in_pcbrehash(inp); } -#endif +#endif /* INET */ /* * in_pcbdetach() is responsible for disassociating a socket from an inpcb. @@ -1160,8 +1234,17 @@ in_pcbrele_wlocked(struct inpcb *inp) INP_WLOCK_ASSERT(inp); - if (refcount_release(&inp->inp_refcount) == 0) + if (refcount_release(&inp->inp_refcount) == 0) { + /* + * If the inpcb has been freed, let the caller know, even if + * this isn't the last reference.
+ */ + if (inp->inp_flags2 & INP_FREED) { + INP_WUNLOCK(inp); + return (1); + } return (0); + } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); @@ -1197,16 +1280,24 @@ in_pcbfree(struct inpcb *inp) KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_LOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ #ifdef IPSEC if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); -#endif /* IPSEC */ +#endif + INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); + INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); @@ -1220,6 +1311,13 @@ in_pcbfree(struct inpcb *inp) if (inp->inp_moptions != NULL) inp_freemoptions(inp->inp_moptions); #endif + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } + if (inp->inp_route.ro_lle) + LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ + inp->inp_vflag = 0; inp->inp_flags2 |= INP_FREED; crfree(inp->inp_cred); @@ -1363,7 +1461,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) struct ip_moptions *imo; int i, gap; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; @@ -1393,7 +1491,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_WUNLOCK(pcbinfo); } /* @@ -1565,6 +1663,83 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, goto found; } +#ifdef RSS + /* + * For incoming connections, we may wish to do a wildcard + * match for an RSS-local socket. + */ + if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + struct inpcbhead *head; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + + head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, + lport, 0, pcbgroup->ipg_hashmask)]; + LIST_FOREACH(inp, head, inp_pcbgrouphash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + goto found; + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + + inp = jail_wild; + if (inp == NULL) + inp = local_exact; + if (inp == NULL) + inp = local_wild; +#ifdef INET6 + if (inp == NULL) + inp = local_wild_mapped; +#endif + if (inp != NULL) + goto found; + } +#endif + /* * Then look for a wildcard match, if requested. 
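/*
 * A self-contained sketch of the socket selection order used in the RSS
 * wildcard pass above, with toy types in place of inpcbs: a jailed exact
 * match wins outright, then jailed wildcard, then non-jailed exact, then
 * non-jailed wildcard. The candidate names are invented for the demo.
 */
#include <stddef.h>
#include <stdio.h>

struct sock_stub {
	const char *name;
	int injail;		/* prison_flag() in the real code */
	int exact_laddr;	/* local address matches, vs. INADDR_ANY */
};

static const struct sock_stub *
select_wildcard(const struct sock_stub *cand, size_t n)
{
	const struct sock_stub *jail_wild = NULL, *local_exact = NULL;
	const struct sock_stub *local_wild = NULL;

	for (size_t i = 0; i < n; i++) {
		const struct sock_stub *s = &cand[i];

		if (s->injail && s->exact_laddr)
			return (s);	/* best possible match */
		if (s->injail)
			jail_wild = s;
		else if (s->exact_laddr)
			local_exact = s;
		else
			local_wild = s;
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	return (local_wild);
}

int
main(void)
{
	const struct sock_stub cand[] = {
		{ "host-wild", 0, 0 },
		{ "host-exact", 0, 1 },
		{ "jail-wild", 1, 0 },
	};

	/* The jailed wildcard beats the non-jailed exact match. */
	printf("chose %s\n", select_wildcard(cand, 3)->name);
	return (0);
}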
*/ @@ -1596,11 +1771,6 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, inp->inp_lport != lport) continue; - /* XXX inp locking */ - if (ifp && ifp->if_type == IFT_FAITH && - (inp->inp_flags & INP_FAITH) == 0) - continue; - injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, @@ -1622,7 +1792,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else -#endif /* INET6 */ +#endif if (injail) jail_wild = inp; else @@ -1637,7 +1807,7 @@ in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; -#endif /* defined(INET6) */ +#endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ @@ -1741,11 +1911,6 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, inp->inp_lport != lport) continue; - /* XXX inp locking */ - if (ifp && ifp->if_type == IFT_FAITH && - (inp->inp_flags & INP_FAITH) == 0) - continue; - injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, @@ -1767,7 +1932,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else -#endif /* INET6 */ +#endif if (injail) jail_wild = inp; else @@ -1783,7 +1948,7 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); -#endif /* defined(INET6) */ +#endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); @@ -1832,7 +1997,7 @@ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { -#if defined(PCBGROUP) +#if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif @@ -1841,7 +2006,17 @@ in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); -#if defined(PCBGROUP) + /* + * When not using RSS, use connection groups in preference to the + * reservation table when looking up 4-tuples. When using RSS, just + * use the reservation table, due to the cost of the Toeplitz hash + * in software. + * + * XXXRW: This policy belongs in the pcbgroup code, as in principle + * we could be doing RSS with a non-Toeplitz hash that is affordable + * in software. + */ +#if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); @@ -1868,16 +2043,27 @@ in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP - if (in_pcbgroup_enabled(pcbinfo)) { + /* + * If we can use a hardware-generated hash to look up the connection + * group, use that connection group to find the inpcb. Otherwise + * fall back on a software hash -- or the reservation table if we're + * using RSS. + * + * XXXRW: As above, that policy belongs in the pcbgroup code. 
+ */ + if (in_pcbgroup_enabled(pcbinfo) && + !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); +#endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, @@ -1905,9 +2091,9 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) #ifdef INET6 if (inp->inp_vflag & INP_IPV6) - hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else -#endif /* INET6 */ +#endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, @@ -1992,9 +2178,9 @@ in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) #ifdef INET6 if (inp->inp_vflag & INP_IPV6) - hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else -#endif /* INET6 */ +#endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, @@ -2026,8 +2212,16 @@ in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + INP_WLOCK_ASSERT(inp); + INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { @@ -2050,6 +2244,25 @@ in_pcbremlists(struct inpcb *inp) #endif } +/* + * Check for alternatives when higher level complains + * about service problems. For now, invalidate cached + * routing information. If the route was created dynamically + * (by a redirect), time to try a default gateway again. + */ +void +in_losing(struct inpcb *inp) +{ + + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } + if (inp->inp_route.ro_lle) + LLE_FREE(inp->inp_route.ro_lle); /* zeros ro_lle */ + return; +} + /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. @@ -2115,7 +2328,7 @@ ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ - callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); + callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); @@ -2172,13 +2385,13 @@ inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * @@ -2262,14 +2475,13 @@ db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); - } else { + } else #endif + { /* IPv4. 
*/ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); -#ifdef INET6 } -#endif db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); @@ -2320,10 +2532,6 @@ db_print_inpflags(int inp_flags) db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } - if (inp_flags & INP_FAITH) { - db_printf("%sINP_FAITH", comma ? ", " : ""); - comma = 1; - } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; @@ -2486,4 +2694,4 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb) db_print_inpcb(inp, "inpcb", 0); } -#endif +#endif /* DDB */ diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h index a78c6ab6..ea47d6b2 100644 --- a/freebsd/sys/netinet/in_pcb.h +++ b/freebsd/sys/netinet/in_pcb.h @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef _KERNEL #include @@ -79,6 +80,8 @@ struct in_addr_4in6 { /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. + * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport, + * lport, faddr to generate hash, so these fields shouldn't be moved. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ @@ -94,6 +97,7 @@ struct in_endpoints { struct in_addr_4in6 ie46_local; struct in6_addr ie6_local; } ie_dependladdr; + u_int32_t ie6_zoneid; /* scope zone id */ }; #define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 #define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 @@ -117,34 +121,47 @@ struct in_conninfo { */ #define INC_ISIPV6 0x01 -#define inc_isipv6 inc_flags /* temp compatability */ +#define inc_isipv6 inc_flags /* temp compatibility */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr +#define inc6_zoneid inc_ie.ie6_zoneid struct icmp6_filter; /*- - * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 - * and IPv6 sockets. In the case of TCP, further per-connection state is + * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and + * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A - * few fields also require the global pcbinfo lock for the inpcb to be held, - * when modified, such as the global connection lists and hashes, as well as - * binding information (which affects which hash a connection is on). This - * model means that connections can be looked up without holding the - * per-connection lock, which is important for performance when attempting to - * find the connection for a packet given its IP and port tuple. Writing to - * these fields that write locks be held on both the inpcb and global locks. + * few fields are protected by multiple locks as indicated in the locking notes + * below. For these fields, all of the listed locks must be write-locked for + * any modifications. However, these fields can be safely read while any one of + * the listed locks are read-locked. This model can permit greater concurrency + * for read operations. For example, connections can be looked up while only + * holding a read lock on the global pcblist lock. This is important for + * performance when attempting to find the connection for a packet given its IP + * and port tuple. 
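/*
 * As the comment goes on to note, inp_list inverts the usual
 * reader/writer rule. A minimal userspace analogy of that discipline,
 * using POSIX rwlocks and invented names (the kernel's ipi_list_lock is
 * likewise an rwlock): frequent adders and removers take the list lock
 * shared plus a finer-grained lock, while the rare full-list iteration
 * takes it exclusive and therefore walks a stable snapshot.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER;
static int nentries;

static void
entry_insert(void)
{
	/* Common path: shared on the list, exclusive on the entry. */
	pthread_rwlock_rdlock(&list_lock);
	pthread_mutex_lock(&entry_lock);
	nentries++;
	pthread_mutex_unlock(&entry_lock);
	pthread_rwlock_unlock(&list_lock);
}

static void
list_walk(void)
{
	/* Rare path: exclusive on the list shuts out all modifiers. */
	pthread_rwlock_wrlock(&list_lock);
	printf("stable snapshot: %d entries\n", nentries);
	pthread_rwlock_unlock(&list_lock);
}

int
main(void)
{
	entry_insert();
	entry_insert();
	list_walk();
	return (0);
}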
+ * + * One noteworthy exception is that the global pcbinfo lock follows a different + * set of rules in relation to the inp_list field. Rather than being + * write-locked for modifications and read-locked for list iterations, it must + * be read-locked during modifications and write-locked during list iterations. + * This ensures that the relatively rare global list iterations safely walk a + * stable snapshot of connections while allowing more common list modifications + * to safely grab the pcblist lock just while adding or removing a connection + * from the global list. * * Key: * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb + * (l) - Protected by the pcblist lock for the inpcb + * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * @@ -159,15 +176,21 @@ struct icmp6_filter; * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). + * + * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock + * read-lock usage during modification, this model can be applied to other + * protocols (especially SCTP). */ struct inpcb { - LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ - LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ + LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ + /* (p[w]) for list iteration */ + /* (p[r]/l) for addition/removal */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -179,12 +202,14 @@ struct inpcb { u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ u_int inp_refcount; /* (i) refcount */ - void *inp_pspare[5]; /* (x) route caching / general use */ - u_int inp_ispare[6]; /* (x) route caching / user cookie / + void *inp_pspare[5]; /* (x) packet pacing / general use */ + uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ + uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */ + u_int inp_ispare[4]; /* (x) packet pacing / user cookie / * general use */ /* Local and foreign ports, local and foreign addr. */ - struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ + struct in_conninfo inp_inc; /* (i) list for PCB's local port */ /* MAC and IPSEC policy information. 
*/ struct label *inp_label; /* (i) MAC label */ @@ -209,13 +234,19 @@ struct inpcb { int inp6_cksum; short inp6_hops; } inp_depend6; - LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ - struct inpcbport *inp_phd; /* (i/p) head of this list */ + LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ + struct inpcbport *inp_phd; /* (i/h) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ - struct rtentry *inp_rt; /* cached L3 information */ struct rwlock inp_lock; + rt_gen_t inp_rt_cookie; /* generation for route entry */ + union { /* cached L3 information */ + struct route inpu_route; + struct route_in6 inpu_route6; + } inp_rtu; +#define inp_route inp_rtu.inpu_route +#define inp_route6 inp_rtu.inpu_route6 }; #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport @@ -227,6 +258,7 @@ struct inpcb { #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr +#define in6p_zoneid inp_inc.inc6_zoneid #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_flowinfo inp_flow #define in6p_options inp_depend6.inp6_options @@ -274,37 +306,46 @@ struct inpcbport { * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, - * the former covering mutable global fields (such as the global pcb list), - * and the latter covering the hashed lookup tables. The lock order is: + * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and + * ipi_list_lock: + * - ipi_lock covering the global pcb list stability during loop iteration, + * - ipi_hash_lock covering the hashed lookup tables, + * - ipi_list_lock covering mutable global fields (such as the global + * pcb list) + * + * The lock order is: * - * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} + * ipi_lock (before) + * inpcb locks (before) + * ipi_list locks (before) + * {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock + * (l) Locked by ipi_list_lock * (h) Read using either ipi_hash_lock or inpcb lock; write requires both * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting global inpcb list, inpcb count, etc. + * Global lock protecting full inpcb list traversal */ struct rwlock ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g) */ - u_int ipi_count; /* (g) */ + struct inpcbhead *ipi_listhead; /* (g/l) */ + u_int ipi_count; /* (l) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ - u_quad_t ipi_gencnt; /* (g) */ + u_quad_t ipi_gencnt; /* (l) */ /* * Fields associated with port lookup and allocation. @@ -362,6 +403,11 @@ struct inpcbinfo { * general use 2 */ void *ipi_pspare[2]; + + /* + * Global lock protecting global inpcb list, inpcb count, etc. 
+ */ + struct rwlock ipi_list_lock; }; #ifdef _KERNEL @@ -454,6 +500,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) #define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) +#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) #define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) @@ -461,6 +508,25 @@ short inp_so_options(const struct inpcb *inp); #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_LIST_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) +#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) +#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) +#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) +#define INP_LIST_LOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) +#define INP_LIST_RLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) +#define INP_LIST_WLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) +#define INP_LIST_UNLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) + #define INP_HASH_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) #define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) @@ -485,6 +551,7 @@ short inp_so_options(const struct inpcb *inp); (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) +#define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* * Flags for inp_vflags -- historically version flags only @@ -505,7 +572,7 @@ short inp_so_options(const struct inpcb *inp); #define INP_ANONPORT 0x00000040 /* port chosen for user */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ -#define INP_FAITH 0x00000200 /* accept FAITH'ed connections */ + /* 0x000200 unused: was INP_FAITH */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ @@ -524,8 +591,8 @@ short inp_so_options(const struct inpcb *inp); #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ -#define INP_SW_FLOWID 0x10000000 /* software generated flow id */ -#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */ +#define INP_RESERVED_0 0x10000000 /* reserved field */ +#define INP_RESERVED_1 0x20000000 /* reserved field */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ @@ -545,6 +612,10 @@ short inp_so_options(const struct inpcb *inp); #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ #define INP_FREED 0x00000010 /* inp itself is not valid */ 
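/*
 * A small standalone demonstration of the INP_PCBHASH folding defined
 * above (the macro is copied locally; the address, ports and table size
 * are arbitrary examples). The mask must be a power of two minus one,
 * as produced by hashinit() in the kernel.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define INP_PCBHASH(faddr, lport, fport, mask) \
	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))

int
main(void)
{
	uint32_t faddr = inet_addr("192.0.2.1");	/* network order */
	uint16_t fport = htons(80), lport = htons(12345);
	uint32_t mask = 511;				/* 512 buckets */

	printf("bucket %u of %u\n",
	    (unsigned)INP_PCBHASH(faddr, lport, fport, mask), mask + 1);
	return (0);
}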
#define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ +#define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ +#define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ +#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ +#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ /* * Flags passed to in_pcblookup*() functions. @@ -603,6 +674,9 @@ void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, int, int, char *, uma_init, uma_fini, uint32_t, u_int); +int in_pcbbind_check_bindmulti(const struct inpcb *ni, + const struct inpcb *oi); + struct inpcbgroup * in_pcbgroup_byhash(struct inpcbinfo *, u_int, uint32_t); struct inpcbgroup * @@ -636,6 +710,8 @@ void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); int in_pcbinshash_nopcbgroup(struct inpcb *); +int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, + struct ucred *); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); @@ -653,6 +729,7 @@ void in_pcbrehash_mbuf(struct inpcb *, struct mbuf *); int in_pcbrele(struct inpcb *); int in_pcbrele_rlocked(struct inpcb *); int in_pcbrele_wlocked(struct inpcb *); +void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); int in_getsockaddr(struct socket *so, struct sockaddr **nam); diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c index 1eef2c72..8c3efa4d 100644 --- a/freebsd/sys/netinet/in_proto.c +++ b/freebsd/sys/netinet/in_proto.c @@ -34,7 +34,6 @@ #include __FBSDID("$FreeBSD$"); -#include #include #include #include @@ -45,6 +44,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -60,6 +60,7 @@ __FBSDID("$FreeBSD$"); */ #ifdef INET #include +#include #include #ifdef RADIX_MPATH #include @@ -120,9 +121,6 @@ struct protosw inetsw[] = { .pr_domain = &inetdomain, .pr_protocol = IPPROTO_IP, .pr_init = ip_init, -#ifdef VIMAGE - .pr_destroy = ip_destroy, -#endif .pr_slowtimo = ip_slowtimo, .pr_drain = ip_drain, .pr_usrreqs = &nousrreqs @@ -136,9 +134,6 @@ struct protosw inetsw[] = { .pr_ctlinput = udp_ctlinput, .pr_ctloutput = udp_ctloutput, .pr_init = udp_init, -#ifdef VIMAGE - .pr_destroy = udp_destroy, -#endif .pr_usrreqs = &udp_usrreqs }, { @@ -150,9 +145,6 @@ struct protosw inetsw[] = { .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, .pr_init = tcp_init, -#ifdef VIMAGE - .pr_destroy = tcp_destroy, -#endif .pr_slowtimo = tcp_slowtimo, .pr_drain = tcp_drain, .pr_usrreqs = &tcp_usrreqs @@ -167,9 +159,6 @@ struct protosw inetsw[] = { .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_init = sctp_init, -#ifdef VIMAGE - .pr_destroy = sctp_finish, -#endif .pr_drain = sctp_drain, .pr_usrreqs = &sctp_usrreqs }, @@ -177,7 +166,7 @@ struct protosw inetsw[] = { .pr_type = SOCK_STREAM, .pr_domain = &inetdomain, .pr_protocol = IPPROTO_SCTP, - .pr_flags = PR_WANTRCVD, + .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD, .pr_input = sctp_input, .pr_ctlinput = sctp_ctlinput, .pr_ctloutput = sctp_ctloutput, @@ -185,6 +174,17 @@ struct protosw inetsw[] = { .pr_usrreqs = &sctp_usrreqs }, #endif /* SCTP */ +{ + .pr_type = SOCK_DGRAM, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_UDPLITE, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = udp_input, + .pr_ctlinput = udplite_ctlinput, + 
.pr_ctloutput = udp_ctloutput, + .pr_init = udplite_init, + .pr_usrreqs = &udp_usrreqs +}, { .pr_type = SOCK_RAW, .pr_domain = &inetdomain, @@ -330,9 +330,6 @@ IPPROTOSPACER, .pr_input = rip_input, .pr_ctloutput = rip_ctloutput, .pr_init = rip_init, -#ifdef VIMAGE - .pr_destroy = rip_destroy, -#endif .pr_usrreqs = &rip_usrreqs }, }; @@ -344,7 +341,7 @@ struct domain inetdomain = { .dom_family = AF_INET, .dom_name = "internet", .dom_protosw = inetsw, - .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], + .dom_protoswNPROTOSW = &inetsw[nitems(inetsw)], #ifdef RADIX_MPATH .dom_rtattach = rn4_mpath_inithead, #else @@ -353,8 +350,6 @@ struct domain inetdomain = { #ifdef VIMAGE .dom_rtdetach = in_detachhead, #endif - .dom_rtoffset = 32, - .dom_maxrtkey = sizeof(struct sockaddr_in), .dom_ifattach = in_domifattach, .dom_ifdetach = in_domifdetach }; @@ -382,3 +377,5 @@ SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP"); SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP"); #endif /* IPSEC */ SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); +SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW, 0, + "Accept filters"); diff --git a/freebsd/sys/netinet/in_rmx.c b/freebsd/sys/netinet/in_rmx.c index 939193f6..2062d1d1 100644 --- a/freebsd/sys/netinet/in_rmx.c +++ b/freebsd/sys/netinet/in_rmx.c @@ -38,11 +38,11 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include -#include #include +#include #include +#include #include #include @@ -56,19 +56,16 @@ extern int in_inithead(void **head, int off); extern int in_detachhead(void **head, int off); #endif -#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ - /* * Do what we need to do when inserting a route. */ static struct radix_node * -in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, +in_addroute(void *v_arg, void *n_arg, struct radix_head *head, struct radix_node *treenodes) { struct rtentry *rt = (struct rtentry *)treenodes; struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); - RADIX_NODE_HEAD_WLOCK_ASSERT(head); /* * A little bit of help for both IP output and input: * For host routes, we make sure that RTF_BROADCAST @@ -95,247 +92,20 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; - if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; - - return (rn_addroute(v_arg, n_arg, head, treenodes)); -} - -/* - * This code is the inverse of in_clsroute: on first reference, if we - * were managing the route, stop doing so and set the expiration timer - * back off again. 
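/*
 * A userspace sketch of the destination classification that
 * in_addroute() performs above before setting RTF_BROADCAST or
 * RTF_MULTICAST on host routes (the route and flag machinery itself is
 * elided, and only the limited broadcast is checked; addresses are
 * examples).
 */
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

int
main(void)
{
	struct in_addr dst;

	dst.s_addr = inet_addr("224.0.0.1");
	if (IN_MULTICAST(ntohl(dst.s_addr)))
		printf("224.0.0.1: flag RTF_MULTICAST\n");

	dst.s_addr = htonl(INADDR_BROADCAST);
	if (ntohl(dst.s_addr) == INADDR_BROADCAST)
		printf("255.255.255.255: flag RTF_BROADCAST\n");
	return (0);
}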
- */ -static struct radix_node * -in_matroute(void *v_arg, struct radix_node_head *head) -{ - struct radix_node *rn = rn_match(v_arg, head); - struct rtentry *rt = (struct rtentry *)rn; - - if (rt) { - RT_LOCK(rt); - if (rt->rt_flags & RTPRF_OURS) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; - } - RT_UNLOCK(rt); - } - return rn; -} - -static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */ -#define V_rtq_reallyold VNET(rtq_reallyold) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, - &VNET_NAME(rtq_reallyold), 0, - "Default expiration time on dynamically learned routes"); - -/* never automatically crank down to less */ -static VNET_DEFINE(int, rtq_minreallyold) = 10; -#define V_rtq_minreallyold VNET(rtq_minreallyold) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, - &VNET_NAME(rtq_minreallyold), 0, - "Minimum time to attempt to hold onto dynamically learned routes"); - -/* 128 cached routes is "too many" */ -static VNET_DEFINE(int, rtq_toomany) = 128; -#define V_rtq_toomany VNET(rtq_toomany) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, - &VNET_NAME(rtq_toomany), 0, - "Upper limit on dynamically learned routes"); - -/* - * On last reference drop, mark the route as belong to us so that it can be - * timed out. - */ -static void -in_clsroute(struct radix_node *rn, struct radix_node_head *head) -{ - struct rtentry *rt = (struct rtentry *)rn; - - RT_LOCK_ASSERT(rt); - - if (!(rt->rt_flags & RTF_UP)) - return; /* prophylactic measures */ - - if (rt->rt_flags & RTPRF_OURS) - return; - - if (!(rt->rt_flags & RTF_DYNAMIC)) - return; - - /* - * If rtq_reallyold is 0, just delete the route without - * waiting for a timeout cycle to kill it. - */ - if (V_rtq_reallyold != 0) { - rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; - } else { - rtexpunge(rt); - } -} - -struct rtqk_arg { - struct radix_node_head *rnh; - int draining; - int killed; - int found; - int updating; - time_t nextstop; -}; - -/* - * Get rid of old routes. When draining, this deletes everything, even when - * the timeout is not expired yet. When updating, this makes sure that - * nothing has a timeout longer than the current value of rtq_reallyold. 
- */ -static int -in_rtqkill(struct radix_node *rn, void *rock) -{ - struct rtqk_arg *ap = rock; - struct rtentry *rt = (struct rtentry *)rn; - int err; - - RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh); - - if (rt->rt_flags & RTPRF_OURS) { - ap->found++; - - if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { - if (rt->rt_refcnt > 0) - panic("rtqkill route really not free"); - - err = in_rtrequest(RTM_DELETE, - (struct sockaddr *)rt_key(rt), - rt->rt_gateway, rt_mask(rt), - rt->rt_flags | RTF_RNH_LOCKED, 0, - rt->rt_fibnum); - if (err) { - log(LOG_WARNING, "in_rtqkill: error %d\n", err); - } else { - ap->killed++; - } - } else { - if (ap->updating && - (rt->rt_rmx.rmx_expire - time_uptime > - V_rtq_reallyold)) { - rt->rt_rmx.rmx_expire = - time_uptime + V_rtq_reallyold; - } - ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); - } - } - - return 0; -} - -#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ -static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT; -static VNET_DEFINE(struct callout, rtq_timer); - -#define V_rtq_timeout VNET(rtq_timeout) -#define V_rtq_timer VNET(rtq_timer) - -static void in_rtqtimo_one(void *rock); - -static void -in_rtqtimo(void *rock) -{ - CURVNET_SET((struct vnet *) rock); - int fibnum; - void *newrock; - struct timeval atv; - - for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { - newrock = rt_tables_get_rnh(fibnum, AF_INET); - if (newrock != NULL) - in_rtqtimo_one(newrock); - } - atv.tv_usec = 0; - atv.tv_sec = V_rtq_timeout; - callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); - CURVNET_RESTORE(); -} + if (rt->rt_ifp != NULL) { -static void -in_rtqtimo_one(void *rock) -{ - struct radix_node_head *rnh = rock; - struct rtqk_arg arg; - static time_t last_adjusted_timeout = 0; - - arg.found = arg.killed = 0; - arg.rnh = rnh; - arg.nextstop = time_uptime + V_rtq_timeout; - arg.draining = arg.updating = 0; - RADIX_NODE_HEAD_LOCK(rnh); - rnh->rnh_walktree(rnh, in_rtqkill, &arg); - RADIX_NODE_HEAD_UNLOCK(rnh); - - /* - * Attempt to be somewhat dynamic about this: - * If there are ``too many'' routes sitting around taking up space, - * then crank down the timeout, and see if we can't make some more - * go away. However, we make sure that we will never adjust more - * than once in rtq_timeout seconds, to keep from cranking down too - * hard. 
- */ - if ((arg.found - arg.killed > V_rtq_toomany) && - (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && - V_rtq_reallyold > V_rtq_minreallyold) { - V_rtq_reallyold = 2 * V_rtq_reallyold / 3; - if (V_rtq_reallyold < V_rtq_minreallyold) { - V_rtq_reallyold = V_rtq_minreallyold; - } - - last_adjusted_timeout = time_uptime; -#ifdef DIAGNOSTIC - log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", - V_rtq_reallyold); -#endif - arg.found = arg.killed = 0; - arg.updating = 1; - RADIX_NODE_HEAD_LOCK(rnh); - rnh->rnh_walktree(rnh, in_rtqkill, &arg); - RADIX_NODE_HEAD_UNLOCK(rnh); - } - -} - -void -in_rtqdrain(void) -{ - VNET_ITERATOR_DECL(vnet_iter); - struct radix_node_head *rnh; - struct rtqk_arg arg; - int fibnum; - - VNET_LIST_RLOCK_NOSLEEP(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - - for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { - rnh = rt_tables_get_rnh(fibnum, AF_INET); - arg.found = arg.killed = 0; - arg.rnh = rnh; - arg.nextstop = 0; - arg.draining = 1; - arg.updating = 0; - RADIX_NODE_HEAD_LOCK(rnh); - rnh->rnh_walktree(rnh, in_rtqkill, &arg); - RADIX_NODE_HEAD_UNLOCK(rnh); - } - CURVNET_RESTORE(); + /* + * Check route MTU: + * inherit interface MTU if not set or + * check if MTU is too large. + */ + if (rt->rt_mtu == 0) { + rt->rt_mtu = rt->rt_ifp->if_mtu; + } else if (rt->rt_mtu > rt->rt_ifp->if_mtu) + rt->rt_mtu = rt->rt_ifp->if_mtu; } - VNET_LIST_RUNLOCK_NOSLEEP(); -} -void -in_setmatchfunc(struct radix_node_head *rnh, int val) -{ - - rnh->rnh_matchaddr = (val != 0) ? rn_match : in_matroute; + return (rn_addroute(v_arg, n_arg, head, treenodes)); } static int _in_rt_was_here; @@ -345,29 +115,16 @@ static int _in_rt_was_here; int in_inithead(void **head, int off) { - struct radix_node_head *rnh; + struct rib_head *rh; - /* XXX MRT - * This can be called from vfs_export.c too in which case 'off' - * will be 0. We know the correct value so just use that and - * return directly if it was 0. - * This is a hack that replaces an even worse hack on a bad hack - * on a bad design. After RELENG_7 this should be fixed but that - * will change the ABI, so for now do it this way. - */ - if (!rn_inithead(head, 32)) - return 0; + rh = rt_table_init(32); + if (rh == NULL) + return (0); - if (off == 0) /* XXX MRT see above */ - return 1; /* only do the rest for a real routing table */ + rh->rnh_addaddr = in_addroute; + *head = (void *)rh; - rnh = *head; - rnh->rnh_addaddr = in_addroute; - in_setmatchfunc(rnh, V_drop_redirect); - rnh->rnh_close = in_clsroute; if (_in_rt_was_here == 0 ) { - callout_init(&V_rtq_timer, CALLOUT_MPSAFE); - callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet); _in_rt_was_here = 1; } return 1; @@ -378,7 +135,7 @@ int in_detachhead(void **head, int off) { - callout_drain(&V_rtq_timer); + rt_table_destroy((struct rib_head *)(*head)); return (1); } #endif @@ -398,62 +155,32 @@ struct in_ifadown_arg { }; static int -in_ifadownkill(struct radix_node *rn, void *xap) +in_ifadownkill(const struct rtentry *rt, void *xap) { struct in_ifadown_arg *ap = xap; - struct rtentry *rt = (struct rtentry *)rn; - RT_LOCK(rt); - if (rt->rt_ifa == ap->ifa && - (ap->del || !(rt->rt_flags & RTF_STATIC))) { - /* - * Aquire a reference so that it can later be freed - * as the refcount would be 0 here in case of at least - * ap->del. - */ - RT_ADDREF(rt); - /* - * Disconnect it from the tree and permit protocols - * to cleanup. - */ - rtexpunge(rt); - /* - * At this point it is an rttrash node, and in case - * the above is the only reference we must free it. 
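/*
 * The MTU rule added to in_addroute() above, restated as a standalone
 * helper with a hypothetical name: a route inherits the interface MTU
 * when none is set and is clamped to it otherwise.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t
route_fixup_mtu(uint32_t rt_mtu, uint32_t if_mtu)
{
	if (rt_mtu == 0 || rt_mtu > if_mtu)
		return (if_mtu);	/* inherit or clamp */
	return (rt_mtu);
}

int
main(void)
{
	printf("%u %u %u\n",
	    route_fixup_mtu(0, 1500),		/* 1500: inherited */
	    route_fixup_mtu(9000, 1500),	/* 1500: clamped */
	    route_fixup_mtu(1200, 1500));	/* 1200: kept */
	return (0);
}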
- * If we do not noone will have a pointer and the - * rtentry will be leaked forever. - * In case someone else holds a reference, we are - * fine as we only decrement the refcount. In that - * case if the other entity calls RT_REMREF, we - * will still be leaking but at least we tried. - */ - RTFREE_LOCKED(rt); + if (rt->rt_ifa != ap->ifa) return (0); - } - RT_UNLOCK(rt); - return 0; + + if ((rt->rt_flags & RTF_STATIC) != 0 && ap->del == 0) + return (0); + + return (1); } -int +void in_ifadown(struct ifaddr *ifa, int delete) { struct in_ifadown_arg arg; - struct radix_node_head *rnh; - int fibnum; - if (ifa->ifa_addr->sa_family != AF_INET) - return 1; + KASSERT(ifa->ifa_addr->sa_family == AF_INET, + ("%s: wrong family", __func__)); - for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { - rnh = rt_tables_get_rnh(fibnum, AF_INET); - arg.ifa = ifa; - arg.del = delete; - RADIX_NODE_HEAD_LOCK(rnh); - rnh->rnh_walktree(rnh, in_ifadownkill, &arg); - RADIX_NODE_HEAD_UNLOCK(rnh); - ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ - } - return 0; + arg.ifa = ifa; + arg.del = delete; + + rt_foreach_fib_walk_del(AF_INET, in_ifadownkill, &arg); + ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } /* @@ -467,25 +194,6 @@ in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) rtalloc_ign_fib(ro, ignflags, fibnum); } -int -in_rtrequest( int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt, - u_int fibnum) -{ - return (rtrequest_fib(req, dst, gateway, netmask, - flags, ret_nrt, fibnum)); -} - -struct rtentry * -in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) -{ - return (rtalloc1_fib(dst, report, ignflags, fibnum)); -} - void in_rtredirect(struct sockaddr *dst, struct sockaddr *gateway, @@ -497,16 +205,3 @@ in_rtredirect(struct sockaddr *dst, rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); } -void -in_rtalloc(struct route *ro, u_int fibnum) -{ - rtalloc_ign_fib(ro, 0UL, fibnum); -} - -#if 0 -int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); -int in_rtioctl(u_long, caddr_t, u_int); -int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); -#endif - - diff --git a/freebsd/sys/netinet/in_rss.h b/freebsd/sys/netinet/in_rss.h new file mode 100644 index 00000000..fd300ac5 --- /dev/null +++ b/freebsd/sys/netinet/in_rss.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2010-2011 Juniper Networks, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson under contract + * to Juniper Networks, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_RSS_H_ +#define _NETINET_IN_RSS_H_ + +#include /* in_addr_t */ + +/* + * Network stack interface to generate a hash for a protocol tuple. + */ +uint32_t rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, + struct in_addr dst, u_short dstport); +uint32_t rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst); + +/* + * Functions to calculate a software RSS hash for a given mbuf or + * packet detail. + */ +int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, + uint32_t *hashval, uint32_t *hashtype); +int rss_proto_software_hash_v4(struct in_addr src, + struct in_addr dst, u_short src_port, u_short dst_port, + int proto, uint32_t *hashval, + uint32_t *hashtype); +struct mbuf * rss_soft_m2cpuid_v4(struct mbuf *m, uintptr_t source, + u_int *cpuid); + +#endif /* !_NETINET_IN_RSS_H_ */ diff --git a/freebsd/sys/netinet/in_systm.h b/freebsd/sys/netinet/in_systm.h index 4b34aa00..a4a56833 100644 --- a/freebsd/sys/netinet/in_systm.h +++ b/freebsd/sys/netinet/in_systm.h @@ -44,14 +44,26 @@ * Internally the system keeps counters in the headers with the bytes * swapped so that VAX instructions will work on them. It reverses * the bytes before transmission at each protocol level. The n_ types - * represent the types with the bytes in ``high-ender'' order. + * represent the types with the bytes in ``high-ender'' order. Network + * byte order is usually referred to as big-endian these days rather + * than high-ender, which sadly invokes an Orson Scott Card novel, or + * worse, the movie. */ typedef u_int16_t n_short; /* short as received from the net */ typedef u_int32_t n_long; /* long as received from the net */ -typedef u_int32_t n_time; /* ms since 00:00 GMT, byte rev */ +typedef u_int32_t n_time; /* ms since 00:00 UTC, byte rev */ #ifdef _KERNEL +struct inpcb; +struct ucred; + +#ifndef __rtems__ +int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp); +#else /* __rtems__ */ +#define cr_canseeinpcb(cred, inp) 0 +#endif /* __rtems__ */ + uint32_t iptime(void); #endif diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h index b8477309..af83e9a1 100644 --- a/freebsd/sys/netinet/in_var.h +++ b/freebsd/sys/netinet/in_var.h @@ -33,11 +33,24 @@ #ifndef _NETINET_IN_VAR_H_ #define _NETINET_IN_VAR_H_ +/* + * Argument structure for SIOCAIFADDR. + */ +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g.
"en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; + int ifra_vhid; +}; + +#ifdef _KERNEL #include #include #include -struct igmp_ifinfo; +struct igmp_ifsoftc; struct in_multi; struct lltable; @@ -46,7 +59,7 @@ struct lltable; */ struct in_ifinfo { struct lltable *ii_llt; /* ARP state */ - struct igmp_ifinfo *ii_igmp; /* IGMP state */ + struct igmp_ifsoftc *ii_igmp; /* IGMP state */ struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ }; @@ -71,25 +84,17 @@ struct in_ifaddr { struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ }; -struct in_aliasreq { - char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ - struct sockaddr_in ifra_addr; - struct sockaddr_in ifra_broadaddr; -#define ifra_dstaddr ifra_broadaddr - struct sockaddr_in ifra_mask; -}; /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. */ #define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) #define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) +#define IA_MASKSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_sockmask)) #define IN_LNAOF(in, ifa) \ ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) - -#ifdef _KERNEL extern u_char inetctlerrmap[]; #define LLTABLE(ifp) \ @@ -114,15 +119,15 @@ VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ #define INADDR_HASH(x) \ (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) -extern struct rwlock in_ifaddr_lock; +extern struct rmlock in_ifaddr_lock; -#define IN_IFADDR_LOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_LOCKED) -#define IN_IFADDR_RLOCK() rw_rlock(&in_ifaddr_lock) -#define IN_IFADDR_RLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_RLOCKED) -#define IN_IFADDR_RUNLOCK() rw_runlock(&in_ifaddr_lock) -#define IN_IFADDR_WLOCK() rw_wlock(&in_ifaddr_lock) -#define IN_IFADDR_WLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_WLOCKED) -#define IN_IFADDR_WUNLOCK() rw_wunlock(&in_ifaddr_lock) +#define IN_IFADDR_LOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_LOCKED) +#define IN_IFADDR_RLOCK(t) rm_rlock(&in_ifaddr_lock, (t)) +#define IN_IFADDR_RLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_RLOCKED) +#define IN_IFADDR_RUNLOCK(t) rm_runlock(&in_ifaddr_lock, (t)) +#define IN_IFADDR_WLOCK() rm_wlock(&in_ifaddr_lock) +#define IN_IFADDR_WLOCK_ASSERT() rm_assert(&in_ifaddr_lock, RA_WLOCKED) +#define IN_IFADDR_WUNLOCK() rm_wunlock(&in_ifaddr_lock) /* * Macro for finding the internet address structure (in_ifaddr) @@ -156,29 +161,20 @@ do { \ * Macro for finding the internet address structure (in_ifaddr) corresponding * to a given interface (ifnet structure). */ -#define IFP_TO_IA(ifp, ia) \ +#define IFP_TO_IA(ifp, ia, t) \ /* struct ifnet *ifp; */ \ /* struct in_ifaddr *ia; */ \ + /* struct rm_priotracker *t; */ \ do { \ - IN_IFADDR_RLOCK(); \ + IN_IFADDR_RLOCK((t)); \ for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ (ia) != NULL && (ia)->ia_ifp != (ifp); \ (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ ifa_ref(&(ia)->ia_ifa); \ - IN_IFADDR_RUNLOCK(); \ + IN_IFADDR_RUNLOCK((t)); \ } while (0) -#endif - -/* - * IP datagram reassembly. - */ -#define IPREASS_NHASH_LOG2 6 -#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) -#define IPREASS_HMASK (IPREASS_NHASH - 1) -#define IPREASS_HASH(x,y) \ - (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) /* * Legacy IPv4 IGMP per-link structure. 
@@ -190,28 +186,6 @@ struct router_info { SLIST_ENTRY(router_info) rti_list; }; -/* - * Per-interface IGMP router version information. - */ -struct igmp_ifinfo { - LIST_ENTRY(igmp_ifinfo) igi_link; - struct ifnet *igi_ifp; /* interface this instance belongs to */ - uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ - uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ - uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ - uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ - uint32_t igi_flags; /* IGMP per-interface flags */ - uint32_t igi_rv; /* IGMPv3 Robustness Variable */ - uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ - uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ - uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ - SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ - struct ifqueue igi_gq; /* queue of general query responses */ -}; - -#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ -#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ - /* * IPv4 multicast IGMP-layer source entry. */ @@ -290,12 +264,12 @@ struct in_multi { u_int inm_refcount; /* reference count */ /* New fields for IGMPv3 follow. */ - struct igmp_ifinfo *inm_igi; /* IGMP info */ + struct igmp_ifsoftc *inm_igi; /* IGMP info */ SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ struct ip_msource_tree inm_srcs; /* tree of sources */ u_long inm_nsrc; /* # of tree entries */ - struct ifqueue inm_scq; /* queue of pending + struct mbufq inm_scq; /* queue of pending * state-change packets */ struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ uint16_t inm_sctimer; /* state-change timer */ @@ -339,8 +313,6 @@ ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, return (MCAST_UNDEFINED); } -#ifdef _KERNEL - #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet); SYSCTL_DECL(_net_inet_ip); @@ -359,49 +331,6 @@ extern struct mtx in_multi_mtx; #define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) #define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) -/* - * Function for looking up an in_multi record for an IPv4 multicast address - * on a given interface. ifp must be valid. If no record found, return NULL. - * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. - */ -static __inline struct in_multi * -inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) -{ - struct ifmultiaddr *ifma; - struct in_multi *inm; - - IN_MULTI_LOCK_ASSERT(); - IF_ADDR_LOCK_ASSERT(ifp); - - inm = NULL; - TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { - if (ifma->ifma_addr->sa_family == AF_INET) { - inm = (struct in_multi *)ifma->ifma_protospec; - if (inm->inm_addr.s_addr == ina.s_addr) - break; - inm = NULL; - } - } - return (inm); -} - -/* - * Wrapper for inm_lookup_locked(). - * The IF_ADDR_LOCK will be taken on ifp and released on return. - */ -static __inline struct in_multi * -inm_lookup(struct ifnet *ifp, const struct in_addr ina) -{ - struct in_multi *inm; - - IN_MULTI_LOCK_ASSERT(); - IF_ADDR_RLOCK(ifp); - inm = inm_lookup_locked(ifp, ina); - IF_ADDR_RUNLOCK(ifp); - - return (inm); -} - /* Acquire an in_multi record. 
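
Among the changes above, inm_scq moves from the legacy ifqueue to the mbufq API, an unlocked queue that relies on the caller's synchronization (here, the IGMP locking). A short sketch of the calls involved; the function itself is illustrative only:

#include <sys/param.h>
#include <sys/mbuf.h>

/* Illustrative mbufq producer/consumer matching the inm_scq usage. */
static void
example_mbufq(struct mbuf *m)
{
        struct mbufq mq;

        mbufq_init(&mq, 16);            /* cap the queue at 16 packets */
        if (mbufq_enqueue(&mq, m) != 0)
                m_freem(m);             /* queue full */
        while ((m = mbufq_dequeue(&mq)) != NULL)
                m_freem(m);             /* consume */
        mbufq_drain(&mq);               /* free anything left over */
}
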
*/ static __inline void inm_acquire_locked(struct in_multi *inm) @@ -422,8 +351,9 @@ inm_acquire_locked(struct in_multi *inm) struct rtentry; struct route; struct ip_moptions; -struct radix_node_head; +struct in_multi *inm_lookup_locked(struct ifnet *, const struct in_addr); +struct in_multi *inm_lookup(struct ifnet *, const struct in_addr); int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, const struct sockaddr *, const struct sockaddr *); void inm_commit(struct in_multi *); @@ -444,30 +374,21 @@ int in_leavegroup_locked(struct in_multi *, /*const*/ struct in_mfilter *); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct thread *); -void in_rtqdrain(void); +int in_addprefix(struct in_ifaddr *, int); +int in_scrubprefix(struct in_ifaddr *, u_int); +void in_ifscrub_all(void); void ip_input(struct mbuf *); -int in_ifadown(struct ifaddr *ifa, int); -void in_ifscrub(struct ifnet *, struct in_ifaddr *, u_int); -struct mbuf *ip_fastforward(struct mbuf *); +void ip_direct_input(struct mbuf *); +void in_ifadown(struct ifaddr *ifa, int); +struct mbuf *ip_tryforward(struct mbuf *); void *in_domifattach(struct ifnet *); void in_domifdetach(struct ifnet *, void *); /* XXX */ void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); -void in_rtalloc(struct route *ro, u_int fibnum); -struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int); void in_rtredirect(struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, u_int); -int in_rtrequest(int, struct sockaddr *, - struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); -void in_setmatchfunc(struct radix_node_head *, int); - -#if 0 -int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); -int in_rtioctl(u_long, caddr_t, u_int); -int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); -#endif #endif /* _KERNEL */ /* INET6 stuff */ diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h index 79afeb8f..98bd1e99 100644 --- a/freebsd/sys/netinet/ip.h +++ b/freebsd/sys/netinet/ip.h @@ -67,7 +67,7 @@ struct ip { u_char ip_p; /* protocol */ u_short ip_sum; /* checksum */ struct in_addr ip_src,ip_dst; /* source and dest address */ -} __packed __aligned(4); +} __packed __aligned(2); #define IP_MAXPACKET 65535 /* maximum packet size */ @@ -80,19 +80,19 @@ struct ip { #define IPTOS_MINCOST 0x02 /* - * Definitions for IP precedence (also in ip_tos) (hopefully unused). + * Definitions for IP precedence (also in ip_tos) (deprecated). */ -#define IPTOS_PREC_NETCONTROL 0xe0 -#define IPTOS_PREC_INTERNETCONTROL 0xc0 -#define IPTOS_PREC_CRITIC_ECP 0xa0 -#define IPTOS_PREC_FLASHOVERRIDE 0x80 -#define IPTOS_PREC_FLASH 0x60 -#define IPTOS_PREC_IMMEDIATE 0x40 -#define IPTOS_PREC_PRIORITY 0x20 -#define IPTOS_PREC_ROUTINE 0x00 +#define IPTOS_PREC_NETCONTROL IPTOS_DSCP_CS7 +#define IPTOS_PREC_INTERNETCONTROL IPTOS_DSCP_CS6 +#define IPTOS_PREC_CRITIC_ECP IPTOS_DSCP_CS5 +#define IPTOS_PREC_FLASHOVERRIDE IPTOS_DSCP_CS4 +#define IPTOS_PREC_FLASH IPTOS_DSCP_CS3 +#define IPTOS_PREC_IMMEDIATE IPTOS_DSCP_CS2 +#define IPTOS_PREC_PRIORITY IPTOS_DSCP_CS1 +#define IPTOS_PREC_ROUTINE IPTOS_DSCP_CS0 /* - * Definitions for DiffServ Codepoints as per RFC2474 + * Definitions for DiffServ Codepoints as per RFC2474 and RFC5865. 
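
The aliasing above is value-preserving: each legacy IPTOS_PREC_* name maps to the class-selector codepoint occupying the same top three bits of the TOS octet, so existing callers keep their wire format. For illustration, a userland sketch (hypothetical helper name) that marks a socket with the EF codepoint defined just below:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>

/* Mark an IPv4 socket's traffic with DSCP EF: codepoint 46 shifted
 * left past the two ECN bits, i.e. 0xb8 in the TOS octet. */
static int
set_dscp_ef(int s)
{
        int tos = IPTOS_DSCP_EF;

        return (setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof(tos)));
}
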
*/ #define IPTOS_DSCP_CS0 0x00 #define IPTOS_DSCP_CS1 0x20 @@ -112,6 +112,7 @@ struct ip { #define IPTOS_DSCP_AF42 0x90 #define IPTOS_DSCP_AF43 0x98 #define IPTOS_DSCP_CS5 0xa0 +#define IPTOS_DSCP_VA 0xb0 #define IPTOS_DSCP_EF 0xb8 #define IPTOS_DSCP_CS6 0xc0 #define IPTOS_DSCP_CS7 0xe0 @@ -146,7 +147,7 @@ struct ip { #define IPOPT_SECURITY 130 /* provide s,c,h,tcc */ #define IPOPT_LSRR 131 /* loose source route */ #define IPOPT_ESO 133 /* extended security */ -#define IPOPT_CIPSO 134 /* commerical security */ +#define IPOPT_CIPSO 134 /* commercial security */ #define IPOPT_SATID 136 /* satnet id */ #define IPOPT_SSRR 137 /* strict source route */ #define IPOPT_RA 148 /* router alert */ diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h index 8f498410..ff870579 100644 --- a/freebsd/sys/netinet/ip6.h +++ b/freebsd/sys/netinet/ip6.h @@ -277,12 +277,6 @@ do { \ (((m) = m_pullup((m), (off) + (hlen))) == NULL)) { \ IP6STAT_INC(ip6s_exthdrtoolong); \ return ret; \ - } else if ((m)->m_flags & M_EXT) { \ - if ((m)->m_len < (off) + (hlen)) { \ - IP6STAT_INC(ip6s_exthdrtoolong); \ - m_freem(m); \ - return ret; \ - } \ } else { \ if ((m)->m_len < (off) + (hlen)) { \ IP6STAT_INC(ip6s_exthdrtoolong); \ diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c index 330023b1..6b683f45 100644 --- a/freebsd/sys/netinet/ip_carp.c +++ b/freebsd/sys/netinet/ip_carp.c @@ -1,8 +1,10 @@ #include -/* - * Copyright (c) 2002 Michael Shalayeff. All rights reserved. - * Copyright (c) 2003 Ryan McBride. All rights reserved. +/*- + * Copyright (c) 2002 Michael Shalayeff. + * Copyright (c) 2003 Ryan McBride. + * Copyright (c) 2011 Gleb Smirnoff + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,38 +35,33 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include #include -#include +#include +#include #include #include #include #include #include -#include #include #include #include +#include +#include #include #include -#include -#include -#include +#include +#include -#include -#include - -#include - -#include #include #include -#include #include -#include +#include #include +#include #include +#include #include #include @@ -73,12 +70,9 @@ __FBSDID("$FreeBSD$"); #include #include #include - #include #endif - #ifdef INET -#include #include #include #endif @@ -86,182 +80,254 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include #include -#include +#include #include #include -#include #include #endif #include -#define CARP_IFNAME "carp" -static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); -SYSCTL_DECL(_net_inet_carp); +static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses"); struct carp_softc { - struct ifnet *sc_ifp; /* Interface clue */ - struct ifnet *sc_carpdev; /* Pointer to parent interface */ - struct in_ifaddr *sc_ia; /* primary iface address */ + struct ifnet *sc_carpdev; /* Pointer to parent ifnet. */ + struct ifaddr **sc_ifas; /* Our ifaddrs. */ + struct sockaddr_dl sc_addr; /* Our link level address. */ + struct callout sc_ad_tmo; /* Advertising timeout. */ #ifdef INET - struct ip_moptions sc_imo; + struct callout sc_md_tmo; /* Master down timeout. */ #endif #ifdef INET6 - struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ - struct ip6_moptions sc_im6o; -#endif /* INET6 */ - TAILQ_ENTRY(carp_softc) sc_list; - - enum { INIT = 0, BACKUP, MASTER } sc_state; + struct callout sc_md6_tmo; /* XXX: Master down timeout. 
*/ +#endif + struct mtx sc_mtx; - int sc_flags_backup; - int sc_suppress; + int sc_vhid; + int sc_advskew; + int sc_advbase; - int sc_sendad_errors; + int sc_naddrs; + int sc_naddrs6; + int sc_ifasiz; + enum { INIT = 0, BACKUP, MASTER } sc_state; + int sc_suppress; + int sc_sendad_errors; #define CARP_SENDAD_MAX_ERRORS 3 - int sc_sendad_success; + int sc_sendad_success; #define CARP_SENDAD_MIN_SUCCESS 3 - int sc_vhid; - int sc_advskew; - int sc_naddrs; - int sc_naddrs6; - int sc_advbase; /* seconds */ - int sc_init_counter; - u_int64_t sc_counter; + int sc_init_counter; + uint64_t sc_counter; /* authentication */ -#define CARP_HMAC_PAD 64 +#define CARP_HMAC_PAD 64 unsigned char sc_key[CARP_KEY_LEN]; unsigned char sc_pad[CARP_HMAC_PAD]; SHA1_CTX sc_sha1; - struct callout sc_ad_tmo; /* advertisement timeout */ - struct callout sc_md_tmo; /* master down timeout */ - struct callout sc_md6_tmo; /* master down timeout */ - - LIST_ENTRY(carp_softc) sc_next; /* Interface clue */ + TAILQ_ENTRY(carp_softc) sc_list; /* On the carp_if list. */ + LIST_ENTRY(carp_softc) sc_next; /* On the global list. */ }; -#define SC2IFP(sc) ((sc)->sc_ifp) - -int carp_suppress_preempt = 0; -int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */ -SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); -SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW, - &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets"); -SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW, - &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode"); -SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW, - &carp_opts[CARPCTL_LOG], 0, "log bad carp packets"); -SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW, - &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses"); -SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD, - &carp_suppress_preempt, 0, "Preemption is suppressed"); - -struct carpstats carpstats; -SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW, - &carpstats, carpstats, - "CARP statistics (struct carpstats, netinet/ip_carp.h)"); struct carp_if { - TAILQ_HEAD(, carp_softc) vhif_vrs; - int vhif_nvrs; - - struct ifnet *vhif_ifp; - struct mtx vhif_mtx; +#ifdef INET + int cif_naddrs; +#endif +#ifdef INET6 + int cif_naddrs6; +#endif + TAILQ_HEAD(, carp_softc) cif_vrs; +#ifdef INET + struct ip_moptions cif_imo; +#endif +#ifdef INET6 + struct ip6_moptions cif_im6o; +#endif + struct ifnet *cif_ifp; + struct mtx cif_mtx; + uint32_t cif_flags; +#define CIF_PROMISC 0x00000001 }; #define CARP_INET 0 #define CARP_INET6 1 static int proto_reg[] = {-1, -1}; -/* Get carp_if from softc. Valid after carp_set_addr{,6}. */ -#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp) +/* + * Brief design of carp(4). + * + * Any carp-capable ifnet may have a list of carp softcs hanging off + * its ifp->if_carp pointer. Each softc represents one unique virtual + * host id, or vhid. The softc has a back pointer to the ifnet. All + * softcs are joined in a global list, which has quite limited use. + * + * Any interface address that takes part in CARP negotiation has a + * pointer to the softc of its vhid, ifa->ifa_carp. That could be either + * AF_INET or AF_INET6 address. + * + * Although, one can get the softc's backpointer to ifnet and traverse + * through its ifp->if_addrhead queue to find all interface addresses + * involved in CARP, we keep a growable array of ifaddr pointers. 
This + * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that + * do calls into the network stack, thus avoiding LORs. + * + * Locking: + * + * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(), + * callout-driven events and ioctl()s. + * + * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to + * traverse the global list we use the mutex carp_mtx. + * + * Known issues with locking: + * + * - Sending ad, we put the pointer to the softc in an mtag, and no reference + * counting is done on the softc. + * - On module unload we may race (?) with packet processing thread + * dereferencing our function pointers. + */ + +/* Accept incoming CARP packets. */ +static VNET_DEFINE(int, carp_allow) = 1; +#define V_carp_allow VNET(carp_allow) -/* lock per carp_if queue */ -#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \ - NULL, MTX_DEF) -#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx) -#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED) -#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx) -#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx) +/* Preempt slower nodes. */ +static VNET_DEFINE(int, carp_preempt) = 0; +#define V_carp_preempt VNET(carp_preempt) + +/* Log level. */ +static VNET_DEFINE(int, carp_log) = 1; +#define V_carp_log VNET(carp_log) + +/* Global advskew demotion. */ +static VNET_DEFINE(int, carp_demotion) = 0; +#define V_carp_demotion VNET(carp_demotion) + +/* Send error demotion factor. */ +static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW; +#define V_carp_senderr_adj VNET(carp_senderr_adj) -#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx) -#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED) +/* Iface down demotion factor. 
*/ +static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW; +#define V_carp_ifdown_adj VNET(carp_ifdown_adj) + +static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS); + +SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(carp_allow), 0, "Accept incoming CARP packets"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(carp_log), 0, "CARP log level"); +SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, + 0, 0, carp_demote_adj_sysctl, "I", + "Adjust demotion factor (skew of advskew)"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, + CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, + CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(carp_ifdown_adj), 0, + "Interface down demotion factor adjustment"); + +VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats); +VNET_PCPUSTAT_SYSINIT(carpstats); +VNET_PCPUSTAT_SYSUNINIT(carpstats); + +#define CARPSTATS_ADD(name, val) \ + counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \ + sizeof(uint64_t)], (val)) +#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) + +SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats, + carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)"); + +#define CARP_LOCK_INIT(sc) mtx_init(&(sc)->sc_mtx, "carp_softc", \ + NULL, MTX_DEF) +#define CARP_LOCK_DESTROY(sc) mtx_destroy(&(sc)->sc_mtx) +#define CARP_LOCK_ASSERT(sc) mtx_assert(&(sc)->sc_mtx, MA_OWNED) +#define CARP_LOCK(sc) mtx_lock(&(sc)->sc_mtx) +#define CARP_UNLOCK(sc) mtx_unlock(&(sc)->sc_mtx) +#define CIF_LOCK_INIT(cif) mtx_init(&(cif)->cif_mtx, "carp_if", \ + NULL, MTX_DEF) +#define CIF_LOCK_DESTROY(cif) mtx_destroy(&(cif)->cif_mtx) +#define CIF_LOCK_ASSERT(cif) mtx_assert(&(cif)->cif_mtx, MA_OWNED) +#define CIF_LOCK(cif) mtx_lock(&(cif)->cif_mtx) +#define CIF_UNLOCK(cif) mtx_unlock(&(cif)->cif_mtx) +#define CIF_FREE(cif) do { \ + CIF_LOCK(cif); \ + if (TAILQ_EMPTY(&(cif)->cif_vrs)) \ + carp_free_if(cif); \ + else \ + CIF_UNLOCK(cif); \ +} while (0) #define CARP_LOG(...) do { \ - if (carp_opts[CARPCTL_LOG] > 0) \ - log(LOG_INFO, __VA_ARGS__); \ + if (V_carp_log > 0) \ + log(LOG_INFO, "carp: " __VA_ARGS__); \ } while (0) #define CARP_DEBUG(...) 
do { \ - if (carp_opts[CARPCTL_LOG] > 1) \ + if (V_carp_log > 1) \ log(LOG_DEBUG, __VA_ARGS__); \ } while (0) -static void carp_hmac_prepare(struct carp_softc *); -static void carp_hmac_generate(struct carp_softc *, u_int32_t *, - unsigned char *); -static int carp_hmac_verify(struct carp_softc *, u_int32_t *, - unsigned char *); -static void carp_setroute(struct carp_softc *, int); -static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); -static int carp_clone_create(struct if_clone *, int, caddr_t); -static void carp_clone_destroy(struct ifnet *); -static void carpdetach(struct carp_softc *, int); -static int carp_prepare_ad(struct mbuf *, struct carp_softc *, - struct carp_header *); -static void carp_send_ad_all(void); -static void carp_send_ad(void *); -static void carp_send_ad_locked(struct carp_softc *); -#ifdef INET -static void carp_send_arp(struct carp_softc *); -#endif -static void carp_master_down(void *); -static void carp_master_down_locked(struct carp_softc *); -static int carp_ioctl(struct ifnet *, u_long, caddr_t); -static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct route *); -static void carp_start(struct ifnet *); -static void carp_setrun(struct carp_softc *, sa_family_t); -static void carp_set_state(struct carp_softc *, int); -#ifdef INET -static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); -#endif -enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; +#define IFNET_FOREACH_IFA(ifp, ifa) \ + IF_ADDR_LOCK_ASSERT(ifp); \ + TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link) \ + if ((ifa)->ifa_carp != NULL) -#ifdef INET -static void carp_multicast_cleanup(struct carp_softc *, int dofree); -static int carp_set_addr(struct carp_softc *, struct sockaddr_in *); -static int carp_del_addr(struct carp_softc *, struct sockaddr_in *); -#endif -static void carp_carpdev_state_locked(struct carp_if *); -static void carp_sc_state_locked(struct carp_softc *); -#ifdef INET6 -static void carp_send_na(struct carp_softc *); -static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *); -static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *); -static void carp_multicast6_cleanup(struct carp_softc *, int dofree); -#endif +#define CARP_FOREACH_IFA(sc, ifa) \ + CARP_LOCK_ASSERT(sc); \ + for (int _i = 0; \ + _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 && \ + ((ifa) = sc->sc_ifas[_i]) != NULL; \ + ++_i) -static LIST_HEAD(, carp_softc) carpif_list; -static struct mtx carp_mtx; -IFC_SIMPLE_DECLARE(carp, 0); +#define IFNET_FOREACH_CARP(ifp, sc) \ + CIF_LOCK_ASSERT(ifp->if_carp); \ + TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list) -static eventhandler_tag if_detach_event_tag; +#define DEMOTE_ADVSKEW(sc) \ + (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ? 
\ + CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion)) -static __inline u_int16_t -carp_cksum(struct mbuf *m, int len) -{ - return (in_cksum(m, len)); -} +static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); +static struct carp_softc + *carp_alloc(struct ifnet *); +static void carp_destroy(struct carp_softc *); +static struct carp_if + *carp_alloc_if(struct ifnet *); +static void carp_free_if(struct carp_if *); +static void carp_set_state(struct carp_softc *, int, const char* reason); +static void carp_sc_state(struct carp_softc *); +static void carp_setrun(struct carp_softc *, sa_family_t); +static void carp_master_down(void *); +static void carp_master_down_locked(struct carp_softc *, + const char* reason); +static void carp_send_ad(void *); +static void carp_send_ad_locked(struct carp_softc *); +static void carp_addroute(struct carp_softc *); +static void carp_ifa_addroute(struct ifaddr *); +static void carp_delroute(struct carp_softc *); +static void carp_ifa_delroute(struct ifaddr *); +static void carp_send_ad_all(void *, int); +static void carp_demote_adj(int, char *); + +static LIST_HEAD(, carp_softc) carp_list; +static struct mtx carp_mtx; +static struct sx carp_sx; +static struct task carp_sendall_task = + TASK_INITIALIZER(0, carp_send_ad_all, NULL); static void carp_hmac_prepare(struct carp_softc *sc) { - u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; - u_int8_t vhid = sc->sc_vhid & 0xff; + uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + uint8_t vhid = sc->sc_vhid & 0xff; struct ifaddr *ifa; int i, found; #ifdef INET @@ -271,18 +337,15 @@ carp_hmac_prepare(struct carp_softc *sc) struct in6_addr last6, cur6, in6; #endif - if (sc->sc_carpdev) - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); - /* XXX: possible race here */ - - /* compute ipad from key */ + /* Compute ipad from key. */ bzero(sc->sc_pad, sizeof(sc->sc_pad)); bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36; - /* precompute first part of inner hash */ + /* Precompute first part of inner hash. 
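
The pad setup above, together with the ipad-to-opad conversion at the end of carp_hmac_prepare(), is the standard HMAC construction of RFC 2104: the inner hash is keyed with key XOR 0x36 and the outer with key XOR 0x5c, so XORing the stored ipad with 0x36 ^ 0x5c recovers opad in place. A standalone sketch of that derivation, with hypothetical names:

#include <string.h>

#define HMAC_PAD_LEN    64

/*
 * Illustrative only: derive the two RFC 2104 HMAC pads from a key
 * (klen <= HMAC_PAD_LEN assumed, as with sc_key above).
 */
static void
hmac_derive_pads(const unsigned char *key, size_t klen,
    unsigned char ipad[HMAC_PAD_LEN], unsigned char opad[HMAC_PAD_LEN])
{
        size_t i;

        memset(ipad, 0, HMAC_PAD_LEN);
        memcpy(ipad, key, klen);
        for (i = 0; i < HMAC_PAD_LEN; i++) {
                ipad[i] ^= 0x36;
                /* (key ^ 0x36) ^ (0x36 ^ 0x5c) == key ^ 0x5c */
                opad[i] = ipad[i] ^ 0x36 ^ 0x5c;
        }
}
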
*/ SHA1Init(&sc->sc_sha1); SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); @@ -294,8 +357,7 @@ carp_hmac_prepare(struct carp_softc *sc) found = 0; last = cur; cur.s_addr = 0xffffffff; - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; if (ifa->ifa_addr->sa_family == AF_INET && ntohl(in.s_addr) > ntohl(last.s_addr) && @@ -304,7 +366,6 @@ carp_hmac_prepare(struct carp_softc *sc) found++; } } - IF_ADDR_RUNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); } while (found); @@ -315,8 +376,7 @@ carp_hmac_prepare(struct carp_softc *sc) found = 0; last6 = cur6; memset(&cur6, 0xff, sizeof(cur6)); - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) { in6 = ifatoia6(ifa)->ia_addr.sin6_addr; if (IN6_IS_SCOPE_EMBED(&in6)) in6.s6_addr16[1] = 0; @@ -327,7 +387,6 @@ carp_hmac_prepare(struct carp_softc *sc) found++; } } - IF_ADDR_RUNLOCK(SC2IFP(sc)); if (found) SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); } while (found); @@ -336,17 +395,16 @@ carp_hmac_prepare(struct carp_softc *sc) /* convert ipad to opad */ for (i = 0; i < sizeof(sc->sc_pad); i++) sc->sc_pad[i] ^= 0x36 ^ 0x5c; - - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); } static void -carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { SHA1_CTX sha1ctx; + CARP_LOCK_ASSERT(sc); + /* fetch first half of inner hash */ bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); @@ -361,260 +419,68 @@ carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], } static int -carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], +carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2], unsigned char md[20]) { unsigned char md2[20]; - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); carp_hmac_generate(sc, counter, md2); return (bcmp(md, md2, sizeof(md2))); } -static void -carp_setroute(struct carp_softc *sc, int cmd) -{ - struct ifaddr *ifa; - int s; - - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); - - s = splnet(); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { -#ifdef INET - if (ifa->ifa_addr->sa_family == AF_INET && - sc->sc_carpdev != NULL) { - int count = carp_addrcount( - (struct carp_if *)sc->sc_carpdev->if_carp, - ifatoia(ifa), CARP_COUNT_MASTER); - - if ((cmd == RTM_ADD && count == 1) || - (cmd == RTM_DELETE && count == 0)) - rtinit(ifa, cmd, RTF_UP | RTF_HOST); - } -#endif - } - splx(s); -} - -static int -carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) -{ - - struct carp_softc *sc; - struct ifnet *ifp; - - sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); - ifp = SC2IFP(sc) = if_alloc(IFT_ETHER); - if (ifp == NULL) { - free(sc, M_CARP); - return (ENOSPC); - } - - sc->sc_flags_backup = 0; - sc->sc_suppress = 0; - sc->sc_advbase = CARP_DFLTINTV; - sc->sc_vhid = -1; /* required setting */ - sc->sc_advskew = 0; - sc->sc_init_counter = 1; - sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? 
*/ -#ifdef INET - sc->sc_imo.imo_membership = (struct in_multi **)malloc( - (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_imo.imo_mfilters = NULL; - sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; - sc->sc_imo.imo_multicast_vif = -1; -#endif -#ifdef INET6 - sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc( - (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, - M_WAITOK); - sc->sc_im6o.im6o_mfilters = NULL; - sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; - sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; -#endif - - callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE); - callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE); - - ifp->if_softc = sc; - if_initname(ifp, CARP_IFNAME, unit); - ifp->if_mtu = ETHERMTU; - ifp->if_flags = IFF_LOOPBACK; - ifp->if_ioctl = carp_ioctl; - ifp->if_output = carp_looutput; - ifp->if_start = carp_start; - ifp->if_type = IFT_CARP; - ifp->if_snd.ifq_maxlen = ifqmaxlen; - ifp->if_hdrlen = 0; - if_attach(ifp); - bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t)); - mtx_lock(&carp_mtx); - LIST_INSERT_HEAD(&carpif_list, sc, sc_next); - mtx_unlock(&carp_mtx); - return (0); -} - -static void -carp_clone_destroy(struct ifnet *ifp) -{ - struct carp_softc *sc = ifp->if_softc; - - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carpdetach(sc, 1); /* Returns unlocked. */ - - mtx_lock(&carp_mtx); - LIST_REMOVE(sc, sc_next); - mtx_unlock(&carp_mtx); - bpfdetach(ifp); - if_detach(ifp); - if_free_type(ifp, IFT_ETHER); -#ifdef INET - free(sc->sc_imo.imo_membership, M_CARP); -#endif -#ifdef INET6 - free(sc->sc_im6o.im6o_membership, M_CARP); -#endif - free(sc, M_CARP); -} - -/* - * This function can be called on CARP interface destroy path, - * and in case of the removal of the underlying interface as - * well. We differentiate these two cases: in case of destruction - * of the underlying interface, we do not cleanup our multicast - * memberships, since they are already freed. But we purge pointers - * to multicast structures, since they are no longer valid, to - * avoid panic in future calls to carpdetach(). Also, we do not - * release the lock on return, because the function will be - * called once more, for another CARP instance on the same - * interface. - */ -static void -carpdetach(struct carp_softc *sc, int unlock) -{ - struct carp_if *cif; - - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - - if (sc->sc_suppress) - carp_suppress_preempt--; - sc->sc_suppress = 0; - - if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) - carp_suppress_preempt--; - sc->sc_sendad_errors = 0; - - carp_set_state(sc, INIT); - SC2IFP(sc)->if_flags &= ~IFF_UP; - carp_setrun(sc, 0); -#ifdef INET - carp_multicast_cleanup(sc, unlock); -#endif -#ifdef INET6 - carp_multicast6_cleanup(sc, unlock); -#endif - - if (sc->sc_carpdev != NULL) { - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - CARP_LOCK_ASSERT(cif); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - ifpromisc(sc->sc_carpdev, 0); - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else if (unlock) - CARP_UNLOCK(cif); - sc->sc_carpdev = NULL; - } -} - -/* Detach an interface from the carp. 
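
The hunk below converts carp_input() from the old void (struct mbuf *, int hlen) entry point to the protocol-input convention used by the rest of netinet: the handler receives (mp, offp, proto), takes ownership of the chain, and returns the next protocol to process or IPPROTO_DONE. A skeleton of that contract, illustrative only:

#include <sys/param.h>
#include <sys/mbuf.h>
#include <netinet/in.h>

/*
 * Shape of the pr_input convention adopted below: once the handler
 * clears *mp it owns the chain, and IPPROTO_DONE tells the caller
 * the chain has been consumed.
 */
static int
example_pr_input(struct mbuf **mp, int *offp, int proto)
{
        struct mbuf *m = *mp;

        *mp = NULL;                     /* we now own the chain */
        /* ... validate the packet at *offp, act on it ... */
        m_freem(m);
        return (IPPROTO_DONE);
}
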
*/ -static void -carp_ifdetach(void *arg __unused, struct ifnet *ifp) -{ - struct carp_if *cif = (struct carp_if *)ifp->if_carp; - struct carp_softc *sc, *nextsc; - - if (cif == NULL) - return; - - /* - * XXX: At the end of for() cycle the lock will be destroyed. - */ - CARP_LOCK(cif); - for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) { - nextsc = TAILQ_NEXT(sc, sc_list); - carpdetach(sc, 0); - } -} - /* * process input packet. * we have rearranged checks order compared to the rfc, * but it seems more efficient this way or not possible otherwise. */ #ifdef INET -void -carp_input(struct mbuf *m, int hlen) +int +carp_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct carp_header *ch; int iplen, len; - CARPSTATS_INC(carps_ipackets); + iplen = *offp; + *mp = NULL; - if (!carp_opts[CARPCTL_ALLOW]) { - m_freem(m); - return; - } + CARPSTATS_INC(carps_ipackets); - /* check if received on a valid carp interface */ - if (m->m_pkthdr.rcvif->if_carp == NULL) { - CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp_input: packet received on non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); + if (!V_carp_allow) { m_freem(m); - return; + return (IPPROTO_DONE); } /* verify that the IP TTL is 255. */ if (ip->ip_ttl != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n", + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, ip->ip_ttl, m->m_pkthdr.rcvif->if_xname); m_freem(m); - return; + return (IPPROTO_DONE); } iplen = ip->ip_hl << 2; if (m->m_pkthdr.len < iplen + sizeof(*ch)) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: received len %zd < " - "sizeof(struct carp_header) on %s\n", - m->m_len - sizeof(struct ip), + CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) " + "on %s\n", __func__, m->m_len - sizeof(struct ip), m->m_pkthdr.rcvif->if_xname); m_freem(m); - return; + return (IPPROTO_DONE); } if (iplen + sizeof(*ch) < m->m_len) { if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { CARPSTATS_INC(carps_hdrops); - CARP_DEBUG("carp_input: pullup failed\n"); - return; + CARP_DEBUG("%s: pullup failed\n", __func__); + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } @@ -627,32 +493,33 @@ carp_input(struct mbuf *m, int hlen) len = iplen + sizeof(*ch); if (len > m->m_pkthdr.len) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp_input: packet too short %d on %s\n", + CARP_DEBUG("%s: packet too short %d on %s\n", __func__, m->m_pkthdr.len, m->m_pkthdr.rcvif->if_xname); m_freem(m); - return; + return (IPPROTO_DONE); } if ((m = m_pullup(m, len)) == NULL) { CARPSTATS_INC(carps_hdrops); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); ch = (struct carp_header *)((char *)ip + iplen); /* verify the CARP checksum */ m->m_data += iplen; - if (carp_cksum(m, len - iplen)) { + if (in_cksum(m, len - iplen)) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp_input: checksum failed on %s\n", + CARP_DEBUG("%s: checksum failed on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); - return; + return (IPPROTO_DONE); } m->m_data -= iplen; carp_input_c(m, ch, AF_INET); + return (IPPROTO_DONE); } #endif @@ -667,7 +534,7 @@ carp6_input(struct mbuf **mp, int *offp, int proto) CARPSTATS_INC(carps_ipackets6); - if (!carp_opts[CARPCTL_ALLOW]) { + if (!V_carp_allow) { m_freem(m); return (IPPROTO_DONE); } @@ -675,9 +542,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto) /* check if received on a valid carp interface */ if 
(m->m_pkthdr.rcvif->if_carp == NULL) { CARPSTATS_INC(carps_badif); - CARP_DEBUG("carp6_input: packet received on non-carp " - "interface: %s\n", - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: packet received on non-carp interface: %s\n", + __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -685,9 +551,8 @@ carp6_input(struct mbuf **mp, int *offp, int proto) /* verify that the IP TTL is 255 */ if (ip6->ip6_hlim != CARP_DFLTTL) { CARPSTATS_INC(carps_badttl); - CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n", - ip6->ip6_hlim, - m->m_pkthdr.rcvif->if_xname); + CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__, + ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); } @@ -697,16 +562,16 @@ carp6_input(struct mbuf **mp, int *offp, int proto) IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); if (ch == NULL) { CARPSTATS_INC(carps_badlen); - CARP_DEBUG("carp6_input: packet size %u too small\n", len); + CARP_DEBUG("%s: packet size %u too small\n", __func__, len); return (IPPROTO_DONE); } /* verify the CARP checksum */ m->m_data += *offp; - if (carp_cksum(m, sizeof(*ch))) { + if (in_cksum(m, sizeof(*ch))) { CARPSTATS_INC(carps_badsum); - CARP_DEBUG("carp6_input: checksum failed, on %s\n", + CARP_DEBUG("%s: checksum failed, on %s\n", __func__, m->m_pkthdr.rcvif->if_xname); m_freem(m); return (IPPROTO_DONE); @@ -722,62 +587,46 @@ static void carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) { struct ifnet *ifp = m->m_pkthdr.rcvif; + struct ifaddr *ifa; struct carp_softc *sc; - u_int64_t tmp_counter; + uint64_t tmp_counter; struct timeval sc_tv, ch_tv; /* verify that the VHID is valid on the receiving interface */ - CARP_LOCK(ifp->if_carp); - TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) - if (sc->sc_vhid == ch->carp_vhid) + IF_ADDR_RLOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == af && + ifa->ifa_carp->sc_vhid == ch->carp_vhid) { + ifa_ref(ifa); break; + } + IF_ADDR_RUNLOCK(ifp); - if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { + if (ifa == NULL) { CARPSTATS_INC(carps_badvhid); - CARP_UNLOCK(ifp->if_carp); m_freem(m); return; } - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_ipackets++; - SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; - - if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { - uint32_t af1 = af; -#ifdef INET - struct ip *ip = mtod(m, struct ip *); - - /* BPF wants net byte order */ - if (af == AF_INET) { - ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); - ip->ip_off = htons(ip->ip_off); - } -#endif - bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); - } - /* verify the CARP version. 
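
Both carp_input() and carp6_input() above verify the checksum by temporarily advancing m_data, because in_cksum() always sums from the front of the chain. The same idiom as a self-contained helper with a hypothetical name:

#include <sys/param.h>
#include <sys/mbuf.h>
#include <machine/in_cksum.h>

/*
 * Checksum 'len' bytes starting 'hlen' bytes into an mbuf chain,
 * restoring the chain afterwards -- the pattern used by the CARP
 * input paths above.
 */
static int
cksum_at_offset(struct mbuf *m, int hlen, int len)
{
        int sum;

        m->m_data += hlen;
        sum = in_cksum(m, len);
        m->m_data -= hlen;
        return (sum);
}
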
*/ if (ch->carp_version != CARP_VERSION) { CARPSTATS_INC(carps_badver); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s; invalid version %d\n", - SC2IFP(sc)->if_xname, + CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname, ch->carp_version); + ifa_free(ifa); m_freem(m); return; } - /* verify the hash */ + sc = ifa->ifa_carp; + CARP_LOCK(sc); + ifa_free(ifa); + if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { CARPSTATS_INC(carps_badauth); - SC2IFP(sc)->if_ierrors++; - CARP_UNLOCK(ifp->if_carp); - CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname); - m_freem(m); - return; + CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__, + sc->sc_vhid, ifp->if_xname); + goto out; } tmp_counter = ntohl(ch->carp_counter[0]); @@ -790,10 +639,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) sc->sc_counter = tmp_counter; sc_tv.tv_sec = sc->sc_advbase; - if (carp_suppress_preempt && sc->sc_advskew < 240) - sc_tv.tv_usec = 240 * 1000000 / 256; - else - sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256; + sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256; ch_tv.tv_sec = ch->carp_advbase; ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; @@ -808,12 +654,10 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) if (timevalcmp(&sc_tv, &ch_tv, >) || timevalcmp(&sc_tv, &ch_tv, ==)) { callout_stop(&sc->sc_ad_tmo); - CARP_LOG("%s: MASTER -> BACKUP " - "(more frequent advertisement received)\n", - SC2IFP(sc)->if_xname); - carp_set_state(sc, BACKUP); + carp_set_state(sc, BACKUP, + "more frequent advertisement received"); carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); + carp_delroute(sc); } break; case BACKUP: @@ -821,12 +665,9 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) * If we're pre-empting masters who advertise slower than us, * and this one claims to be slower, treat him as down. 
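
For the timeval comparison above, each side is advbase seconds plus advskew/256 of a second. With advbase 1 and an effective skew of 100, DEMOTE_ADVSKEW() yields tv_usec = 100 * 1000000 / 256 = 390625, an expected interval of roughly 1.39 s, so a master advertising every 1.00 s retains mastership. The new demotion machinery simply inflates the local skew (capped at CARP_MAXSKEW) so that a degraded node loses such comparisons sooner.
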
*/ - if (carp_opts[CARPCTL_PREEMPT] && - timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " - "(preempting a slower master)\n", - SC2IFP(sc)->if_xname); - carp_master_down_locked(sc); + if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) { + carp_master_down_locked(sc, + "preempting a slower master"); break; } @@ -837,10 +678,7 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) */ sc_tv.tv_sec = sc->sc_advbase * 3; if (timevalcmp(&sc_tv, &ch_tv, <)) { - CARP_LOG("%s: BACKUP -> MASTER " - "(master timed out)\n", - SC2IFP(sc)->if_xname); - carp_master_down_locked(sc); + carp_master_down_locked(sc, "master will time out"); break; } @@ -852,17 +690,15 @@ carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) break; } - CARP_UNLOCK(ifp->if_carp); - +out: + CARP_UNLOCK(sc); m_freem(m); - return; } static int carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) { struct m_tag *mtag; - struct ifnet *ifp = SC2IFP(sc); if (sc->sc_init_counter) { /* this could also be seconds since unix epoch */ @@ -878,45 +714,79 @@ carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); /* Tag packet for carp_output */ - mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { + if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *), + M_NOWAIT)) == NULL) { m_freem(m); - SC2IFP(sc)->if_oerrors++; + CARPSTATS_INC(carps_onomem); return (ENOMEM); } - bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); + bcopy(&sc, mtag + 1, sizeof(sc)); m_tag_prepend(m, mtag); return (0); } +/* + * To avoid LORs and possible recursions this function shouldn't + * be called directly, but scheduled via taskqueue. + */ static void -carp_send_ad_all(void) +carp_send_ad_all(void *ctx __unused, int pending __unused) { struct carp_softc *sc; mtx_lock(&carp_mtx); - LIST_FOREACH(sc, &carpif_list, sc_next) { - if (sc->sc_carpdev == NULL) - continue; - CARP_SCLOCK(sc); - if ((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) && - sc->sc_state == MASTER) + LIST_FOREACH(sc, &carp_list, sc_next) + if (sc->sc_state == MASTER) { + CARP_LOCK(sc); + CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); - } + CURVNET_RESTORE(); + CARP_UNLOCK(sc); + } mtx_unlock(&carp_mtx); } +/* Send a periodic advertisement, executed in callout context. 
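
carp_prepare_ad() above now stores the softc pointer, rather than an ifnet pointer, in the PACKET_TAG_CARP tag so that carp_output() can recover the owning vhid later; as the design comment earlier admits, no reference is held on the tagged softc. The idiom as a stand-alone sketch, hypothetical wrapper name:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/mbuf.h>

struct carp_softc;

/* Stash a softc pointer in an mbuf tag, as carp_prepare_ad() does;
 * the payload lives immediately after the tag header. */
static int
carp_tag_mbuf(struct mbuf *m, struct carp_softc *sc)
{
        struct m_tag *mtag;

        mtag = m_tag_get(PACKET_TAG_CARP, sizeof(sc), M_NOWAIT);
        if (mtag == NULL)
                return (ENOMEM);
        bcopy(&sc, mtag + 1, sizeof(sc));
        m_tag_prepend(m, mtag);
        return (0);
}
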
*/ static void carp_send_ad(void *v) { struct carp_softc *sc = v; - CARP_SCLOCK(sc); + CARP_LOCK_ASSERT(sc); + CURVNET_SET(sc->sc_carpdev->if_vnet); carp_send_ad_locked(sc); - CARP_SCUNLOCK(sc); + CURVNET_RESTORE(); + CARP_UNLOCK(sc); +} + +static void +carp_send_ad_error(struct carp_softc *sc, int error) +{ + + if (error) { + if (sc->sc_sendad_errors < INT_MAX) + sc->sc_sendad_errors++; + if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { + static const char fmt[] = "send error %d on %s"; + char msg[sizeof(fmt) + IFNAMSIZ]; + + sprintf(msg, fmt, error, sc->sc_carpdev->if_xname); + carp_demote_adj(V_carp_senderr_adj, msg); + } + sc->sc_sendad_success = 0; + } else { + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS && + ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) { + static const char fmt[] = "send ok on %s"; + char msg[sizeof(fmt) + IFNAMSIZ]; + + sprintf(msg, fmt, sc->sc_carpdev->if_xname); + carp_demote_adj(-V_carp_senderr_adj, msg); + sc->sc_sendad_errors = 0; + } else + sc->sc_sendad_errors = 0; + } } static void @@ -924,476 +794,393 @@ carp_send_ad_locked(struct carp_softc *sc) { struct carp_header ch; struct timeval tv; + struct sockaddr sa; + struct ifaddr *ifa; struct carp_header *ch_ptr; struct mbuf *m; - int len, advbase, advskew; + int len, advskew; - CARP_SCLOCK_ASSERT(sc); + CARP_LOCK_ASSERT(sc); - /* bow out if we've lost our UPness or RUNNINGuiness */ - if (!((SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { - advbase = 255; - advskew = 255; - } else { - advbase = sc->sc_advbase; - if (!carp_suppress_preempt || sc->sc_advskew > 240) - advskew = sc->sc_advskew; - else - advskew = 240; - tv.tv_sec = advbase; - tv.tv_usec = advskew * 1000000 / 256; - } + advskew = DEMOTE_ADVSKEW(sc); + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = advskew * 1000000 / 256; ch.carp_version = CARP_VERSION; ch.carp_type = CARP_ADVERTISEMENT; ch.carp_vhid = sc->sc_vhid; - ch.carp_advbase = advbase; + ch.carp_advbase = sc->sc_advbase; ch.carp_advskew = advskew; ch.carp_authlen = 7; /* XXX DEFINE */ ch.carp_pad1 = 0; /* must be zero */ ch.carp_cksum = 0; + /* XXXGL: OpenBSD picks first ifaddr with needed family. */ + #ifdef INET - if (sc->sc_ia) { + if (sc->sc_naddrs) { struct ip *ip; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); - /* XXX maybe less ? 
*/ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); - return; + goto resched; } len = sizeof(*ip) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; - MH_ALIGN(m, m->m_len); + M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; - ip->ip_len = len; - ip->ip_id = ip_newid(); - ip->ip_off = IP_DF; + ip->ip_len = htons(len); + ip->ip_off = htons(IP_DF); ip->ip_ttl = CARP_DFLTTL; ip->ip_p = IPPROTO_CARP; ip->ip_sum = 0; - ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; + ip_fillid(ip); + + bzero(&sa, sizeof(sa)); + sa.sa_family = AF_INET; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + ip->ip_src.s_addr = + ifatoia(ifa)->ia_addr.sin_addr.s_addr; + ifa_free(ifa); + } else + ip->ip_src.s_addr = 0; ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); ch_ptr = (struct carp_header *)(&ip[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) - return; + goto resched; m->m_data += sizeof(*ip); - ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); + ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip)); m->m_data -= sizeof(*ip); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets); - if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { - SC2IFP(sc)->if_oerrors++; - if (sc->sc_sendad_errors < INT_MAX) - sc->sc_sendad_errors++; - if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { - carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } - } - sc->sc_sendad_success = 0; - } else { - if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { - if (++sc->sc_sendad_success >= - CARP_SENDAD_MIN_SUCCESS) { - carp_suppress_preempt--; - sc->sc_sendad_errors = 0; - } - } else - sc->sc_sendad_errors = 0; - } + carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT, + &sc->sc_carpdev->if_carp->cif_imo, NULL)); } #endif /* INET */ #ifdef INET6 - if (sc->sc_ia6) { + if (sc->sc_naddrs6) { struct ip6_hdr *ip6; - MGETHDR(m, M_DONTWAIT, MT_HEADER); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { - SC2IFP(sc)->if_oerrors++; CARPSTATS_INC(carps_onomem); - /* XXX maybe less ? */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); - return; + goto resched; } len = sizeof(*ip6) + sizeof(ch); m->m_pkthdr.len = len; m->m_pkthdr.rcvif = NULL; m->m_len = len; - MH_ALIGN(m, m->m_len); + M_ALIGN(m, m->m_len); m->m_flags |= M_MCAST; ip6 = mtod(m, struct ip6_hdr *); bzero(ip6, sizeof(*ip6)); ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_hlim = CARP_DFLTTL; ip6->ip6_nxt = IPPROTO_CARP; - bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, - sizeof(struct in6_addr)); - /* set the multicast destination */ + bzero(&sa, sizeof(sa)); + + /* set the source address */ + sa.sa_family = AF_INET6; + ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev); + if (ifa != NULL) { + bcopy(IFA_IN6(ifa), &ip6->ip6_src, + sizeof(struct in6_addr)); + ifa_free(ifa); + } else + /* This should never happen with IPv6. */ + bzero(&ip6->ip6_src, sizeof(struct in6_addr)); + /* Set the multicast destination. 
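
The stores that follow compose the link-scope CARP group ff02::12 byte by byte: the leading 16-bit word supplies the ff02 prefix and the final octet the group ID 0x12 (18, the counterpart of the IPv4 group 224.0.0.18); in6_setscope() then embeds the outgoing interface's zone before the packet is handed to ip6_output().
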
*/ ip6->ip6_dst.s6_addr16[0] = htons(0xff02); ip6->ip6_dst.s6_addr8[15] = 0x12; if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { - SC2IFP(sc)->if_oerrors++; m_freem(m); CARP_DEBUG("%s: in6_setscope failed\n", __func__); - return; + goto resched; } ch_ptr = (struct carp_header *)(&ip6[1]); bcopy(&ch, ch_ptr, sizeof(ch)); if (carp_prepare_ad(m, sc, ch_ptr)) - return; + goto resched; m->m_data += sizeof(*ip6); - ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); + ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6)); m->m_data -= sizeof(*ip6); - getmicrotime(&SC2IFP(sc)->if_lastchange); - SC2IFP(sc)->if_opackets++; - SC2IFP(sc)->if_obytes += len; CARPSTATS_INC(carps_opackets6); - if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { - SC2IFP(sc)->if_oerrors++; - if (sc->sc_sendad_errors < INT_MAX) - sc->sc_sendad_errors++; - if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { - carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } - } - sc->sc_sendad_success = 0; - } else { - if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { - if (++sc->sc_sendad_success >= - CARP_SENDAD_MIN_SUCCESS) { - carp_suppress_preempt--; - sc->sc_sendad_errors = 0; - } - } else - sc->sc_sendad_errors = 0; - } + carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0, + &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)); } #endif /* INET6 */ - if (advbase != 255 || advskew != 255) - callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), - carp_send_ad, sc); - +resched: + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc); } -#ifdef INET -/* - * Broadcast a gratuitous ARP request containing - * the virtual router MAC address for each IP address - * associated with the virtual router. - */ static void -carp_send_arp(struct carp_softc *sc) +carp_addroute(struct carp_softc *sc) { struct ifaddr *ifa; - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { - - if (ifa->ifa_addr->sa_family != AF_INET) - continue; + CARP_FOREACH_IFA(sc, ifa) + carp_ifa_addroute(ifa); +} -/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */ - arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp)); +static void +carp_ifa_addroute(struct ifaddr *ifa) +{ - DELAY(1000); /* XXX */ + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + in_addprefix(ifatoia(ifa), RTF_UP); + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia(ifa)->ia_addr); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_add_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + nd6_add_ifa_lle(ifatoia6(ifa)); + break; +#endif } } -#endif -#ifdef INET6 static void -carp_send_na(struct carp_softc *sc) +carp_delroute(struct carp_softc *sc) { struct ifaddr *ifa; - struct in6_addr *in6; - static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + CARP_FOREACH_IFA(sc, ifa) + carp_ifa_delroute(ifa); +} - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; +static void +carp_ifa_delroute(struct ifaddr *ifa) +{ - in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; - nd6_na_output(sc->sc_carpdev, &mcast, in6, - ND_NA_FLAG_OVERRIDE, 1, NULL); - DELAY(1000); /* XXX */ + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + ifa_del_loopback_route(ifa, + (struct sockaddr *)&ifatoia(ifa)->ia_addr); + in_scrubprefix(ifatoia(ifa), LLE_STATIC); + break; +#endif +#ifdef INET6 + case AF_INET6: + ifa_del_loopback_route(ifa, + (struct sockaddr *)&ifatoia6(ifa)->ia_addr); + 
nd6_rem_ifa_lle(ifatoia6(ifa), 1); + break; +#endif } } -#endif /* INET6 */ + +int +carp_master(struct ifaddr *ifa) +{ + struct carp_softc *sc = ifa->ifa_carp; + + return (sc->sc_state == MASTER); +} #ifdef INET -static int -carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) +/* + * Broadcast a gratuitous ARP request containing + * the virtual router MAC address for each IP address + * associated with the virtual router. + */ +static void +carp_send_arp(struct carp_softc *sc) { - struct carp_softc *vh; struct ifaddr *ifa; - int count = 0; - - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((type == CARP_COUNT_RUNNING && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) || - (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) - count++; - } - IF_ADDR_RUNLOCK(SC2IFP(vh)); - } + struct in_addr addr; + + CARP_FOREACH_IFA(sc, ifa) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; + arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr)); } - return (count); } int -carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia, - struct in_addr *isaddr, u_int8_t **enaddr) +carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr) { - struct carp_if *cif; - struct carp_softc *vh; - int index, count = 0; - struct ifaddr *ifa; + struct carp_softc *sc = ifa->ifa_carp; - cif = ifp->if_carp; - CARP_LOCK(cif); - - if (carp_opts[CARPCTL_ARPBALANCE]) { - /* - * XXX proof of concept implementation. - * We use the source ip to decide which virtual host should - * handle the request. If we're master of that virtual host, - * then we respond, otherwise, just drop the arp packet on - * the floor. 
- */ - count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); - if (count == 0) { - /* should never reach this */ - CARP_UNLOCK(cif); - return (0); - } - - /* this should be a hash, like pf_hash() */ - index = ntohl(isaddr->s_addr) % count; - count = 0; - - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, - ifa_list) { - if (ifa->ifa_addr->sa_family == - AF_INET && - ia->ia_addr.sin_addr.s_addr == - ifatoia(ifa)->ia_addr.sin_addr.s_addr) { - if (count == index) { - if (vh->sc_state == - MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (1); - } else { - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (0); - } - } - count++; - } - } - IF_ADDR_RUNLOCK(SC2IFP(vh)); - } - } - } else { - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - ia->ia_ifp == SC2IFP(vh) && - vh->sc_state == MASTER) { - *enaddr = IF_LLADDR(vh->sc_ifp); - CARP_UNLOCK(cif); - return (1); - } - } + if (sc->sc_state == MASTER) { + *enaddr = LLADDR(&sc->sc_addr); + return (1); } - CARP_UNLOCK(cif); + return (0); } #endif #ifdef INET6 +static void +carp_send_na(struct carp_softc *sc) +{ + static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; + struct ifaddr *ifa; + struct in6_addr *in6; + + CARP_FOREACH_IFA(sc, ifa) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + + in6 = IFA_IN6(ifa); + nd6_na_output(sc->sc_carpdev, &mcast, in6, + ND_NA_FLAG_OVERRIDE, 1, NULL); + DELAY(1000); /* XXX */ + } +} + +/* + * Returns ifa in case it's a carp address and it is MASTER, or if the address + * matches and is not a carp address. Returns NULL otherwise. 
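
A pattern worth noting across the rewritten lookups (carp_input_c() above, carp_iamatch6() below): scan the interface address list under IF_ADDR_RLOCK(), take a reference with ifa_ref() before dropping the lock, and leave the matching ifa_free() to the consumer. That is what lets the new code avoid holding interface locks across calls back into the stack, per the LOR note in the design comment.
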
+ */ struct ifaddr * carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) { - struct carp_if *cif; - struct carp_softc *vh; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { - IF_ADDR_RLOCK(SC2IFP(vh)); - TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER) { - ifa_ref(ifa); - IF_ADDR_RUNLOCK(SC2IFP(vh)); - CARP_UNLOCK(cif); - return (ifa); - } - } - IF_ADDR_RUNLOCK(SC2IFP(vh)); + ifa = NULL; + IF_ADDR_RLOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) + continue; + if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER) + ifa = NULL; + else + ifa_ref(ifa); + break; } - CARP_UNLOCK(cif); - - return (NULL); + IF_ADDR_RUNLOCK(ifp); + + return (ifa); } caddr_t carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) { - struct m_tag *mtag; - struct carp_if *cif; - struct carp_softc *sc; struct ifaddr *ifa; - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { - IF_ADDR_RLOCK(SC2IFP(sc)); - TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { - if (IN6_ARE_ADDR_EQUAL(taddr, - &ifatoia6(ifa)->ia_addr.sin6_addr) && - (SC2IFP(sc)->if_flags & IFF_UP) && - (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) { - struct ifnet *ifp = SC2IFP(sc); - mtag = m_tag_get(PACKET_TAG_CARP, - sizeof(struct ifnet *), M_NOWAIT); - if (mtag == NULL) { - /* better a bit than nothing */ - IF_ADDR_RUNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } - bcopy(&ifp, (caddr_t)(mtag + 1), - sizeof(struct ifnet *)); - m_tag_prepend(m, mtag); + IF_ADDR_RLOCK(ifp); + IFNET_FOREACH_IFA(ifp, ifa) + if (ifa->ifa_addr->sa_family == AF_INET6 && + IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) { + struct carp_softc *sc = ifa->ifa_carp; + struct m_tag *mtag; - IF_ADDR_RUNLOCK(SC2IFP(sc)); - CARP_UNLOCK(cif); - return (IF_LLADDR(sc->sc_ifp)); - } + IF_ADDR_RUNLOCK(ifp); + + mtag = m_tag_get(PACKET_TAG_CARP, + sizeof(struct carp_softc *), M_NOWAIT); + if (mtag == NULL) + /* Better a bit than nothing. 
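
The byte tests in carp_forus() in the next hunk encode the CARP virtual MAC prefix 00:00:5e:00:01, from the same IANA block VRRP uses; only the final octet (the VHID) varies per softc, so a destination address must pass the cheap prefix check before the full LLADDR comparison against each sc_addr.
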
*/ + return (LLADDR(&sc->sc_addr)); + + bcopy(&sc, mtag + 1, sizeof(sc)); + m_tag_prepend(m, mtag); + + return (LLADDR(&sc->sc_addr)); } - IF_ADDR_RUNLOCK(SC2IFP(sc)); - } - CARP_UNLOCK(cif); + IF_ADDR_RUNLOCK(ifp); return (NULL); } -#endif +#endif /* INET6 */ -struct ifnet * +int carp_forus(struct ifnet *ifp, u_char *dhost) { - struct carp_if *cif; - struct carp_softc *vh; - u_int8_t *ena = dhost; + struct carp_softc *sc; + uint8_t *ena = dhost; if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) - return (NULL); - - cif = ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) - if ((SC2IFP(vh)->if_flags & IFF_UP) && - (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && - vh->sc_state == MASTER && - !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) { - CARP_UNLOCK(cif); - return (SC2IFP(vh)); + return (0); + + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr), + ETHER_ADDR_LEN)) { + CARP_UNLOCK(sc); + CIF_UNLOCK(ifp->if_carp); + return (1); } + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); - CARP_UNLOCK(cif); - return (NULL); + return (0); } +/* Master down timeout event, executed in callout context. */ static void carp_master_down(void *v) { struct carp_softc *sc = v; - CARP_SCLOCK(sc); - carp_master_down_locked(sc); - CARP_SCUNLOCK(sc); + CARP_LOCK_ASSERT(sc); + + CURVNET_SET(sc->sc_carpdev->if_vnet); + if (sc->sc_state == BACKUP) { + carp_master_down_locked(sc, "master timed out"); + } + CURVNET_RESTORE(); + + CARP_UNLOCK(sc); } static void -carp_master_down_locked(struct carp_softc *sc) +carp_master_down_locked(struct carp_softc *sc, const char *reason) { - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); + + CARP_LOCK_ASSERT(sc); switch (sc->sc_state) { - case INIT: - printf("%s: master_down event in INIT state\n", - SC2IFP(sc)->if_xname); - break; - case MASTER: - break; case BACKUP: - carp_set_state(sc, MASTER); + carp_set_state(sc, MASTER, reason); carp_send_ad_locked(sc); #ifdef INET carp_send_arp(sc); #endif #ifdef INET6 carp_send_na(sc); -#endif /* INET6 */ +#endif carp_setrun(sc, 0); - carp_setroute(sc, RTM_ADD); + carp_addroute(sc); + break; + case INIT: + case MASTER: +#ifdef INVARIANTS + panic("carp: VHID %u@%s: master_down event in %s state\n", + sc->sc_vhid, + sc->sc_carpdev->if_xname, + sc->sc_state ? 
"MASTER" : "INIT"); +#endif break; } } @@ -1407,28 +1194,16 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) { struct timeval tv; - if (sc->sc_carpdev == NULL) { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_set_state(sc, INIT); - return; - } else - CARP_SCLOCK_ASSERT(sc); - - if (SC2IFP(sc)->if_flags & IFF_UP && - sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) && - sc->sc_carpdev->if_link_state == LINK_STATE_UP) - SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; - else { - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - carp_setroute(sc, RTM_DELETE); + CARP_LOCK_ASSERT(sc); + + if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 || + sc->sc_carpdev->if_link_state != LINK_STATE_UP || + (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)) return; - } switch (sc->sc_state) { case INIT: - CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname); - carp_set_state(sc, BACKUP); - carp_setroute(sc, RTM_DELETE); + carp_set_state(sc, BACKUP, "initialization complete"); carp_setrun(sc, 0); break; case BACKUP: @@ -1441,20 +1216,24 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); break; -#endif /* INET6 */ +#endif default: +#ifdef INET if (sc->sc_naddrs) callout_reset(&sc->sc_md_tmo, tvtohz(&tv), carp_master_down, sc); +#endif +#ifdef INET6 if (sc->sc_naddrs6) callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), carp_master_down, sc); +#endif break; } break; @@ -1467,842 +1246,779 @@ carp_setrun(struct carp_softc *sc, sa_family_t af) } } -#ifdef INET -static void -carp_multicast_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip_moptions *imo = &sc->sc_imo; - u_int16_t n = imo->imo_num_memberships; - - /* Clean up our own multicast memberships */ - while (n-- > 0) { - if (imo->imo_membership[n] != NULL) { - if (dofree) - in_delmulti(imo->imo_membership[n]); - imo->imo_membership[n] = NULL; - } - } - KASSERT(imo->imo_mfilters == NULL, - ("%s: imo_mfilters != NULL", __func__)); - imo->imo_num_memberships = 0; - imo->imo_multicast_ifp = NULL; -} -#endif - -#ifdef INET6 -static void -carp_multicast6_cleanup(struct carp_softc *sc, int dofree) -{ - struct ip6_moptions *im6o = &sc->sc_im6o; - u_int16_t n = im6o->im6o_num_memberships; - - while (n-- > 0) { - if (im6o->im6o_membership[n] != NULL) { - if (dofree) - in6_mc_leave(im6o->im6o_membership[n], NULL); - im6o->im6o_membership[n] = NULL; - } - } - KASSERT(im6o->im6o_mfilters == NULL, - ("%s: im6o_mfilters != NULL", __func__)); - im6o->im6o_num_memberships = 0; - im6o->im6o_multicast_ifp = NULL; -} -#endif - -#ifdef INET +/* + * Setup multicast structures. 
+ */ static int -carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) +carp_multicast_setup(struct carp_if *cif, sa_family_t sa) { - struct ifnet *ifp; - struct carp_if *cif; - struct in_ifaddr *ia, *ia_if; - struct ip_moptions *imo = &sc->sc_imo; - struct in_addr addr; - u_long iaddr = htonl(sin->sin_addr.s_addr); - int own, error; - - if (sin->sin_addr.s_addr == 0) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if (sc->sc_naddrs) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } - - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { - if (!ia_if) - ia_if = ia; - if (sin->sin_addr.s_addr == - ia->ia_addr.sin_addr.s_addr) - own++; - } - } - - if (!ia_if) { - IN_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } + struct ifnet *ifp = cif->cif_ifp; + int error = 0; - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + switch (sa) { +#ifdef INET + case AF_INET: + { + struct ip_moptions *imo = &cif->cif_imo; + struct in_addr addr; - ifp = ia->ia_ifp; + if (imo->imo_membership) + return (0); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } + imo->imo_membership = (struct in_multi **)malloc( + (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, + M_WAITOK); + imo->imo_mfilters = NULL; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_multicast_vif = -1; - if (imo->imo_num_memberships == 0) { addr.s_addr = htonl(INADDR_CARP_GROUP); - if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == - NULL) { - ifa_free(&ia->ia_ifa); - return (ENOBUFS); + if ((error = in_joingroup(ifp, &addr, NULL, + &imo->imo_membership[0])) != 0) { + free(imo->imo_membership, M_CARP); + break; } imo->imo_num_memberships++; imo->imo_multicast_ifp = ifp; imo->imo_multicast_ttl = CARP_DFLTTL; imo->imo_multicast_loop = 0; - } + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct ip6_moptions *im6o = &cif->cif_im6o; + struct in6_addr in6; + struct in6_multi *in6m; + + if (im6o->im6o_membership) + return (0); - if (!ifp->if_carp) { + im6o->im6o_membership = (struct in6_multi **)malloc( + (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, + M_ZERO | M_WAITOK); + im6o->im6o_mfilters = NULL; + im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + im6o->im6o_multicast_hlim = CARP_DFLTTL; + im6o->im6o_multicast_ifp = ifp; - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; + /* Join IPv6 CARP multicast group. 
*/ + bzero(&in6, sizeof(in6)); + in6.s6_addr16[0] = htons(0xff02); + in6.s6_addr8[15] = 0x12; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; + in6m = NULL; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + free(im6o->im6o_membership, M_CARP); + break; } - - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; + im6o->im6o_membership[0] = in6m; + im6o->im6o_num_memberships++; - } else { - struct carp_softc *vr; - - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EEXIST; - goto cleanup; - } + /* Join solicited multicast address. */ + bzero(&in6, sizeof(in6)); + in6.s6_addr16[0] = htons(0xff02); + in6.s6_addr32[1] = 0; + in6.s6_addr32[2] = htonl(1); + in6.s6_addr32[3] = 0; + in6.s6_addr8[12] = 0xff; + if ((error = in6_setscope(&in6, ifp, NULL)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } + in6m = NULL; + if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) { + in6_mc_leave(im6o->im6o_membership[0], NULL); + free(im6o->im6o_membership, M_CARP); + break; + } + im6o->im6o_membership[1] = in6m; + im6o->im6o_num_memberships++; + break; + } +#endif } - sc->sc_ia = ia; - sc->sc_carpdev = ifp; - { /* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; + return (error); +} - /* XXX: cif should not change, right? So we still hold the lock */ - CARP_LOCK_ASSERT(cif); +/* + * Free multicast structures. + */ +static void +carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa) +{ - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } + sx_assert(&carp_sx, SA_XLOCKED); + + switch (sa) { +#ifdef INET + case AF_INET: + if (cif->cif_naddrs == 0) { + struct ip_moptions *imo = &cif->cif_imo; + + in_leavegroup(imo->imo_membership[0], NULL); + KASSERT(imo->imo_mfilters == NULL, + ("%s: imo_mfilters != NULL", __func__)); + free(imo->imo_membership, M_CARP); + imo->imo_membership = NULL; - if (!myself) { - /* We're trying to keep things in order */ - if (after == NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); } - cif->vhif_nvrs++; - } + break; +#endif +#ifdef INET6 + case AF_INET6: + if (cif->cif_naddrs6 == 0) { + struct ip6_moptions *im6o = &cif->cif_im6o; + + in6_mc_leave(im6o->im6o_membership[0], NULL); + in6_mc_leave(im6o->im6o_membership[1], NULL); + KASSERT(im6o->im6o_mfilters == NULL, + ("%s: im6o_mfilters != NULL", __func__)); + free(im6o->im6o_membership, M_CARP); + im6o->im6o_membership = NULL; + } + break; +#endif } +} - sc->sc_naddrs++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); +int +carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa) +{ + struct m_tag *mtag; + struct carp_softc *sc; - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. 
*/ + if (!sa) + return (0); - return (0); + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + break; +#endif +#ifdef INET6 + case AF_INET6: + break; +#endif + default: + return (0); + } -cleanup: - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - ifa_free(&ia->ia_ifa); - return (error); -} + mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); + if (mtag == NULL) + return (0); -static int -carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) -{ - int error = 0; + bcopy(mtag + 1, &sc, sizeof(sc)); - if (!--sc->sc_naddrs) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - struct ip_moptions *imo = &sc->sc_imo; + /* Set the source MAC address to the Virtual Router MAC Address. */ + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_BRIDGE: + case IFT_L2VLAN: { + struct ether_header *eh; - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); - imo->imo_multicast_ifp = NULL; - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - sc->sc_carpdev->if_carp = NULL; - CARP_LOCK_DESTROY(cif); - free(cif, M_CARP); - } else { - CARP_UNLOCK(cif); + eh = mtod(m, struct ether_header *); + eh->ether_shost[0] = 0; + eh->ether_shost[1] = 0; + eh->ether_shost[2] = 0x5e; + eh->ether_shost[3] = 0; + eh->ether_shost[4] = 1; + eh->ether_shost[5] = sc->sc_vhid; } + break; + case IFT_FDDI: { + struct fddi_header *fh; + + fh = mtod(m, struct fddi_header *); + fh->fddi_shost[0] = 0; + fh->fddi_shost[1] = 0; + fh->fddi_shost[2] = 0x5e; + fh->fddi_shost[3] = 0; + fh->fddi_shost[4] = 1; + fh->fddi_shost[5] = sc->sc_vhid; + } + break; + case IFT_ISO88025: { + struct iso88025_header *th; + th = mtod(m, struct iso88025_header *); + th->iso88025_shost[0] = 3; + th->iso88025_shost[1] = 0; + th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1); + th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); + th->iso88025_shost[4] = 0; + th->iso88025_shost[5] = 0; + } + break; + default: + printf("%s: carp is not supported for the %d interface type\n", + ifp->if_xname, ifp->if_type); + return (EOPNOTSUPP); } - return (error); + return (0); } -#endif -#ifdef INET6 -static int -carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +static struct carp_softc* +carp_alloc(struct ifnet *ifp) { - struct ifnet *ifp; + struct carp_softc *sc; struct carp_if *cif; - struct in6_ifaddr *ia, *ia_if; - struct ip6_moptions *im6o = &sc->sc_im6o; - struct in6_addr in6; - int own, error; - - error = 0; - - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - if (!(SC2IFP(sc)->if_flags & IFF_UP)) - carp_set_state(sc, INIT); - if (sc->sc_naddrs6) - SC2IFP(sc)->if_flags |= IFF_UP; - if (sc->sc_carpdev) - CARP_SCLOCK(sc); - carp_setrun(sc, 0); - if (sc->sc_carpdev) - CARP_SCUNLOCK(sc); - return (0); - } - /* we have to do it by hands to check we won't match on us */ - ia_if = NULL; own = 0; - IN6_IFADDR_RLOCK(); - TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - int i; - - for (i = 0; i < 4; i++) { - if ((sin6->sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != - (ia->ia_addr.sin6_addr.s6_addr32[i] & - ia->ia_prefixmask.sin6_addr.s6_addr32[i])) - break; - } - /* and, yeah, we need a multicast-capable iface too */ - if (ia->ia_ifp != SC2IFP(sc) && - (ia->ia_ifp->if_flags & IFF_MULTICAST) && - (i == 4)) { - if (!ia_if) - ia_if = ia; - if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, - &ia->ia_addr.sin6_addr)) - own++; 
- } - } + if ((cif = ifp->if_carp) == NULL) + cif = carp_alloc_if(ifp); - if (!ia_if) { - IN6_IFADDR_RUNLOCK(); - return (EADDRNOTAVAIL); - } - ia = ia_if; - ifa_ref(&ia->ia_ifa); - IN6_IFADDR_RUNLOCK(); - ifp = ia->ia_ifp; - - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || - (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) { - ifa_free(&ia->ia_ifa); - return (EADDRNOTAVAIL); - } + sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); - if (!sc->sc_naddrs6) { - struct in6_multi *in6m; + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_vhid = -1; /* required setting */ + sc->sc_init_counter = 1; + sc->sc_state = INIT; - im6o->im6o_multicast_ifp = ifp; + sc->sc_ifasiz = sizeof(struct ifaddr *); + sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO); + sc->sc_carpdev = ifp; - /* join CARP multicast address */ - bzero(&in6, sizeof(in6)); - in6.s6_addr16[0] = htons(0xff02); - in6.s6_addr8[15] = 0x12; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; - in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; - im6o->im6o_membership[0] = in6m; - im6o->im6o_num_memberships++; + CARP_LOCK_INIT(sc); +#ifdef INET + callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif +#ifdef INET6 + callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); +#endif + callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED); - /* join solicited multicast address */ - bzero(&in6, sizeof(in6)); - in6.s6_addr16[0] = htons(0xff02); - in6.s6_addr32[1] = 0; - in6.s6_addr32[2] = htonl(1); - in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; - in6.s6_addr8[12] = 0xff; - if (in6_setscope(&in6, ifp, NULL) != 0) - goto cleanup; - in6m = NULL; - error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); - if (error) - goto cleanup; - im6o->im6o_membership[1] = in6m; - im6o->im6o_num_memberships++; - } + CIF_LOCK(cif); + TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list); + CIF_UNLOCK(cif); - if (!ifp->if_carp) { - cif = malloc(sizeof(*cif), M_CARP, - M_WAITOK|M_ZERO); - if (!cif) { - error = ENOBUFS; - goto cleanup; - } - if ((error = ifpromisc(ifp, 1))) { - free(cif, M_CARP); - goto cleanup; - } + mtx_lock(&carp_mtx); + LIST_INSERT_HEAD(&carp_list, sc, sc_next); + mtx_unlock(&carp_mtx); - CARP_LOCK_INIT(cif); - CARP_LOCK(cif); - cif->vhif_ifp = ifp; - TAILQ_INIT(&cif->vhif_vrs); - ifp->if_carp = cif; + return (sc); +} - } else { - struct carp_softc *vr; +static void +carp_grow_ifas(struct carp_softc *sc) +{ + struct ifaddr **new; + + new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO); + CARP_LOCK(sc); + bcopy(sc->sc_ifas, new, sc->sc_ifasiz); + free(sc->sc_ifas, M_CARP); + sc->sc_ifas = new; + sc->sc_ifasiz *= 2; + CARP_UNLOCK(sc); +} - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK(cif); - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && vr->sc_vhid == sc->sc_vhid) { - CARP_UNLOCK(cif); - error = EINVAL; - goto cleanup; - } - } - sc->sc_ia6 = ia; - sc->sc_carpdev = ifp; +static void +carp_destroy(struct carp_softc *sc) +{ + struct ifnet *ifp = sc->sc_carpdev; + struct carp_if *cif = ifp->if_carp; - { /* XXX prevent endless loop if already in queue */ - struct carp_softc *vr, *after = NULL; - int myself = 0; - cif = (struct carp_if *)ifp->if_carp; - CARP_LOCK_ASSERT(cif); - - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { - if (vr == sc) - myself = 1; - if (vr->sc_vhid < sc->sc_vhid) - after = vr; - } + sx_assert(&carp_sx, SA_XLOCKED); - if (!myself) { - /* We're trying to keep things in order */ - if (after 
== NULL) { - TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); - } else { - TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); - } - cif->vhif_nvrs++; - } - } + if (sc->sc_suppress) + carp_demote_adj(-V_carp_ifdown_adj, "vhid removed"); + CARP_UNLOCK(sc); + + CIF_LOCK(cif); + TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list); + CIF_UNLOCK(cif); + + mtx_lock(&carp_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&carp_mtx); + + callout_drain(&sc->sc_ad_tmo); +#ifdef INET + callout_drain(&sc->sc_md_tmo); +#endif +#ifdef INET6 + callout_drain(&sc->sc_md6_tmo); +#endif + CARP_LOCK_DESTROY(sc); + + free(sc->sc_ifas, M_CARP); + free(sc, M_CARP); +} + +static struct carp_if* +carp_alloc_if(struct ifnet *ifp) +{ + struct carp_if *cif; + int error; + + cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO); + + if ((error = ifpromisc(ifp, 1)) != 0) + printf("%s: ifpromisc(%s) failed: %d\n", + __func__, ifp->if_xname, error); + else + cif->cif_flags |= CIF_PROMISC; + + CIF_LOCK_INIT(cif); + cif->cif_ifp = ifp; + TAILQ_INIT(&cif->cif_vrs); + + IF_ADDR_WLOCK(ifp); + ifp->if_carp = cif; + if_ref(ifp); + IF_ADDR_WUNLOCK(ifp); + + return (cif); +} + +static void +carp_free_if(struct carp_if *cif) +{ + struct ifnet *ifp = cif->cif_ifp; - sc->sc_naddrs6++; - SC2IFP(sc)->if_flags |= IFF_UP; - if (own) - sc->sc_advskew = 0; - carp_sc_state_locked(sc); - carp_setrun(sc, 0); + CIF_LOCK_ASSERT(cif); + KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty", + __func__)); - CARP_UNLOCK(cif); - ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */ + IF_ADDR_WLOCK(ifp); + ifp->if_carp = NULL; + IF_ADDR_WUNLOCK(ifp); - return (0); + CIF_LOCK_DESTROY(cif); -cleanup: - if (!sc->sc_naddrs6) - carp_multicast6_cleanup(sc, 1); - ifa_free(&ia->ia_ifa); - return (error); + if (cif->cif_flags & CIF_PROMISC) + ifpromisc(ifp, 0); + if_rele(ifp); + + free(cif, M_CARP); } -static int -carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +static void +carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv) { - int error = 0; - - if (!--sc->sc_naddrs6) { - struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; - - CARP_LOCK(cif); - callout_stop(&sc->sc_ad_tmo); - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; - sc->sc_vhid = -1; - carp_multicast6_cleanup(sc, 1); - TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); - if (!--cif->vhif_nvrs) { - CARP_LOCK_DESTROY(cif); - sc->sc_carpdev->if_carp = NULL; - free(cif, M_CARP); - } else - CARP_UNLOCK(cif); - } - return (error); + CARP_LOCK(sc); + carpr->carpr_state = sc->sc_state; + carpr->carpr_vhid = sc->sc_vhid; + carpr->carpr_advbase = sc->sc_advbase; + carpr->carpr_advskew = sc->sc_advskew; + if (priv) + bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key)); + else + bzero(carpr->carpr_key, sizeof(carpr->carpr_key)); + CARP_UNLOCK(sc); } -#endif /* INET6 */ -static int -carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) +int +carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td) { - struct carp_softc *sc = ifp->if_softc, *vr; struct carpreq carpr; - struct ifaddr *ifa; - struct ifreq *ifr; - struct ifaliasreq *ifra; - int locked = 0, error = 0; + struct ifnet *ifp; + struct carp_softc *sc = NULL; + int error = 0, locked = 0; - ifa = (struct ifaddr *)addr; - ifra = (struct ifaliasreq *)addr; - ifr = (struct ifreq *)addr; + if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + return (error); - switch (cmd) { - case SIOCSIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case 
AF_INET: - SC2IFP(sc)->if_flags |= IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(ifa->ifa_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } - break; + ifp = ifunit_ref(ifr->ifr_name); + if (ifp == NULL) + return (ENXIO); - case SIOCAIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - SC2IFP(sc)->if_flags |= IFF_UP; - bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, - sizeof(struct sockaddr)); - error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - SC2IFP(sc)->if_flags |= IFF_UP; - error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); - break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; - break; - } + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + case IFT_BRIDGE: + case IFT_FDDI: + case IFT_ISO88025: break; + default: + error = EOPNOTSUPP; + goto out; + } - case SIOCDIFADDR: - switch (ifa->ifa_addr->sa_family) { -#ifdef INET - case AF_INET: - error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); - break; -#endif /* INET */ -#ifdef INET6 - case AF_INET6: - error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + error = EADDRNOTAVAIL; + goto out; + } + + sx_xlock(&carp_sx); + switch (cmd) { + case SIOCSVH: + if ((error = priv_check(td, PRIV_NETINET_CARP))) break; -#endif /* INET6 */ - default: - error = EAFNOSUPPORT; + if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID || + carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) { + error = EINVAL; break; } - break; - case SIOCSIFFLAGS: - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); + if (ifp->if_carp) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); } - if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { - callout_stop(&sc->sc_ad_tmo); - callout_stop(&sc->sc_md_tmo); - callout_stop(&sc->sc_md6_tmo); - if (sc->sc_state == MASTER) - carp_send_ad_locked(sc); - carp_set_state(sc, INIT); - carp_setrun(sc, 0); - } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) { - SC2IFP(sc)->if_flags |= IFF_UP; - carp_setrun(sc, 0); + if (sc == NULL) { + sc = carp_alloc(ifp); + CARP_LOCK(sc); + sc->sc_vhid = carpr.carpr_vhid; + LLADDR(&sc->sc_addr)[0] = 0; + LLADDR(&sc->sc_addr)[1] = 0; + LLADDR(&sc->sc_addr)[2] = 0x5e; + LLADDR(&sc->sc_addr)[3] = 0; + LLADDR(&sc->sc_addr)[4] = 1; + LLADDR(&sc->sc_addr)[5] = sc->sc_vhid; + } else + CARP_LOCK(sc); + locked = 1; + if (carpr.carpr_advbase > 0) { + if (carpr.carpr_advbase > 255 || + carpr.carpr_advbase < CARP_DFLTINTV) { + error = EINVAL; + break; + } + sc->sc_advbase = carpr.carpr_advbase; } - break; - - case SIOCSVH: - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error) - break; - if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + if (carpr.carpr_advskew >= 255) { + error = EINVAL; break; - error = 1; - if (sc->sc_carpdev) { - locked = 1; - CARP_SCLOCK(sc); } - if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { + sc->sc_advskew = carpr.carpr_advskew; + if (carpr.carpr_key[0] != '\0') { + bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); + carp_hmac_prepare(sc); + } + if (sc->sc_state != INIT && + carpr.carpr_state != sc->sc_state) { switch (carpr.carpr_state) { 
case BACKUP: callout_stop(&sc->sc_ad_tmo); - carp_set_state(sc, BACKUP); + carp_set_state(sc, BACKUP, + "user requested via ifconfig"); carp_setrun(sc, 0); - carp_setroute(sc, RTM_DELETE); + carp_delroute(sc); break; case MASTER: - carp_master_down_locked(sc); + carp_master_down_locked(sc, + "user requested via ifconfig"); break; default: break; } } - if (carpr.carpr_vhid > 0) { - if (carpr.carpr_vhid > 255) { - error = EINVAL; - break; - } - if (sc->sc_carpdev) { - struct carp_if *cif; - cif = (struct carp_if *)sc->sc_carpdev->if_carp; - TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) - if (vr != sc && - vr->sc_vhid == carpr.carpr_vhid) { - error = EEXIST; - break; - } - if (error == EEXIST) - break; - } - sc->sc_vhid = carpr.carpr_vhid; - IF_LLADDR(sc->sc_ifp)[0] = 0; - IF_LLADDR(sc->sc_ifp)[1] = 0; - IF_LLADDR(sc->sc_ifp)[2] = 0x5e; - IF_LLADDR(sc->sc_ifp)[3] = 0; - IF_LLADDR(sc->sc_ifp)[4] = 1; - IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid; - error--; + break; + + case SIOCGVH: + { + int priveleged; + + if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) { + error = EINVAL; + break; } - if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { - if (carpr.carpr_advskew >= 255) { - error = EINVAL; + if (carpr.carpr_count < 1) { + error = EMSGSIZE; + break; + } + if (ifp->if_carp == NULL) { + error = ENOENT; + break; + } + + priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0); + if (carpr.carpr_vhid != 0) { + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == carpr.carpr_vhid) + break; + CIF_UNLOCK(ifp->if_carp); + if (sc == NULL) { + error = ENOENT; break; } - if (carpr.carpr_advbase > 255) { - error = EINVAL; + carp_carprcp(&carpr, sc, priveleged); + error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + } else { + int i, count; + + count = 0; + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) + count++; + + if (count > carpr.carpr_count) { + CIF_UNLOCK(ifp->if_carp); + error = EMSGSIZE; break; } - sc->sc_advbase = carpr.carpr_advbase; - sc->sc_advskew = carpr.carpr_advskew; - error--; - } - bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); - if (error > 0) - error = EINVAL; - else { - error = 0; - carp_setrun(sc, 0); - } - break; - case SIOCGVH: - /* XXX: lockless read */ - bzero(&carpr, sizeof(carpr)); - carpr.carpr_state = sc->sc_state; - carpr.carpr_vhid = sc->sc_vhid; - carpr.carpr_advbase = sc->sc_advbase; - carpr.carpr_advskew = sc->sc_advskew; - error = priv_check(curthread, PRIV_NETINET_CARP); - if (error == 0) - bcopy(sc->sc_key, carpr.carpr_key, - sizeof(carpr.carpr_key)); - error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + i = 0; + IFNET_FOREACH_CARP(ifp, sc) { + carp_carprcp(&carpr, sc, priveleged); + carpr.carpr_count = count; + error = copyout(&carpr, ifr->ifr_data + + (i * sizeof(carpr)), sizeof(carpr)); + if (error) { + CIF_UNLOCK(ifp->if_carp); + break; + } + i++; + } + CIF_UNLOCK(ifp->if_carp); + } break; - + } default: error = EINVAL; } + sx_xunlock(&carp_sx); +out: if (locked) - CARP_SCUNLOCK(sc); - - carp_hmac_prepare(sc); + CARP_UNLOCK(sc); + if_rele(ifp); return (error); } -/* - * XXX: this is looutput. We should eventually use it from there. 
- */ static int -carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, - struct route *ro) +carp_get_vhid(struct ifaddr *ifa) { - u_int32_t af; - struct rtentry *rt = NULL; - M_ASSERTPKTHDR(m); /* check if we have the packet header */ - - if (ro != NULL) - rt = ro->ro_rt; - if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - m_freem(m); - return (rt->rt_flags & RTF_BLACKHOLE ? 0 : - rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); - } - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - /* BPF writes need to be handled specially. */ - if (dst->sa_family == AF_UNSPEC) { - bcopy(dst->sa_data, &af, sizeof(af)); - dst->sa_family = af; - } - -#if 1 /* XXX */ - switch (dst->sa_family) { - case AF_INET: - case AF_INET6: - case AF_IPX: - case AF_APPLETALK: - break; - default: - printf("carp_looutput: af=%d unexpected\n", dst->sa_family); - m_freem(m); - return (EAFNOSUPPORT); - } -#endif - return(if_simloop(ifp, m, dst->sa_family, 0)); -} + if (ifa == NULL || ifa->ifa_carp == NULL) + return (0); -/* - * Start output on carp interface. This function should never be called. - */ -static void -carp_start(struct ifnet *ifp) -{ -#ifdef DEBUG - printf("%s: start called\n", ifp->if_xname); -#endif + return (ifa->ifa_carp->sc_vhid); } int -carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, - struct rtentry *rt) +carp_attach(struct ifaddr *ifa, int vhid) { - struct m_tag *mtag; + struct ifnet *ifp = ifa->ifa_ifp; + struct carp_if *cif = ifp->if_carp; struct carp_softc *sc; - struct ifnet *carp_ifp; + int index, error; - if (!sa) - return (0); + KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa)); - switch (sa->sa_family) { + switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: - break; -#endif /* INET */ +#endif #ifdef INET6 case AF_INET6: +#endif break; -#endif /* INET6 */ default: - return (0); + return (EPROTOTYPE); } - mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); - if (mtag == NULL) - return (0); + sx_xlock(&carp_sx); + if (ifp->if_carp == NULL) { + sx_xunlock(&carp_sx); + return (ENOPROTOOPT); + } - bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); - sc = carp_ifp->if_softc; + CIF_LOCK(cif); + IFNET_FOREACH_CARP(ifp, sc) + if (sc->sc_vhid == vhid) + break; + CIF_UNLOCK(cif); + if (sc == NULL) { + sx_xunlock(&carp_sx); + return (ENOENT); + } - /* Set the source MAC address to Virtual Router MAC Address */ - switch (ifp->if_type) { - case IFT_ETHER: - case IFT_L2VLAN: { - struct ether_header *eh; + error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family); + if (error) { + CIF_FREE(cif); + sx_xunlock(&carp_sx); + return (error); + } - eh = mtod(m, struct ether_header *); - eh->ether_shost[0] = 0; - eh->ether_shost[1] = 0; - eh->ether_shost[2] = 0x5e; - eh->ether_shost[3] = 0; - eh->ether_shost[4] = 1; - eh->ether_shost[5] = sc->sc_vhid; - } - break; - case IFT_FDDI: { - struct fddi_header *fh; + index = sc->sc_naddrs + sc->sc_naddrs6 + 1; + if (index > sc->sc_ifasiz / sizeof(struct ifaddr *)) + carp_grow_ifas(sc); - fh = mtod(m, struct fddi_header *); - fh->fddi_shost[0] = 0; - fh->fddi_shost[1] = 0; - fh->fddi_shost[2] = 0x5e; - fh->fddi_shost[3] = 0; - fh->fddi_shost[4] = 1; - fh->fddi_shost[5] = sc->sc_vhid; - } + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + cif->cif_naddrs++; + sc->sc_naddrs++; break; - case IFT_ISO88025: { - struct iso88025_header *th; - th = mtod(m, struct iso88025_header *); - th->iso88025_shost[0] = 3; - th->iso88025_shost[1] = 0; - th->iso88025_shost[2] = 0x40 >> 
(sc->sc_vhid - 1); - th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); - th->iso88025_shost[4] = 0; - th->iso88025_shost[5] = 0; - } +#endif +#ifdef INET6 + case AF_INET6: + cif->cif_naddrs6++; + sc->sc_naddrs6++; break; - default: - printf("%s: carp is not supported for this interface type\n", - ifp->if_xname); - return (EOPNOTSUPP); +#endif } + ifa_ref(ifa); + + CARP_LOCK(sc); + sc->sc_ifas[index - 1] = ifa; + ifa->ifa_carp = sc; + carp_hmac_prepare(sc); + carp_sc_state(sc); + CARP_UNLOCK(sc); + + sx_xunlock(&carp_sx); + return (0); } -static void -carp_set_state(struct carp_softc *sc, int state) +void +carp_detach(struct ifaddr *ifa) { - int link_state; + struct ifnet *ifp = ifa->ifa_ifp; + struct carp_if *cif = ifp->if_carp; + struct carp_softc *sc = ifa->ifa_carp; + int i, index; - if (sc->sc_carpdev) - CARP_SCLOCK_ASSERT(sc); + KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa)); - if (sc->sc_state == state) - return; + sx_xlock(&carp_sx); - sc->sc_state = state; - switch (state) { - case BACKUP: - link_state = LINK_STATE_DOWN; - break; - case MASTER: - link_state = LINK_STATE_UP; + CARP_LOCK(sc); + /* Shift array. */ + index = sc->sc_naddrs + sc->sc_naddrs6; + for (i = 0; i < index; i++) + if (sc->sc_ifas[i] == ifa) + break; + KASSERT(i < index, ("%s: %p no backref", __func__, ifa)); + for (; i < index - 1; i++) + sc->sc_ifas[i] = sc->sc_ifas[i+1]; + sc->sc_ifas[index - 1] = NULL; + + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + cif->cif_naddrs--; + sc->sc_naddrs--; break; - default: - link_state = LINK_STATE_UNKNOWN; +#endif +#ifdef INET6 + case AF_INET6: + cif->cif_naddrs6--; + sc->sc_naddrs6--; break; +#endif } - if_link_state_change(SC2IFP(sc), link_state); + + carp_ifa_delroute(ifa); + carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family); + + ifa->ifa_carp = NULL; + ifa_free(ifa); + + carp_hmac_prepare(sc); + carp_sc_state(sc); + + if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) + carp_destroy(sc); + else + CARP_UNLOCK(sc); + + CIF_FREE(cif); + + sx_xunlock(&carp_sx); } -void -carp_carpdev_state(struct ifnet *ifp) +static void +carp_set_state(struct carp_softc *sc, int state, const char *reason) { - struct carp_if *cif; - cif = ifp->if_carp; - CARP_LOCK(cif); - carp_carpdev_state_locked(cif); - CARP_UNLOCK(cif); + CARP_LOCK_ASSERT(sc); + + if (sc->sc_state != state) { + const char *carp_states[] = { CARP_STATES }; + char subsys[IFNAMSIZ+5]; + + snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid, + sc->sc_carpdev->if_xname); + + CARP_LOG("%s: %s -> %s (%s)\n", subsys, + carp_states[sc->sc_state], carp_states[state], reason); + + sc->sc_state = state; + + devctl_notify("CARP", subsys, carp_states[state], NULL); + } } static void -carp_carpdev_state_locked(struct carp_if *cif) +carp_linkstate(struct ifnet *ifp) { struct carp_softc *sc; - TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) - carp_sc_state_locked(sc); + CIF_LOCK(ifp->if_carp); + IFNET_FOREACH_CARP(ifp, sc) { + CARP_LOCK(sc); + carp_sc_state(sc); + CARP_UNLOCK(sc); + } + CIF_UNLOCK(ifp->if_carp); } static void -carp_sc_state_locked(struct carp_softc *sc) +carp_sc_state(struct carp_softc *sc) { - CARP_SCLOCK_ASSERT(sc); + + CARP_LOCK_ASSERT(sc); if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || !(sc->sc_carpdev->if_flags & IFF_UP)) { - sc->sc_flags_backup = SC2IFP(sc)->if_flags; - SC2IFP(sc)->if_flags &= ~IFF_UP; - SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; callout_stop(&sc->sc_ad_tmo); +#ifdef INET callout_stop(&sc->sc_md_tmo); +#endif +#ifdef INET6 callout_stop(&sc->sc_md6_tmo); - 
carp_set_state(sc, INIT); +#endif + carp_set_state(sc, INIT, "hardware interface down"); carp_setrun(sc, 0); - if (!sc->sc_suppress) { - carp_suppress_preempt++; - if (carp_suppress_preempt == 1) { - CARP_SCUNLOCK(sc); - carp_send_ad_all(); - CARP_SCLOCK(sc); - } - } + if (!sc->sc_suppress) + carp_demote_adj(V_carp_ifdown_adj, "interface down"); sc->sc_suppress = 1; } else { - SC2IFP(sc)->if_flags |= sc->sc_flags_backup; - carp_set_state(sc, INIT); + carp_set_state(sc, INIT, "hardware interface up"); carp_setrun(sc, 0); if (sc->sc_suppress) - carp_suppress_preempt--; + carp_demote_adj(-V_carp_ifdown_adj, "interface up"); sc->sc_suppress = 0; } +} + +static void +carp_demote_adj(int adj, char *reason) +{ + atomic_add_int(&V_carp_demotion, adj); + CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason); + taskqueue_enqueue(taskqueue_swi, &carp_sendall_task); +} - return; +static int +carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS) +{ + int new, error; + + new = V_carp_demotion; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || !req->newptr) + return (error); + + carp_demote_adj(new, "sysctl"); + + return (0); } #ifdef INET @@ -2313,7 +2029,7 @@ static struct protosw in_carp_protosw = { .pr_protocol = IPPROTO_CARP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = carp_input, - .pr_output = (pr_output_t *)rip_output, + .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; @@ -2321,7 +2037,7 @@ static struct protosw in_carp_protosw = { #ifdef INET6 extern struct domain inet6domain; -static struct ip6protosw in6_carp_protosw = { +static struct protosw in6_carp_protosw = { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_CARP, @@ -2337,10 +2053,6 @@ static void carp_mod_cleanup(void) { - if (if_detach_event_tag == NULL) - return; - EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); - if_clone_detach(&carp_cloner); #ifdef INET if (proto_reg[CARP_INET] == 0) { (void)ipproto_unregister(IPPROTO_CARP); @@ -2358,10 +2070,19 @@ carp_mod_cleanup(void) carp_iamatch6_p = NULL; carp_macmatch6_p = NULL; #endif + carp_ioctl_p = NULL; + carp_attach_p = NULL; + carp_detach_p = NULL; + carp_get_vhid_p = NULL; carp_linkstate_p = NULL; carp_forus_p = NULL; carp_output_p = NULL; + carp_demote_adj_p = NULL; + carp_master_p = NULL; + mtx_unlock(&carp_mtx); + taskqueue_drain(taskqueue_swi, &carp_sendall_task); mtx_destroy(&carp_mtx); + sx_destroy(&carp_sx); } static int @@ -2369,22 +2090,24 @@ carp_mod_load(void) { int err; - if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, - carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY); - if (if_detach_event_tag == NULL) - return (ENOMEM); mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); - LIST_INIT(&carpif_list); - if_clone_attach(&carp_cloner); - carp_linkstate_p = carp_carpdev_state; + sx_init(&carp_sx, "carp_sx"); + LIST_INIT(&carp_list); + carp_get_vhid_p = carp_get_vhid; carp_forus_p = carp_forus; carp_output_p = carp_output; + carp_linkstate_p = carp_linkstate; + carp_ioctl_p = carp_ioctl; + carp_attach_p = carp_attach; + carp_detach_p = carp_detach; + carp_demote_adj_p = carp_demote_adj; + carp_master_p = carp_master; #ifdef INET6 carp_iamatch6_p = carp_iamatch6; carp_macmatch6_p = carp_macmatch6; proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, (struct protosw *)&in6_carp_protosw); - if (proto_reg[CARP_INET6] != 0) { + if (proto_reg[CARP_INET6]) { printf("carp: error %d attaching to PF_INET6\n", proto_reg[CARP_INET6]); carp_mod_cleanup(); @@ -2400,7 +2123,7 @@ 
carp_mod_load(void) #ifdef INET carp_iamatch_p = carp_iamatch; proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); - if (proto_reg[CARP_INET] != 0) { + if (proto_reg[CARP_INET]) { printf("carp: error %d attaching to PF_INET\n", proto_reg[CARP_INET]); carp_mod_cleanup(); @@ -2413,7 +2136,7 @@ carp_mod_load(void) return (err); } #endif - return 0; + return (0); } static int @@ -2424,17 +2147,13 @@ carp_modevent(module_t mod, int type, void *data) return carp_mod_load(); /* NOTREACHED */ case MOD_UNLOAD: - /* - * XXX: For now, disallow module unloading by default due to - * a race condition where a thread may dereference one of the - * function pointer hooks after the module has been - * unloaded, during processing of a packet, causing a panic. - */ -#ifdef CARPMOD_CAN_UNLOAD - carp_mod_cleanup(); -#else - return (EBUSY); -#endif + mtx_lock(&carp_mtx); + if (LIST_EMPTY(&carp_list)) + carp_mod_cleanup(); + else { + mtx_unlock(&carp_mtx); + return (EBUSY); + } break; default: diff --git a/freebsd/sys/netinet/ip_carp.h b/freebsd/sys/netinet/ip_carp.h index 2f2b4f28..5b7e5064 100644 --- a/freebsd/sys/netinet/ip_carp.h +++ b/freebsd/sys/netinet/ip_carp.h @@ -117,69 +117,57 @@ struct carpstats { uint64_t carps_preempt; /* if enabled, preemptions */ }; -#ifdef _KERNEL -#define CARPSTATS_ADD(name, val) carpstats.name += (val) -#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) -#endif - /* * Configuration structure for SIOCSVH SIOCGVH */ struct carpreq { + int carpr_count; + int carpr_vhid; +#define CARP_MAXVHID 255 int carpr_state; #define CARP_STATES "INIT", "BACKUP", "MASTER" #define CARP_MAXSTATE 2 - int carpr_vhid; int carpr_advskew; +#define CARP_MAXSKEW 240 int carpr_advbase; unsigned char carpr_key[CARP_KEY_LEN]; }; #define SIOCSVH _IOWR('i', 245, struct ifreq) #define SIOCGVH _IOWR('i', 246, struct ifreq) -/* - * Names for CARP sysctl objects - */ -#define CARPCTL_ALLOW 1 /* accept incoming CARP packets */ -#define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */ -#define CARPCTL_LOG 3 /* log bad packets */ -#define CARPCTL_STATS 4 /* statistics (read-only) */ -#define CARPCTL_ARPBALANCE 5 /* balance arp responses */ -#define CARPCTL_MAXID 6 - -#define CARPCTL_NAMES { \ - { 0, 0 }, \ - { "allow", CTLTYPE_INT }, \ - { "preempt", CTLTYPE_INT }, \ - { "log", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ - { "arpbalance", CTLTYPE_INT }, \ -} - #ifdef _KERNEL -void carp_carpdev_state(struct ifnet *); -void carp_input (struct mbuf *, int); -int carp6_input (struct mbuf **, int *, int); -int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *, - u_int8_t **); +int carp_ioctl(struct ifreq *, u_long, struct thread *); +int carp_attach(struct ifaddr *, int); +void carp_detach(struct ifaddr *); +void carp_carpdev_state(struct ifnet *); +int carp_input(struct mbuf **, int *, int); +int carp6_input (struct mbuf **, int *, int); +int carp_output (struct ifnet *, struct mbuf *, + const struct sockaddr *); +int carp_master(struct ifaddr *); +int carp_iamatch(struct ifaddr *, uint8_t **); struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *); caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *); -struct ifnet *carp_forus (struct ifnet *, u_char *); +int carp_forus(struct ifnet *, u_char *); /* These are external networking stack hooks for CARP */ /* net/if.c */ +extern int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); +extern int 
(*carp_attach_p)(struct ifaddr *, int); +extern void (*carp_detach_p)(struct ifaddr *); extern void (*carp_linkstate_p)(struct ifnet *); +extern void (*carp_demote_adj_p)(int, char *); +extern int (*carp_master_p)(struct ifaddr *); /* net/if_bridge.c net/if_ethersubr.c */ -extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *); +extern int (*carp_forus_p)(struct ifnet *, u_char *); /* net/if_ethersubr.c */ extern int (*carp_output_p)(struct ifnet *, struct mbuf *, - struct sockaddr *, struct rtentry *); + const struct sockaddr *); +/* net/rtsock.c */ +extern int (*carp_get_vhid_p)(struct ifaddr *); #ifdef INET /* netinet/if_ether.c */ -extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, - struct in_addr *, u_int8_t **); +extern int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 /* netinet6/nd6_nbr.c */ diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c index 879f411f..b43ebb7c 100644 --- a/freebsd/sys/netinet/ip_divert.c +++ b/freebsd/sys/netinet/ip_divert.c @@ -32,16 +32,15 @@ #include __FBSDID("$FreeBSD$"); -#if !defined(KLD_MODULE) #include +#include #include #ifndef INET -#error "IPDIVERT requires INET." -#endif +#error "IPDIVERT requires INET" #endif -#include #include +#include #include #include #include @@ -57,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -160,27 +160,30 @@ div_init(void) * place for hashbase == NULL. */ in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, div_inpcb_fini, UMA_ZONE_NOFREE, - IPI_HASHFIELDS_NONE); + div_inpcb_init, div_inpcb_fini, 0, IPI_HASHFIELDS_NONE); } static void -div_destroy(void) +div_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_divcbinfo); } +VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, + div_destroy, NULL); /* * IPPROTO_DIVERT is not in the real IP protocol number space; this * function should never be called. Just in case, drop any packets. */ -static void -div_input(struct mbuf *m, int off) +static int +div_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m = *mp; KMOD_IPSTAT_INC(ips_noproto); m_freem(m); + return (IPPROTO_DONE); } /* @@ -206,23 +209,19 @@ divert_packet(struct mbuf *m, int incoming) } /* Assure header */ if (m->m_len < sizeof(struct ip) && - (m = m_pullup(m, sizeof(struct ip))) == 0) + (m = m_pullup(m, sizeof(struct ip))) == NULL) return; ip = mtod(m, struct ip *); /* Delayed checksums are currently not compatible with divert. 
*/ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - ip->ip_len = ntohs(ip->ip_len); in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; - ip->ip_len = htons(ip->ip_len); } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) { - ip->ip_len = ntohs(ip->ip_len); sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; - ip->ip_len = htons(ip->ip_len); } #endif bzero(&divsrc, sizeof(divsrc)); @@ -394,10 +393,6 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, INP_RUNLOCK(inp); goto cantsend; } - - /* Convert fields to host order for ip_output() */ - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); break; #ifdef INET6 case IPV6_VERSION >> 4: @@ -410,8 +405,6 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, INP_RUNLOCK(inp); goto cantsend; } - - ip6->ip6_plen = ntohs(ip6->ip6_plen); break; } #endif @@ -611,7 +604,7 @@ div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && - (m = m_pullup(m, sizeof (struct ip))) == 0) { + (m = m_pullup(m, sizeof (struct ip))) == NULL) { KMOD_IPSTAT_INC(ips_toosmall); m_freem(m); return EINVAL; @@ -677,7 +670,7 @@ div_pcblist(SYSCTL_HANDLER_ARGS) return error; inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == NULL) return ENOMEM; INP_INFO_RLOCK(&V_divcbinfo); @@ -766,9 +759,6 @@ struct protosw div_protosw = { .pr_ctlinput = div_ctlinput, .pr_ctloutput = ip_ctloutput, .pr_init = div_init, -#ifdef VIMAGE - .pr_destroy = div_destroy, -#endif .pr_usrreqs = &div_usrreqs }; @@ -776,9 +766,6 @@ static int div_modevent(module_t mod, int type, void *unused) { int err = 0; -#ifndef VIMAGE - int n; -#endif switch (type) { case MOD_LOAD: @@ -803,10 +790,6 @@ div_modevent(module_t mod, int type, void *unused) err = EPERM; break; case MOD_UNLOAD: -#ifdef VIMAGE - err = EPERM; - break; -#else /* * Forced unload. * @@ -819,8 +802,7 @@ div_modevent(module_t mod, int type, void *unused) * we destroy the lock. */ INP_INFO_WLOCK(&V_divcbinfo); - n = V_divcbinfo.ipi_count; - if (n != 0) { + if (V_divcbinfo.ipi_count != 0) { err = EBUSY; INP_INFO_WUNLOCK(&V_divcbinfo); break; @@ -828,10 +810,11 @@ div_modevent(module_t mod, int type, void *unused) ip_divert_ptr = NULL; err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); INP_INFO_WUNLOCK(&V_divcbinfo); - div_destroy(); +#ifndef VIMAGE + div_destroy(NULL); +#endif EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); break; -#endif /* !VIMAGE */ default: err = EOPNOTSUPP; break; @@ -845,6 +828,6 @@ static moduledata_t ipdivertmod = { 0 }; -DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); -MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2); +DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); +MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); MODULE_VERSION(ipdivert, 1); diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h index dc2c3412..377b5b09 100644 --- a/freebsd/sys/netinet/ip_dummynet.h +++ b/freebsd/sys/netinet/ip_dummynet.h @@ -29,7 +29,7 @@ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H - +#define NEW_AQM /* * Definition of the kernel-userland API for dummynet. * @@ -85,7 +85,13 @@ enum { /* special commands for emulation of sysctl variables */ DN_SYSCTL_GET, DN_SYSCTL_SET, - +#ifdef NEW_AQM + /* subtypes used for setting/getting extra parameters. 
+ * these subtypes used with IP_DUMMYNET3 command (get) + * and DN_TEXT (set). */ + DN_AQM_PARAMS, /* AQM extra params */ + DN_SCH_PARAMS, /* scheduler extra params */ +#endif DN_LAST, }; @@ -104,6 +110,10 @@ enum { /* user flags */ DN_HAS_PROFILE = 0x0010, /* a link has a profile */ DN_IS_RED = 0x0020, DN_IS_GENTLE_RED= 0x0040, + DN_IS_ECN = 0x0080, + #ifdef NEW_AQM + DN_IS_AQM = 0x0100, /* AQMs: e.g Codel & PIE */ + #endif DN_PIPE_CMD = 0x1000, /* pipe config... */ }; @@ -171,8 +181,8 @@ struct dn_flow { struct ipfw_flow_id fid; uint64_t tot_pkts; /* statistics counters */ uint64_t tot_bytes; - uint32_t length; /* Queue lenght, in packets */ - uint32_t len_bytes; /* Queue lenght, in bytes */ + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ uint32_t drops; }; @@ -209,7 +219,19 @@ struct dn_profile { int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; - +#ifdef NEW_AQM +/* Extra parameters for AQM and scheduler. + * This struct is used to pass and retrieve parameters (configurations) + * to/from AQM and Scheduler. + */ +struct dn_extra_parms { + struct dn_id oid; + char name[16]; + uint32_t nr; +#define DN_MAX_EXTRA_PARM 10 + int64_t par[DN_MAX_EXTRA_PARM]; +}; +#endif /* * Overall structure of dummynet diff --git a/freebsd/sys/netinet/ip_ecn.h b/freebsd/sys/netinet/ip_ecn.h index 6a814160..c5c1c4eb 100644 --- a/freebsd/sys/netinet/ip_ecn.h +++ b/freebsd/sys/netinet/ip_ecn.h @@ -38,10 +38,6 @@ #ifndef _NETINET_IP_ECN_H_ #define _NETINET_IP_ECN_H_ -#if defined(_KERNEL) && !defined(_LKM) -#include -#endif - #define ECN_ALLOWED 1 /* ECN allowed */ #define ECN_FORBIDDEN 0 /* ECN forbidden */ #define ECN_NOCARE (-1) /* no consideration to ECN */ diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c index 14f8cd51..19ff1a09 100644 --- a/freebsd/sys/netinet/ip_encap.c +++ b/freebsd/sys/netinet/ip_encap.c @@ -67,6 +67,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include #include #include @@ -86,7 +88,6 @@ __FBSDID("$FreeBSD$"); #ifdef INET6 #include #include -#include #endif #include @@ -98,14 +99,14 @@ static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address struct static void encap_add(struct encaptab *); static int mask_match(const struct encaptab *, const struct sockaddr *, const struct sockaddr *); -static void encap_fillarg(struct mbuf *, const struct encaptab *); +static void encap_fillarg(struct mbuf *, void *); /* * All global variables in ip_encap.c are locked using encapmtx. 
*/ static struct mtx encapmtx; MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); -LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab); +static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab); /* * We currently keey encap_init() for source code compatibility reasons -- @@ -117,18 +118,20 @@ encap_init(void) } #ifdef INET -void -encap4_input(struct mbuf *m, int off) +int +encap4_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; - int proto; + struct mbuf *m; struct sockaddr_in s, d; const struct protosw *psw; struct encaptab *ep, *match; - int prio, matchprio; + void *arg; + int matchprio, off, prio; + m = *mp; + off = *offp; ip = mtod(m, struct ip *); - proto = ip->ip_p; bzero(&s, sizeof(s)); s.sin_family = AF_INET; @@ -139,6 +142,8 @@ encap4_input(struct mbuf *m, int off) d.sin_len = sizeof(struct sockaddr_in); d.sin_addr = ip->ip_dst; + arg = NULL; + psw = NULL; match = NULL; matchprio = 0; mtx_lock(&encapmtx); @@ -183,21 +188,24 @@ encap4_input(struct mbuf *m, int off) match = ep; } } + if (match != NULL) { + psw = match->psw; + arg = match->arg; + } mtx_unlock(&encapmtx); - if (match) { + if (match != NULL) { /* found a match, "match" has the best one */ - psw = match->psw; - if (psw && psw->pr_input) { - encap_fillarg(m, match); - (*psw->pr_input)(m, off); + if (psw != NULL && psw->pr_input != NULL) { + encap_fillarg(m, arg); + (*psw->pr_input)(mp, offp, proto); } else m_freem(m); - return; + return (IPPROTO_DONE); } /* last resort: inject to raw socket */ - rip_input(m, off); + return (rip_input(mp, offp, proto)); } #endif @@ -208,8 +216,9 @@ encap6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp; struct ip6_hdr *ip6; struct sockaddr_in6 s, d; - const struct ip6protosw *psw; + const struct protosw *psw; struct encaptab *ep, *match; + void *arg; int prio, matchprio; ip6 = mtod(m, struct ip6_hdr *); @@ -223,6 +232,8 @@ encap6_input(struct mbuf **mp, int *offp, int proto) d.sin6_len = sizeof(struct sockaddr_in6); d.sin6_addr = ip6->ip6_dst; + arg = NULL; + psw = NULL; match = NULL; matchprio = 0; mtx_lock(&encapmtx); @@ -250,17 +261,20 @@ encap6_input(struct mbuf **mp, int *offp, int proto) match = ep; } } + if (match != NULL) { + psw = match->psw; + arg = match->arg; + } mtx_unlock(&encapmtx); - if (match) { + if (match != NULL) { /* found a match */ - psw = (const struct ip6protosw *)match->psw; - if (psw && psw->pr_input) { - encap_fillarg(m, match); + if (psw != NULL && psw->pr_input != NULL) { + encap_fillarg(m, arg); return (*psw->pr_input)(mp, offp, proto); } else { m_freem(m); - return IPPROTO_DONE; + return (IPPROTO_DONE); } } @@ -439,14 +453,16 @@ mask_match(const struct encaptab *ep, const struct sockaddr *sp, } static void -encap_fillarg(struct mbuf *m, const struct encaptab *ep) +encap_fillarg(struct mbuf *m, void *arg) { struct m_tag *tag; - tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT); - if (tag) { - *(void**)(tag+1) = ep->arg; - m_tag_prepend(m, tag); + if (arg != NULL) { + tag = m_tag_get(PACKET_TAG_ENCAP, sizeof(void *), M_NOWAIT); + if (tag != NULL) { + *(void**)(tag+1) = arg; + m_tag_prepend(m, tag); + } } } diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h index 3b1a5aee..0b8dbd6f 100644 --- a/freebsd/sys/netinet/ip_encap.h +++ b/freebsd/sys/netinet/ip_encap.h @@ -49,7 +49,7 @@ struct encaptab { }; void encap_init(void); -void encap4_input(struct mbuf *, int); +int encap4_input(struct mbuf **, int *, int); int encap6_input(struct mbuf **, int *, int); const 
struct encaptab *encap_attach(int, int, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c index 863b9a16..19dfb1ab 100644 --- a/freebsd/sys/netinet/ip_fastfwd.c +++ b/freebsd/sys/netinet/ip_fastfwd.c @@ -78,7 +78,6 @@ #include __FBSDID("$FreeBSD$"); -#include #include #include @@ -87,6 +86,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -99,6 +99,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -108,12 +109,6 @@ __FBSDID("$FreeBSD$"); #include -static VNET_DEFINE(int, ipfastforward_active); -#define V_ipfastforward_active VNET(ipfastforward_active) - -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW, - &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding"); - static struct sockaddr_in * ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) { @@ -158,7 +153,7 @@ ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m) * to ip_input for full processing. */ struct mbuf * -ip_fastforward(struct mbuf *m) +ip_tryforward(struct mbuf *m) { struct ip *ip; struct mbuf *m0 = NULL; @@ -166,119 +161,20 @@ ip_fastforward(struct mbuf *m) struct sockaddr_in *dst = NULL; struct ifnet *ifp; struct in_addr odest, dest; - u_short sum, ip_len; + uint16_t ip_len, ip_off; int error = 0; - int hlen, mtu; + int mtu; struct m_tag *fwd_tag = NULL; /* * Are we active and forwarding packets? */ - if (!V_ipfastforward_active || !V_ipforwarding) - return m; M_ASSERTVALID(m); M_ASSERTPKTHDR(m); bzero(&ro, sizeof(ro)); - /* - * Step 1: check for packet drop conditions (and sanity checks) - */ - - /* - * Is entire packet big enough? - */ - if (m->m_pkthdr.len < sizeof(struct ip)) { - IPSTAT_INC(ips_tooshort); - goto drop; - } - - /* - * Is first mbuf large enough for ip header and is header present? - */ - if (m->m_len < sizeof (struct ip) && - (m = m_pullup(m, sizeof (struct ip))) == NULL) { - IPSTAT_INC(ips_toosmall); - return NULL; /* mbuf already free'd */ - } - - ip = mtod(m, struct ip *); - - /* - * Is it IPv4? - */ - if (ip->ip_v != IPVERSION) { - IPSTAT_INC(ips_badvers); - goto drop; - } - - /* - * Is IP header length correct and is it in first mbuf? - */ - hlen = ip->ip_hl << 2; - if (hlen < sizeof(struct ip)) { /* minimum header length */ - IPSTAT_INC(ips_badhlen); - goto drop; - } - if (hlen > m->m_len) { - if ((m = m_pullup(m, hlen)) == NULL) { - IPSTAT_INC(ips_badhlen); - return NULL; /* mbuf already free'd */ - } - ip = mtod(m, struct ip *); - } - - /* - * Checksum correct? - */ - if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) - sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); - else { - if (hlen == sizeof(struct ip)) - sum = in_cksum_hdr(ip); - else - sum = in_cksum(m, hlen); - } - if (sum) { - IPSTAT_INC(ips_badsum); - goto drop; - } - - /* - * Remember that we have checked the IP header and found it valid. - */ - m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); - - ip_len = ntohs(ip->ip_len); - - /* - * Is IP length longer than packet we have got? - */ - if (m->m_pkthdr.len < ip_len) { - IPSTAT_INC(ips_tooshort); - goto drop; - } - - /* - * Is packet longer than IP header tells us? If yes, truncate packet. - */ - if (m->m_pkthdr.len > ip_len) { - if (m->m_len == m->m_pkthdr.len) { - m->m_len = ip_len; - m->m_pkthdr.len = ip_len; - } else - m_adj(m, ip_len - m->m_pkthdr.len); - } - - /* - * Is packet from or to 127/8? 
- */ - if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || - (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { - IPSTAT_INC(ips_badaddr); - goto drop; - } #ifdef ALTQ /* @@ -288,17 +184,15 @@ ip_fastforward(struct mbuf *m) goto drop; #endif - /* - * Step 2: fallback conditions to normal ip_input path processing - */ - /* * Only IP packets without options */ + ip = mtod(m, struct ip *); + if (ip->ip_hl != (sizeof(struct ip) >> 2)) { - if (ip_doopts == 1) + if (V_ip_doopts == 1) return m; - else if (ip_doopts == 2) { + else if (V_ip_doopts == 2) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB, 0, 0); return NULL; /* mbuf already free'd */ @@ -312,7 +206,7 @@ ip_fastforward(struct mbuf *m) * * XXX: Probably some of these checks could be direct drop * conditions. However it is not clear whether there are some - * hacks or obscure behaviours which make it neccessary to + * hacks or obscure behaviours which make it necessary to * let ip_input handle it. We play safe here and let ip_input * deal with it until it is proven that we can directly drop it. */ @@ -340,12 +234,6 @@ ip_fastforward(struct mbuf *m) * Step 3: incoming packet firewall processing */ - /* - * Convert to host representation - */ - ip->ip_len = ntohs(ip->ip_len); - ip->ip_off = ntohs(ip->ip_off); - odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; /* @@ -464,8 +352,6 @@ passin: forwardlocal: /* * Return packet for processing by ip_input(). - * Keep host byte order as expected at ip_input's - * "ours"-label. */ m->m_flags |= M_FASTFWD_OURS; if (ro.ro_rt) @@ -491,29 +377,18 @@ passout: /* * Step 6: send off the packet */ + ip_len = ntohs(ip->ip_len); + ip_off = ntohs(ip->ip_off); /* * Check if route is dampned (when ARP is unable to resolve) */ if ((ro.ro_rt->rt_flags & RTF_REJECT) && - (ro.ro_rt->rt_rmx.rmx_expire == 0 || - time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) { + (ro.ro_rt->rt_expire == 0 || time_uptime < ro.ro_rt->rt_expire)) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); goto consumed; } -#ifndef ALTQ - /* - * Check if there is enough space in the interface queue - */ - if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >= - ifp->if_snd.ifq_maxlen) { - IPSTAT_INC(ips_odropped); - /* would send source quench here but that is depreciated */ - goto drop; - } -#endif - /* * Check if media link state of interface is not down */ @@ -525,28 +400,27 @@ passout: /* * Check if packet fits MTU or if hardware will fragment for us */ - if (ro.ro_rt->rt_rmx.rmx_mtu) - mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + if (ro.ro_rt->rt_mtu) + mtu = min(ro.ro_rt->rt_mtu, ifp->if_mtu); else mtu = ifp->if_mtu; - if (ip->ip_len <= mtu || - (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) { + if (ip_len <= mtu) { /* - * Restore packet header fields to original values + * Avoid confusing lower layers. 
*/ - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); + m_clrprotoflags(m); /* * Send off the packet via outgoing interface */ + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, &ro); } else { /* * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery */ - if (ip->ip_off & IP_DF) { + if (ip_off & IP_DF) { IPSTAT_INC(ips_cantfrag); icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu); @@ -556,14 +430,8 @@ passout: * We have to fragment the packet */ m->m_pkthdr.csum_flags |= CSUM_IP; - /* - * ip_fragment expects ip_len and ip_off in host byte - * order but returns all packets in network byte order - */ - if (ip_fragment(ip, &m, mtu, ifp->if_hwassist, - (~ifp->if_hwassist & CSUM_DELAY_IP))) { + if (ip_fragment(ip, &m, mtu, ifp->if_hwassist)) goto drop; - } KASSERT(m != NULL, ("null mbuf and no error")); /* * Send off the fragments via outgoing interface @@ -572,7 +440,12 @@ passout: do { m0 = m->m_nextpkt; m->m_nextpkt = NULL; + /* + * Avoid confusing lower layers. + */ + m_clrprotoflags(m); + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, (struct sockaddr *)dst, &ro); if (error) @@ -592,7 +465,7 @@ passout: if (error != 0) IPSTAT_INC(ips_odropped); else { - ro.ro_rt->rt_rmx.rmx_pksent++; + counter_u64_add(ro.ro_rt->rt_pksent, 1); IPSTAT_INC(ips_forward); IPSTAT_INC(ips_fastforward); } diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h index 14b08f5e..d274ab27 100644 --- a/freebsd/sys/netinet/ip_fw.h +++ b/freebsd/sys/netinet/ip_fw.h @@ -36,25 +36,31 @@ */ #define IPFW_DEFAULT_RULE 65535 +#define RESVD_SET 31 /*set for default and persistent rules*/ +#define IPFW_MAX_SETS 32 /* Number of sets supported by ipfw*/ + /* - * Default number of ipfw tables. + * Compat values for old clients */ +#ifndef _KERNEL #define IPFW_TABLES_MAX 65535 #define IPFW_TABLES_DEFAULT 128 +#endif /* * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit - * argument between 1 and 65534. The value 0 is unused, the value - * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the - * can be 1..65534, or 65535 to indicate the use of a 'tablearg' + * argument between 1 and 65534. The value 0 (IP_FW_TARG) is used + * to represent 'tablearg' value, e.g. indicate the use of a 'tablearg' * result of the most recent table() lookup. * Note that 16bit is only a historical limit, resulting from * the use of a 16-bit fields for that value. In reality, we can have - * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg. + * 2^32 pipes, queues, tag values and so on. */ #define IPFW_ARG_MIN 1 #define IPFW_ARG_MAX 65534 -#define IP_FW_TABLEARG 65535 /* XXX should use 0 */ +#define IP_FW_TABLEARG 65535 /* Compat value for old clients */ +#define IP_FW_TARG 0 /* Current tablearg value */ +#define IP_FW_NAT44_GLOBAL 65535 /* arg1 value for "nat global" */ /* * Number of entries in the call stack of the call/return commands. 
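As a hedged sketch of the tablearg convention described above (resolve_arg() is a hypothetical helper, not part of this header; only IP_FW_TARG is real): an opcode argument equal to IP_FW_TARG is substituted with the result of the most recent table() lookup, while any other value is used literally.

	static uint32_t
	resolve_arg(uint16_t arg1, uint32_t tablearg)
	{

		/* IP_FW_TARG (0) means "use the last table() lookup result". */
		if (arg1 == IP_FW_TARG)
			return (tablearg);
		return (arg1);
	}

The old encoding, IP_FW_TABLEARG (65535), survives only as a compat value for old clients, per the comment above.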
@@ -65,15 +71,66 @@ /* IP_FW3 header/opcodes */ typedef struct _ip_fw3_opheader { uint16_t opcode; /* Operation opcode */ - uint16_t reserved[3]; /* Align to 64-bit boundary */ + uint16_t version; /* Opcode version */ + uint16_t reserved[2]; /* Align to 64-bit boundary */ } ip_fw3_opheader; - -/* IPFW extented tables support */ +/* IP_FW3 opcodes */ #define IP_FW_TABLE_XADD 86 /* add entry */ #define IP_FW_TABLE_XDEL 87 /* delete entry */ -#define IP_FW_TABLE_XGETSIZE 88 /* get table size */ +#define IP_FW_TABLE_XGETSIZE 88 /* get table size (deprecated) */ #define IP_FW_TABLE_XLIST 89 /* list table contents */ +#define IP_FW_TABLE_XDESTROY 90 /* destroy table */ +#define IP_FW_TABLES_XLIST 92 /* list all tables */ +#define IP_FW_TABLE_XINFO 93 /* request info for one table */ +#define IP_FW_TABLE_XFLUSH 94 /* flush table data */ +#define IP_FW_TABLE_XCREATE 95 /* create new table */ +#define IP_FW_TABLE_XMODIFY 96 /* modify existing table */ +#define IP_FW_XGET 97 /* Retrieve configuration */ +#define IP_FW_XADD 98 /* add rule */ +#define IP_FW_XDEL 99 /* del rule */ +#define IP_FW_XMOVE 100 /* move rules to different set */ +#define IP_FW_XZERO 101 /* clear accounting */ +#define IP_FW_XRESETLOG 102 /* zero rules logs */ +#define IP_FW_SET_SWAP 103 /* Swap between 2 sets */ +#define IP_FW_SET_MOVE 104 /* Move one set to another one */ +#define IP_FW_SET_ENABLE 105 /* Enable/disable sets */ +#define IP_FW_TABLE_XFIND 106 /* finds an entry */ +#define IP_FW_XIFLIST 107 /* list tracked interfaces */ +#define IP_FW_TABLES_ALIST 108 /* list table algorithms */ +#define IP_FW_TABLE_XSWAP 109 /* swap two tables */ +#define IP_FW_TABLE_VLIST 110 /* dump table value hash */ + +#define IP_FW_NAT44_XCONFIG 111 /* Create/modify NAT44 instance */ +#define IP_FW_NAT44_DESTROY 112 /* Destroys NAT44 instance */ +#define IP_FW_NAT44_XGETCONFIG 113 /* Get NAT44 instance config */ +#define IP_FW_NAT44_LIST_NAT 114 /* List all NAT44 instances */ +#define IP_FW_NAT44_XGETLOG 115 /* Get log from NAT44 instance */ + +#define IP_FW_DUMP_SOPTCODES 116 /* Dump available sopts/versions */ +#define IP_FW_DUMP_SRVOBJECTS 117 /* Dump existing named objects */ + +#define IP_FW_NAT64STL_CREATE 130 /* Create stateless NAT64 instance */ +#define IP_FW_NAT64STL_DESTROY 131 /* Destroy stateless NAT64 instance */ +#define IP_FW_NAT64STL_CONFIG 132 /* Modify stateless NAT64 instance */ +#define IP_FW_NAT64STL_LIST 133 /* List stateless NAT64 instances */ +#define IP_FW_NAT64STL_STATS 134 /* Get NAT64STL instance statistics */ +#define IP_FW_NAT64STL_RESET_STATS 135 /* Reset NAT64STL instance statistics */ + +#define IP_FW_NAT64LSN_CREATE 140 /* Create stateful NAT64 instance */ +#define IP_FW_NAT64LSN_DESTROY 141 /* Destroy stateful NAT64 instance */ +#define IP_FW_NAT64LSN_CONFIG 142 /* Modify stateful NAT64 instance */ +#define IP_FW_NAT64LSN_LIST 143 /* List stateful NAT64 instances */ +#define IP_FW_NAT64LSN_STATS 144 /* Get NAT64LSN instance statistics */ +#define IP_FW_NAT64LSN_LIST_STATES 145 /* Get stateful NAT64 states */ +#define IP_FW_NAT64LSN_RESET_STATS 146 /* Reset NAT64LSN instance statistics */ + +#define IP_FW_NPTV6_CREATE 150 /* Create NPTv6 instance */ +#define IP_FW_NPTV6_DESTROY 151 /* Destroy NPTv6 instance */ +#define IP_FW_NPTV6_CONFIG 152 /* Modify NPTv6 instance */ +#define IP_FW_NPTV6_LIST 153 /* List NPTv6 instances */ +#define IP_FW_NPTV6_STATS 154 /* Get NPTv6 instance statistics */ +#define IP_FW_NPTV6_RESET_STATS 155 /* Reset NPTv6 instance statistics */ /* * The kernel representation of ipfw 
rules is made of a list of @@ -220,11 +277,14 @@ enum ipfw_opcodes { /* arguments (4 byte each) */ O_DSCP, /* 2 u32 = DSCP mask */ O_SETDSCP, /* arg1=DSCP value */ + O_IP_FLOW_LOOKUP, /* arg1=table number, u32=value */ + + O_EXTERNAL_ACTION, /* arg1=id of external action handler */ + O_EXTERNAL_INSTANCE, /* arg1=id of eaction handler instance */ O_LAST_OPCODE /* not an opcode! */ }; - /* * The extension header are filtered only for presence using a bit * vector with a flag for each header. @@ -341,6 +401,7 @@ typedef struct _ipfw_insn_if { union { struct in_addr ip; int glob; + uint16_t kidx; } p; char name[IFNAMSIZ]; } ipfw_insn_if; @@ -377,6 +438,8 @@ typedef struct _ipfw_insn_log { u_int32_t log_left; /* how many left to log */ } ipfw_insn_log; +/* Legacy NAT structures, compat only */ +#ifndef _KERNEL /* * Data structures required by both ipfw(8) and ipfw(4) but not part of the * management API are protected by IPFW_INTERNAL. @@ -438,6 +501,44 @@ struct cfg_nat { #define SOF_REDIR sizeof(struct cfg_redir) #define SOF_SPOOL sizeof(struct cfg_spool) +#endif /* ifndef _KERNEL */ + + +struct nat44_cfg_spool { + struct in_addr addr; + uint16_t port; + uint16_t spare; +}; +#define NAT44_REDIR_ADDR 0x01 +#define NAT44_REDIR_PORT 0x02 +#define NAT44_REDIR_PROTO 0x04 + +/* Nat redirect configuration. */ +struct nat44_cfg_redir { + struct in_addr laddr; /* local ip address */ + struct in_addr paddr; /* public ip address */ + struct in_addr raddr; /* remote ip address */ + uint16_t lport; /* local port */ + uint16_t pport; /* public port */ + uint16_t rport; /* remote port */ + uint16_t pport_cnt; /* number of public ports */ + uint16_t rport_cnt; /* number of remote ports */ + uint16_t mode; /* type of redirect mode */ + uint16_t spool_cnt; /* num of entry in spool chain */ + uint16_t spare; + uint32_t proto; /* protocol: tcp/udp */ +}; + +/* Nat configuration data struct. */ +struct nat44_cfg_nat { + char name[64]; /* nat name */ + char if_name[64]; /* interface name */ + uint32_t size; /* structure size incl. redirs */ + struct in_addr ip; /* nat IPv4 address */ + uint32_t mode; /* aliasing mode */ + uint32_t redir_cnt; /* number of entry in spool chain */ +}; + /* Nat command. */ typedef struct _ipfw_insn_nat { ipfw_insn o; @@ -471,15 +572,17 @@ typedef struct _ipfw_insn_icmp6 { /* * Here we have the structure representing an ipfw rule. * - * It starts with a general area (with link fields and counters) - * followed by an array of one or more instructions, which the code - * accesses as an array of 32-bit values. - * - * Given a rule pointer r: + * Layout: + * struct ip_fw_rule + * [ counter block, size = rule->cntr_len ] + * [ one or more instructions, size = rule->cmd_len * 4 ] * - * r->cmd is the start of the first instruction. - * ACTION_PTR(r) is the start of the first action (things to do - * once a rule matched). + * It starts with a general area (with link fields). + * Counter block may be next (if rule->cntr_len > 0), + * followed by an array of one or more instructions, which the code + * accesses as an array of 32-bit values. rule->cmd_len represents + * the total instruction length in u32 words, while act_ofs represents + * rule action offset in u32 words.
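A hedged sketch of consuming that layout (it assumes a valid struct ip_fw_rule pointer and the F_LEN() instruction-length macro this header provides; both cmd_len and F_LEN() count 32-bit words):

	ipfw_insn *cmd = rule->cmd;		/* first instruction */
	ipfw_insn *act = ACTION_PTR(rule);	/* == cmd + rule->act_ofs */
	int l, len;

	for (l = rule->cmd_len; l > 0; l -= len, cmd += len) {
		len = F_LEN(cmd);	/* this insn's length, in u32 words */
		if (cmd >= act)
			break;		/* match opcodes end, actions begin */
	}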
* * When assembling instruction, remember the following: * @@ -490,11 +593,41 @@ typedef struct _ipfw_insn_icmp6 { * + if a rule has an "altq" option, it comes after "log" * + if a rule has an O_TAG option, it comes after "log" and "altq" * - * NOTE: we use a simple linked list of rules because we never need - * to delete a rule without scanning the list. We do not use - * queue(3) macros for portability and readability. + * + * All structures (excluding instructions) are u64-aligned. + * Please keep this. */ +struct ip_fw_rule { + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t spare; + uint8_t set; /* rule set (0..31) */ + uint8_t flags; /* rule flags */ + uint32_t rulenum; /* rule number */ + uint32_t id; /* rule id */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; +#define IPFW_RULE_NOOPT 0x01 /* Has no options in body */ + +/* Unaligned version */ + +/* Base ipfw rule counter block. */ +struct ip_fw_bcounter { + uint16_t size; /* Size of counter block, bytes */ + uint8_t flags; /* flags for given block */ + uint8_t spare; + uint32_t timestamp; /* tv_sec of last match */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ +}; + + +#ifndef _KERNEL +/* + * Legacy rule format + */ struct ip_fw { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ @@ -503,8 +636,7 @@ struct ip_fw { uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ - uint8_t set; /* rule set (0..31) */ -#define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t set; /* rule set (0..31) */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ @@ -515,12 +647,13 @@ struct ip_fw { ipfw_insn cmd[1]; /* storage for commands */ }; +#endif #define ACTION_PTR(rule) \ (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) -#define RULESIZE(rule) (sizeof(struct ip_fw) + \ - ((struct ip_fw *)(rule))->cmd_len * 4 - 4) +#define RULESIZE(rule) (sizeof(*(rule)) + (rule)->cmd_len * 4 - 4) + #if 1 // should be moved to in.h /* @@ -572,7 +705,8 @@ struct _ipfw_dyn_rule { /* to generate keepalives) */ u_int16_t dyn_type; /* rule type */ u_int16_t count; /* refcount */ -}; + u_int16_t kidx; /* index of named object */ +} __packed __aligned(8); /* * Definitions for IP option names. @@ -598,9 +732,27 @@ struct _ipfw_dyn_rule { * These are used for lookup tables. 
*/ -#define IPFW_TABLE_CIDR 1 /* Table for holding IPv4/IPv6 prefixes */ +#define IPFW_TABLE_ADDR 1 /* Table for holding IPv4/IPv6 prefixes */ #define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ -#define IPFW_TABLE_MAXTYPE 2 /* Maximum valid number */ +#define IPFW_TABLE_NUMBER 3 /* Table for holding ports/uid/gid/etc */ +#define IPFW_TABLE_FLOW 4 /* Table for holding flow data */ +#define IPFW_TABLE_MAXTYPE 4 /* Maximum valid number */ + +#define IPFW_TABLE_CIDR IPFW_TABLE_ADDR /* compat */ + +/* Value types */ +#define IPFW_VTYPE_LEGACY 0xFFFFFFFF /* All data is filled in */ +#define IPFW_VTYPE_SKIPTO 0x00000001 /* skipto/call/callreturn */ +#define IPFW_VTYPE_PIPE 0x00000002 /* pipe/queue */ +#define IPFW_VTYPE_FIB 0x00000004 /* setfib */ +#define IPFW_VTYPE_NAT 0x00000008 /* nat */ +#define IPFW_VTYPE_DSCP 0x00000010 /* dscp */ +#define IPFW_VTYPE_TAG 0x00000020 /* tag/untag */ +#define IPFW_VTYPE_DIVERT 0x00000040 /* divert/tee */ +#define IPFW_VTYPE_NETGRAPH 0x00000080 /* netgraph/ngtee */ +#define IPFW_VTYPE_LIMIT 0x00000100 /* limit */ +#define IPFW_VTYPE_NH4 0x00000200 /* IPv4 nexthop */ +#define IPFW_VTYPE_NH6 0x00000400 /* IPv6 nexthop */ typedef struct _ipfw_table_entry { in_addr_t addr; /* network address */ @@ -614,6 +766,7 @@ typedef struct _ipfw_table_xentry { uint8_t type; /* entry type */ uint8_t masklen; /* mask length */ uint16_t tbl; /* table number */ + uint16_t flags; /* record flags */ uint32_t value; /* value */ union { /* Longest field needs to be aligned by 4-byte boundary */ @@ -621,6 +774,7 @@ typedef struct _ipfw_table_xentry { char iface[IF_NAMESIZE]; /* interface name */ } k; } ipfw_table_xentry; +#define IPFW_TCF_INET 0x01 /* CIDR flags: IPv4 record */ typedef struct _ipfw_table { u_int32_t size; /* size of entries in bytes */ @@ -630,7 +784,7 @@ typedef struct _ipfw_table { } ipfw_table; typedef struct _ipfw_xtable { - ip_fw3_opheader opheader; /* eXtended tables are controlled via IP_FW3 */ + ip_fw3_opheader opheader; /* IP_FW3 opcode */ uint32_t size; /* size of entries in bytes */ uint32_t cnt; /* # of entries */ uint16_t tbl; /* table number */ @@ -638,4 +792,259 @@ typedef struct _ipfw_xtable { ipfw_table_xentry xent[0]; /* entries */ } ipfw_xtable; +typedef struct _ipfw_obj_tlv { + uint16_t type; /* TLV type */ + uint16_t flags; /* TLV-specific flags */ + uint32_t length; /* Total length, aligned to u64 */ +} ipfw_obj_tlv; +#define IPFW_TLV_TBL_NAME 1 +#define IPFW_TLV_TBLNAME_LIST 2 +#define IPFW_TLV_RULE_LIST 3 +#define IPFW_TLV_DYNSTATE_LIST 4 +#define IPFW_TLV_TBL_ENT 5 +#define IPFW_TLV_DYN_ENT 6 +#define IPFW_TLV_RULE_ENT 7 +#define IPFW_TLV_TBLENT_LIST 8 +#define IPFW_TLV_RANGE 9 +#define IPFW_TLV_EACTION 10 +#define IPFW_TLV_COUNTERS 11 +#define IPFW_TLV_OBJDATA 12 +#define IPFW_TLV_STATE_NAME 14 + +#define IPFW_TLV_EACTION_BASE 1000 +#define IPFW_TLV_EACTION_NAME(arg) (IPFW_TLV_EACTION_BASE + (arg)) + +typedef struct _ipfw_obj_data { + ipfw_obj_tlv head; + void *data[0]; +} ipfw_obj_data; + +/* Object name TLV */ +typedef struct _ipfw_obj_ntlv { + ipfw_obj_tlv head; /* TLV header */ + uint16_t idx; /* Name index */ + uint8_t set; /* set, if applicable */ + uint8_t type; /* object type, if applicable */ + uint32_t spare; /* unused */ + char name[64]; /* Null-terminated name */ +} ipfw_obj_ntlv; + +/* IPv4/IPv6 L4 flow description */ +struct tflow_entry { + uint8_t af; + uint8_t proto; + uint16_t spare; + uint16_t sport; + uint16_t dport; + union { + struct { + struct in_addr sip; + struct in_addr dip; + } a4; + struct { + 
struct in6_addr sip6; + struct in6_addr dip6; + } a6; + } a; +}; + +typedef struct _ipfw_table_value { + uint32_t tag; /* O_TAG/O_TAGGED */ + uint32_t pipe; /* O_PIPE/O_QUEUE */ + uint16_t divert; /* O_DIVERT/O_TEE */ + uint16_t skipto; /* skipto, CALLRET */ + uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ + uint32_t fib; /* O_SETFIB */ + uint32_t nat; /* O_NAT */ + uint32_t nh4; + uint8_t dscp; + uint8_t spare0; + uint16_t spare1; + struct in6_addr nh6; + uint32_t limit; /* O_LIMIT */ + uint32_t zoneid; /* scope zone id for nh6 */ + uint64_t reserved; +} ipfw_table_value; + +/* Table entry TLV */ +typedef struct _ipfw_obj_tentry { + ipfw_obj_tlv head; /* TLV header */ + uint8_t subtype; /* subtype (IPv4,IPv6) */ + uint8_t masklen; /* mask length */ + uint8_t result; /* request result */ + uint8_t spare0; + uint16_t idx; /* Table name index */ + uint16_t spare1; + union { + /* Longest field needs to be aligned by 8-byte boundary */ + struct in_addr addr; /* IPv4 address */ + uint32_t key; /* uid/gid/port */ + struct in6_addr addr6; /* IPv6 address */ + char iface[IF_NAMESIZE]; /* interface name */ + struct tflow_entry flow; + } k; + union { + ipfw_table_value value; /* value data */ + uint32_t kidx; /* value kernel index */ + } v; +} ipfw_obj_tentry; +#define IPFW_TF_UPDATE 0x01 /* Update record if exists */ +/* Container TLV */ +#define IPFW_CTF_ATOMIC 0x01 /* Perform atomic operation */ +/* Operation results */ +#define IPFW_TR_IGNORED 0 /* Entry was ignored (rollback) */ +#define IPFW_TR_ADDED 1 /* Entry was successfully added */ +#define IPFW_TR_UPDATED 2 /* Entry was successfully updated*/ +#define IPFW_TR_DELETED 3 /* Entry was successfully deleted*/ +#define IPFW_TR_LIMIT 4 /* Entry was ignored (limit) */ +#define IPFW_TR_NOTFOUND 5 /* Entry was not found */ +#define IPFW_TR_EXISTS 6 /* Entry already exists */ +#define IPFW_TR_ERROR 7 /* Request has failed (unknown) */ + +typedef struct _ipfw_obj_dyntlv { + ipfw_obj_tlv head; + ipfw_dyn_rule state; +} ipfw_obj_dyntlv; +#define IPFW_DF_LAST 0x01 /* Last state in chain */ + +/* Container TLVs */ +typedef struct _ipfw_obj_ctlv { + ipfw_obj_tlv head; /* TLV header */ + uint32_t count; /* Number of sub-TLVs */ + uint16_t objsize; /* Single object size */ + uint8_t version; /* TLV version */ + uint8_t flags; /* TLV-specific flags */ +} ipfw_obj_ctlv; + +/* Range TLV */ +typedef struct _ipfw_range_tlv { + ipfw_obj_tlv head; /* TLV header */ + uint32_t flags; /* Range flags */ + uint16_t start_rule; /* Range start */ + uint16_t end_rule; /* Range end */ + uint32_t set; /* Range set to match */ + uint32_t new_set; /* New set to move/swap to */ +} ipfw_range_tlv; +#define IPFW_RCFLAG_RANGE 0x01 /* rule range is set */ +#define IPFW_RCFLAG_ALL 0x02 /* match ALL rules */ +#define IPFW_RCFLAG_SET 0x04 /* match rules in given set */ +/* User-settable flags */ +#define IPFW_RCFLAG_USER (IPFW_RCFLAG_RANGE | IPFW_RCFLAG_ALL | \ + IPFW_RCFLAG_SET) +/* Internally used flags */ +#define IPFW_RCFLAG_DEFAULT 0x0100 /* Do not skip default rule */ + +typedef struct _ipfw_ta_tinfo { + uint32_t flags; /* Format flags */ + uint32_t spare; + uint8_t taclass4; /* algorithm class */ + uint8_t spare4; + uint16_t itemsize4; /* item size in runtime */ + uint32_t size4; /* runtime structure size */ + uint32_t count4; /* number of items in runtime */ + uint8_t taclass6; /* algorithm class */ + uint8_t spare6; + uint16_t itemsize6; /* item size in runtime */ + uint32_t size6; /* runtime structure size */ + uint32_t count6; /* number of items in runtime */ +}
ipfw_ta_tinfo; +#define IPFW_TACLASS_HASH 1 /* algo is based on hash */ +#define IPFW_TACLASS_ARRAY 2 /* algo is based on array */ +#define IPFW_TACLASS_RADIX 3 /* algo is based on radix tree */ + +#define IPFW_TATFLAGS_DATA 0x0001 /* Has data filled in */ +#define IPFW_TATFLAGS_AFDATA 0x0002 /* Separate data per AF */ +#define IPFW_TATFLAGS_AFITEM 0x0004 /* diff. items per AF */ + +typedef struct _ipfw_xtable_info { + uint8_t type; /* table type (addr,iface,..) */ + uint8_t tflags; /* type flags */ + uint16_t mflags; /* modification flags */ + uint16_t flags; /* generic table flags */ + uint16_t spare[3]; + uint32_t vmask; /* bitmask with value types */ + uint32_t set; /* set table is in */ + uint32_t kidx; /* kernel index */ + uint32_t refcnt; /* number of references */ + uint32_t count; /* Number of records */ + uint32_t size; /* Total size of records(export)*/ + uint32_t limit; /* Max number of records */ + char tablename[64]; /* table name */ + char algoname[64]; /* algorithm name */ + ipfw_ta_tinfo ta_info; /* additional algo stats */ +} ipfw_xtable_info; +/* Generic table flags */ +#define IPFW_TGFLAGS_LOCKED 0x01 /* Table is locked from changes*/ +/* Table type-specific flags */ +#define IPFW_TFFLAG_SRCIP 0x01 +#define IPFW_TFFLAG_DSTIP 0x02 +#define IPFW_TFFLAG_SRCPORT 0x04 +#define IPFW_TFFLAG_DSTPORT 0x08 +#define IPFW_TFFLAG_PROTO 0x10 +/* Table modification flags */ +#define IPFW_TMFLAGS_LIMIT 0x0002 /* Change limit value */ +#define IPFW_TMFLAGS_LOCK 0x0004 /* Change table lock state */ + +typedef struct _ipfw_iface_info { + char ifname[64]; /* interface name */ + uint32_t ifindex; /* interface index */ + uint32_t flags; /* flags */ + uint32_t refcnt; /* number of references */ + uint32_t gencnt; /* number of changes */ + uint64_t spare; +} ipfw_iface_info; +#define IPFW_IFFLAG_RESOLVED 0x01 /* Interface exists */ + +typedef struct _ipfw_ta_info { + char algoname[64]; /* algorithm name */ + uint32_t type; /* lookup type */ + uint32_t flags; + uint32_t refcnt; + uint32_t spare0; + uint64_t spare1; +} ipfw_ta_info; + +typedef struct _ipfw_obj_header { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t spare; + uint16_t idx; /* object name index */ + uint8_t objtype; /* object type */ + uint8_t objsubtype; /* object subtype */ + ipfw_obj_ntlv ntlv; /* object name tlv */ +} ipfw_obj_header; + +typedef struct _ipfw_obj_lheader { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t set_mask; /* disabled set mask */ + uint32_t count; /* Total objects count */ + uint32_t size; /* Total size (incl.
header) */ + uint32_t objsize; /* Size of one object */ +} ipfw_obj_lheader; + +#define IPFW_CFG_GET_STATIC 0x01 +#define IPFW_CFG_GET_STATES 0x02 +#define IPFW_CFG_GET_COUNTERS 0x04 +typedef struct _ipfw_cfg_lheader { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + uint32_t set_mask; /* enabled set mask */ + uint32_t spare; + uint32_t flags; /* Request flags */ + uint32_t size; /* needed buffer size */ + uint32_t start_rule; + uint32_t end_rule; +} ipfw_cfg_lheader; + +typedef struct _ipfw_range_header { + ip_fw3_opheader opheader; /* IP_FW3 opcode */ + ipfw_range_tlv range; +} ipfw_range_header; + +typedef struct _ipfw_sopt_info { + uint16_t opcode; + uint8_t version; + uint8_t dir; + uint8_t spare; + uint64_t refcnt; +} ipfw_sopt_info; + #endif /* _IPFW2_H */ diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c index 9289be96..36d3ed69 100644 --- a/freebsd/sys/netinet/ip_gre.c +++ b/freebsd/sys/netinet/ip_gre.c @@ -1,9 +1,8 @@ #include -/* $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ - /*- * Copyright (c) 1998 The NetBSD Foundation, Inc. + * Copyright (c) 2014 Andrey V. Elsukov * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -31,19 +30,14 @@ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * deencapsulate tunneled packets and send them on - * output half is in net/if_gre.[ch] - * This currently handles IPPROTO_GRE, IPPROTO_MOBILE + * + * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ #include __FBSDID("$FreeBSD$"); #include -#include #include #include @@ -55,285 +49,121 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include -#include +#include +#include +#include #include #include -#include -#include -#include +#include +#include -#ifdef INET #include #include -#include #include +#include #include -#include -#include -#else -#error ip_gre input without IP? -#endif -#ifdef NETATALK -#include -#include -#include +#ifdef INET6 +#include #endif -/* Needs IP headers. */ #include -#include - -#if 1 -void gre_inet_ntoa(struct in_addr in); /* XXX */ -#endif - -static struct gre_softc *gre_lookup(struct mbuf *, u_int8_t); - -static struct mbuf *gre_input2(struct mbuf *, int, u_char); - -/* - * De-encapsulate a packet and feed it back through ip input (this - * routine is called whenever IP gets a packet with proto type - * IPPROTO_GRE and a local destination address).
- * This really is simple - */ -void -gre_input(struct mbuf *m, int off) +extern struct domain inetdomain; +static const struct protosw in_gre_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_GRE, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = gre_input, + .pr_output = rip_output, + .pr_ctlinput = rip_ctlinput, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; + +#define GRE_TTL 30 +VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL; +#define V_ip_gre_ttl VNET(ip_gre_ttl) +SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(ip_gre_ttl), 0, ""); + +static int +in_gre_encapcheck(const struct mbuf *m, int off, int proto, void *arg) { - int proto; - - proto = (mtod(m, struct ip *))->ip_p; + GRE_RLOCK_TRACKER; + struct gre_softc *sc; + struct ip *ip; - m = gre_input2(m, off, proto); + sc = (struct gre_softc *)arg; + if ((GRE2IFP(sc)->if_flags & IFF_UP) == 0) + return (0); + M_ASSERTPKTHDR(m); /* - * If no matching tunnel that is up is found. We inject - * the mbuf to raw ip socket to see if anyone picks it up. + * We expect that payload contains at least IPv4 + * or IPv6 packet. */ - if (m != NULL) - rip_input(m, off); -} - -/* - * Decapsulate. Does the real work and is called from gre_input() - * (above). Returns an mbuf back if packet is not yet processed, - * and NULL if it needs no further processing. proto is the protocol - * number of the "calling" foo_input() routine. - */ -static struct mbuf * -gre_input2(struct mbuf *m ,int hlen, u_char proto) -{ - struct greip *gip; - int isr; - struct gre_softc *sc; - u_int16_t flags; - u_int32_t af; - - if ((sc = gre_lookup(m, proto)) == NULL) { - /* No matching tunnel or tunnel is down. */ - return (m); - } - - if (m->m_len < sizeof(*gip)) { - m = m_pullup(m, sizeof(*gip)); - if (m == NULL) - return (NULL); - } - gip = mtod(m, struct greip *); - - GRE2IFP(sc)->if_ipackets++; - GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len; - - switch (proto) { - case IPPROTO_GRE: - hlen += sizeof(struct gre_h); - - /* process GRE flags as packet can be of variable len */ - flags = ntohs(gip->gi_flags); - - /* Checksum & Offset are present */ - if ((flags & GRE_CP) | (flags & GRE_RP)) - hlen += 4; - /* We don't support routing fields (variable length) */ - if (flags & GRE_RP) - return (m); - if (flags & GRE_KP) - hlen += 4; - if (flags & GRE_SP) - hlen += 4; - - switch (ntohs(gip->gi_ptype)) { /* ethertypes */ - case WCCP_PROTOCOL_TYPE: - if (sc->wccp_ver == WCCP_V2) - hlen += 4; - /* FALLTHROUGH */ - case ETHERTYPE_IP: /* shouldn't need a schednetisr(), */ - isr = NETISR_IP;/* as we are in ip_input */ - af = AF_INET; - break; -#ifdef INET6 - case ETHERTYPE_IPV6: - isr = NETISR_IPV6; - af = AF_INET6; - break; -#endif -#ifdef NETATALK - case ETHERTYPE_ATALK: - isr = NETISR_ATALK1; - af = AF_APPLETALK; - break; -#endif - default: - /* Others not yet supported. */ - return (m); - } - break; - default: - /* Others not yet supported. 
*/ - return (m); - } - - if (hlen > m->m_pkthdr.len) { - m_freem(m); - return (NULL); - } - /* Unlike NetBSD, in FreeBSD m_adj() adjusts m->m_pkthdr.len as well */ - m_adj(m, hlen); - - if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) { - bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m); - } + if (m->m_pkthdr.len < sizeof(struct greip) + sizeof(struct ip)) + return (0); - if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) { - m_freem(m); - return(NULL); - } - - m->m_pkthdr.rcvif = GRE2IFP(sc); + GRE_RLOCK(sc); + if (sc->gre_family == 0) + goto bad; - netisr_queue(isr, m); + KASSERT(sc->gre_family == AF_INET, + ("wrong gre_family: %d", sc->gre_family)); - /* Packet is done, no further processing needed. */ - return (NULL); + ip = mtod(m, struct ip *); + if (sc->gre_oip.ip_src.s_addr != ip->ip_dst.s_addr || + sc->gre_oip.ip_dst.s_addr != ip->ip_src.s_addr) + goto bad; + + GRE_RUNLOCK(sc); + return (32 * 2); +bad: + GRE_RUNLOCK(sc); + return (0); } -/* - * input routine for IPPRPOTO_MOBILE - * This is a little bit diffrent from the other modes, as the - * encapsulating header was not prepended, but instead inserted - * between IP header and payload - */ - -void -gre_mobile_input(struct mbuf *m, int hlen) +int +in_gre_output(struct mbuf *m, int af, int hlen) { - struct ip *ip; - struct mobip_h *mip; - struct gre_softc *sc; - int msiz; - - if ((sc = gre_lookup(m, IPPROTO_MOBILE)) == NULL) { - /* No matching tunnel or tunnel is down. */ - m_freem(m); - return; - } - - if (m->m_len < sizeof(*mip)) { - m = m_pullup(m, sizeof(*mip)); - if (m == NULL) - return; - } - ip = mtod(m, struct ip *); - mip = mtod(m, struct mobip_h *); - - GRE2IFP(sc)->if_ipackets++; - GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len; - - if (ntohs(mip->mh.proto) & MOB_H_SBIT) { - msiz = MOB_H_SIZ_L; - mip->mi.ip_src.s_addr = mip->mh.osrc; - } else - msiz = MOB_H_SIZ_S; - - if (m->m_len < (ip->ip_hl << 2) + msiz) { - m = m_pullup(m, (ip->ip_hl << 2) + msiz); - if (m == NULL) - return; - ip = mtod(m, struct ip *); - mip = mtod(m, struct mobip_h *); - } - - mip->mi.ip_dst.s_addr = mip->mh.odst; - mip->mi.ip_p = (ntohs(mip->mh.proto) >> 8); - - if (gre_in_cksum((u_int16_t *)&mip->mh, msiz) != 0) { - m_freem(m); - return; - } - - bcopy((caddr_t)(ip) + (ip->ip_hl << 2) + msiz, (caddr_t)(ip) + - (ip->ip_hl << 2), m->m_len - msiz - (ip->ip_hl << 2)); - m->m_len -= msiz; - m->m_pkthdr.len -= msiz; - - /* - * On FreeBSD, rip_input() supplies us with ip->ip_len - * already converted into host byteorder and also decreases - * it by the lengh of IP header, however, ip_input() expects - * that this field is in the original format (network byteorder - * and full size of IP packet), so that adjust accordingly. - */ - ip->ip_len = htons(ip->ip_len + sizeof(struct ip) - msiz); - - ip->ip_sum = 0; - ip->ip_sum = in_cksum(m, (ip->ip_hl << 2)); - - if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) { - u_int32_t af = AF_INET; - bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m); - } - - if ((GRE2IFP(sc)->if_flags & IFF_MONITOR) != 0) { - m_freem(m); - return; + struct greip *gi; + + gi = mtod(m, struct greip *); + switch (af) { + case AF_INET: + /* + * gre_transmit() has used M_PREPEND() that doesn't guarantee + * m_data is contiguous more than hlen bytes. Use m_copydata() + * here to avoid m_pullup(). 
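A hedged aside on the contract relied on here: m_copydata(m, off, len, buf) copies len bytes starting at logical offset off and walks the m_next chain internally, so the bytes being read need not be contiguous in the first mbuf, whereas mtod() only ever exposes the leading mbuf's data.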
+ */ + m_copydata(m, hlen + offsetof(struct ip, ip_tos), + sizeof(u_char), &gi->gi_ip.ip_tos); + m_copydata(m, hlen + offsetof(struct ip, ip_id), + sizeof(u_short), (caddr_t)&gi->gi_ip.ip_id); + break; +#ifdef INET6 + case AF_INET6: + gi->gi_ip.ip_tos = 0; /* XXX */ + ip_fillid(&gi->gi_ip); + break; +#endif } - - m->m_pkthdr.rcvif = GRE2IFP(sc); - - netisr_queue(NETISR_IP, m); + gi->gi_ip.ip_ttl = V_ip_gre_ttl; + gi->gi_ip.ip_len = htons(m->m_pkthdr.len); + return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL)); } -/* - * Find the gre interface associated with our src/dst/proto set. - * - * XXXRW: Need some sort of drain/refcount mechanism so that the softc - * reference remains valid after it's returned from gre_lookup(). Right - * now, I'm thinking it should be reference-counted with a gre_dropref() - * when the caller is done with the softc. This is complicated by how - * to handle destroying the gre softc; probably using a gre_drain() in - * in_gre.c during destroy. - */ -static struct gre_softc * -gre_lookup(struct mbuf *m, u_int8_t proto) +int +in_gre_attach(struct gre_softc *sc) { - struct ip *ip = mtod(m, struct ip *); - struct gre_softc *sc; - - mtx_lock(&gre_mtx); - for (sc = LIST_FIRST(&gre_softc_list); sc != NULL; - sc = LIST_NEXT(sc, sc_list)) { - if ((sc->g_dst.s_addr == ip->ip_src.s_addr) && - (sc->g_src.s_addr == ip->ip_dst.s_addr) && - (sc->g_proto == proto) && - ((GRE2IFP(sc)->if_flags & IFF_UP) != 0)) { - mtx_unlock(&gre_mtx); - return (sc); - } - } - mtx_unlock(&gre_mtx); - return (NULL); + KASSERT(sc->gre_ecookie == NULL, ("gre_ecookie isn't NULL")); + sc->gre_ecookie = encap_attach_func(AF_INET, IPPROTO_GRE, + in_gre_encapcheck, &in_gre_protosw, sc); + if (sc->gre_ecookie == NULL) + return (EEXIST); + return (0); } diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c index cd581948..f34cc4bd 100644 --- a/freebsd/sys/netinet/ip_icmp.c +++ b/freebsd/sys/netinet/ip_icmp.c @@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$"); #include -#include #include #include @@ -44,15 +43,19 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -60,16 +63,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #ifdef INET -#ifdef IPSEC -#include -#include -#endif #include @@ -83,68 +83,79 @@ __FBSDID("$FreeBSD$"); */ static VNET_DEFINE(int, icmplim) = 200; #define V_icmplim VNET(icmplim) -SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim), 0, "Maximum number of ICMP responses per second"); static VNET_DEFINE(int, icmplim_output) = 1; #define V_icmplim_output VNET(icmplim_output) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmplim_output), 0, - "Enable rate limiting of ICMP responses"); + "Enable logging of ICMP response rate limiting"); #ifdef INET -VNET_DEFINE(struct icmpstat, icmpstat); -SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(icmpstat), icmpstat, ""); +VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat); +VNET_PCPUSTAT_SYSINIT(icmpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat, + icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)"); + +#ifdef VIMAGE 
+VNET_PCPUSTAT_SYSUNINIT(icmpstat); +#endif /* VIMAGE */ static VNET_DEFINE(int, icmpmaskrepl) = 0; #define V_icmpmaskrepl VNET(icmpmaskrepl) -SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskrepl), 0, - "Reply to ICMP Address Mask Request packets."); + "Reply to ICMP Address Mask Request packets"); static VNET_DEFINE(u_int, icmpmaskfake) = 0; #define V_icmpmaskfake VNET(icmpmaskfake) -SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW, +SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpmaskfake), 0, - "Fake reply to ICMP Address Mask Request packets."); + "Fake reply to ICMP Address Mask Request packets"); VNET_DEFINE(int, drop_redirect) = 0; +#define V_drop_redirect VNET(drop_redirect) +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(drop_redirect), 0, + "Ignore ICMP redirects"); static VNET_DEFINE(int, log_redirect) = 0; #define V_log_redirect VNET(log_redirect) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(log_redirect), 0, "Log ICMP redirects to the console"); static VNET_DEFINE(char, reply_src[IFNAMSIZ]); #define V_reply_src VNET(reply_src) -SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW, +SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(reply_src), IFNAMSIZ, - "icmp reply source for non-local packets."); + "ICMP reply source for non-local packets"); static VNET_DEFINE(int, icmp_rfi) = 0; #define V_icmp_rfi VNET(icmp_rfi) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_rfi), 0, "ICMP reply from incoming interface for non-local packets"); static VNET_DEFINE(int, icmp_quotelen) = 8; #define V_icmp_quotelen VNET(icmp_quotelen) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_quotelen), 0, "Number of bytes from original packet to quote in ICMP reply"); -/* - * ICMP broadcast echo sysctl - */ static VNET_DEFINE(int, icmpbmcastecho) = 0; #define V_icmpbmcastecho VNET(icmpbmcastecho) -SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmpbmcastecho), 0, - ""); + "Reply to multicast ICMP Echo Request and Timestamp packets"); +static VNET_DEFINE(int, icmptstamprepl) = 1; +#define V_icmptstamprepl VNET(icmptstamprepl) +SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_RW, + &VNET_NAME(icmptstamprepl), 0, + "Respond to ICMP Timestamp packets"); #ifdef ICMPPRINTFS int icmpprintfs = 0; @@ -155,39 +166,6 @@ static void icmp_send(struct mbuf *, struct mbuf *); extern struct protosw inetsw[]; -static int -sysctl_net_icmp_drop_redir(SYSCTL_HANDLER_ARGS) -{ - int error, new; - int i; - struct radix_node_head *rnh; - - new = V_drop_redirect; - error = sysctl_handle_int(oidp, &new, 0, req); - if (error == 0 && req->newptr) { - new = (new != 0) ? 
1 : 0; - - if (new == V_drop_redirect) - return (0); - - for (i = 0; i < rt_numfibs; i++) { - if ((rnh = rt_tables_get_rnh(i, AF_INET)) == NULL) - continue; - RADIX_NODE_HEAD_LOCK(rnh); - in_setmatchfunc(rnh, new); - RADIX_NODE_HEAD_UNLOCK(rnh); - } - - V_drop_redirect = new; - } - - return (error); -} - -SYSCTL_VNET_PROC(_net_inet_icmp, OID_AUTO, drop_redirect, - CTLTYPE_INT|CTLFLAG_RW, 0, 0, - sysctl_net_icmp_drop_redir, "I", "Ignore ICMP redirects"); - /* * Kernel module interface for updating icmpstat. The argument is an index * into icmpstat treated as an array of u_long. While this encodes the @@ -199,7 +177,7 @@ void kmod_icmpstat_inc(int statnum) { - (*((u_long *)&V_icmpstat + statnum))++; + counter_u64_add(VNET(icmpstat)[statnum], 1); } /* @@ -231,7 +209,7 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) */ if (n->m_flags & M_DECRYPTED) goto freeit; - if (oip->ip_off & ~(IP_MF|IP_DF)) + if (oip->ip_off & htons(~(IP_MF|IP_DF))) goto freeit; if (n->m_flags & (M_BCAST|M_MCAST)) goto freeit; @@ -247,7 +225,7 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) /* * Calculate length to quote from original packet and * prevent the ICMP mbuf from overflowing. - * Unfortunatly this is non-trivial since ip_forward() + * Unfortunately this is non-trivial since ip_forward() * sends us truncated packets. */ nlen = m_length(n, NULL); @@ -265,25 +243,54 @@ icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu) tcphlen = th->th_off << 2; if (tcphlen < sizeof(struct tcphdr)) goto freeit; - if (oip->ip_len < oiphlen + tcphlen) + if (ntohs(oip->ip_len) < oiphlen + tcphlen) goto freeit; if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) goto stdreply; if (n->m_len < oiphlen + tcphlen && ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) goto freeit; - icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen)); + icmpelen = max(tcphlen, min(V_icmp_quotelen, + ntohs(oip->ip_len) - oiphlen)); + } else if (oip->ip_p == IPPROTO_SCTP) { + struct sctphdr *sh; + struct sctp_chunkhdr *ch; + + if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr)) + goto stdreply; + if (oiphlen + sizeof(struct sctphdr) > n->m_len && + n->m_next == NULL) + goto stdreply; + if (n->m_len < oiphlen + sizeof(struct sctphdr) && + (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL) + goto freeit; + icmpelen = max(sizeof(struct sctphdr), + min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); + sh = (struct sctphdr *)((caddr_t)oip + oiphlen); + if (ntohl(sh->v_tag) == 0 && + ntohs(oip->ip_len) >= oiphlen + sizeof(struct sctphdr) + 8 && + (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 || + n->m_next != NULL)) { + if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 && + (n = m_pullup(n, oiphlen + sizeof(struct sctphdr) + 8)) == NULL) + goto freeit; + ch = (struct sctp_chunkhdr *)(sh + 1); + if (ch->chunk_type == SCTP_INITIATION) { + icmpelen = max(sizeof(struct sctphdr) + 8, + min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); + } + } } else -stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen)); +stdreply: icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen)); icmplen = min(oiphlen + icmpelen, nlen); if (icmplen < sizeof(struct ip)) goto freeit; if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); else - m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto freeit; #ifdef MAC @@ -324,8 
+331,6 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen)); */ m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); nip = &icp->icmp_ip; - nip->ip_len = htons(nip->ip_len); - nip->ip_off = htons(nip->ip_off); /* * Set up ICMP message mbuf and copy old IP header (without options @@ -340,7 +345,7 @@ stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen)); m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; nip = mtod(m, struct ip *); bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); - nip->ip_len = m->m_len; + nip->ip_len = htons(m->m_len); nip->ip_v = IPVERSION; nip->ip_hl = 5; nip->ip_p = IPPROTO_ICMP; @@ -355,19 +360,22 @@ freeit: /* * Process a received ICMP message. */ -void -icmp_input(struct mbuf *m, int off) +int +icmp_input(struct mbuf **mp, int *offp, int proto) { struct icmp *icp; struct in_ifaddr *ia; + struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct sockaddr_in icmpsrc, icmpdst, icmpgw; - int hlen = off; - int icmplen = ip->ip_len; + int hlen = *offp; + int icmplen = ntohs(ip->ip_len) - *offp; int i, code; void (*ctlfunc)(int, struct sockaddr *, void *); int fibnum; + *mp = NULL; + /* * Locate icmp structure in mbuf, and check * that not corrupted and of at least minimum length. @@ -387,7 +395,7 @@ icmp_input(struct mbuf *m, int off) i = hlen + min(icmplen, ICMP_ADVLENMIN); if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { ICMPSTAT_INC(icps_tooshort); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); m->m_len -= hlen; @@ -400,19 +408,6 @@ icmp_input(struct mbuf *m, int off) m->m_len += hlen; m->m_data -= hlen; - if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { - /* - * Deliver very specific ICMP type only. - */ - switch (icp->icmp_type) { - case ICMP_UNREACH: - case ICMP_TIMXCEED: - break; - default: - goto freeit; - } - } - #ifdef ICMPPRINTFS if (icmpprintfs) printf("icmp_input, type %d code %d\n", icp->icmp_type, @@ -489,12 +484,6 @@ icmp_input(struct mbuf *m, int off) if (code > 1) goto badcode; code = PRC_PARAMPROB; - goto deliver; - - case ICMP_SOURCEQUENCH: - if (code) - goto badcode; - code = PRC_QUENCH; deliver: /* * Problem with datagram; advise higher level routines. @@ -504,7 +493,6 @@ icmp_input(struct mbuf *m, int off) ICMPSTAT_INC(icps_badlen); goto freeit; } - icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); /* Discard ICMP's in response to multicast packets */ if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) goto badcode; @@ -517,6 +505,23 @@ icmp_input(struct mbuf *m, int off) * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. */ + i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp)); + ip_stripoptions(m); + if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { + /* This should actually not happen */ + ICMPSTAT_INC(icps_tooshort); + return (IPPROTO_DONE); + } + ip = mtod(m, struct ip *); + icp = (struct icmp *)(ip + 1); + /* + * The upper layer handler can rely on: + * - The outer IP header has no options. + * - The outer IP header, the ICMP header, the inner IP header, + * and the first n bytes of the inner payload are contiguous. + * n is at least 8, but might be larger based on + * ICMP_ADVLENPREF. See its definition in ip_icmp.h. 
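As a worked example (hedged, assuming a minimal inner header, ip_hl == 5): ICMP_ADVLENPREF(icp) evaluates to 8 + 20 + 8 + 8 + 12 = 56, so the m_pullup() above makes at most sizeof(struct ip) + 56 = 76 bytes contiguous.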
+ */ ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; if (ctlfunc) (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, @@ -540,6 +545,8 @@ icmp_input(struct mbuf *m, int off) goto reflect; case ICMP_TSTAMP: + if (V_icmptstamprepl == 0) + break; if (!V_icmpbmcastecho && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { ICMPSTAT_INC(icps_bmcasttstamp); @@ -597,11 +604,10 @@ icmp_input(struct mbuf *m, int off) } ifa_free(&ia->ia_ifa); reflect: - ip->ip_len += hlen; /* since ip_input deducts this */ ICMPSTAT_INC(icps_reflect); ICMPSTAT_INC(icps_outhist[icp->icmp_type]); icmp_reflect(m); - return; + return (IPPROTO_DONE); case ICMP_REDIRECT: if (V_log_redirect) { @@ -658,9 +664,6 @@ reflect: (struct sockaddr *)&icmpgw, fibnum); } pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); -#ifdef IPSEC - key_sa_routechange((struct sockaddr *)&icmpsrc); -#endif break; /* @@ -673,16 +676,19 @@ reflect: case ICMP_TSTAMPREPLY: case ICMP_IREQREPLY: case ICMP_MASKREPLY: + case ICMP_SOURCEQUENCH: default: break; } raw: - rip_input(m, off); - return; + *mp = m; + rip_input(mp, offp, proto); + return (IPPROTO_DONE); freeit: m_freem(m); + return (IPPROTO_DONE); } /* @@ -691,12 +697,14 @@ freeit: static void icmp_reflect(struct mbuf *m) { + struct rm_priotracker in_ifa_tracker; struct ip *ip = mtod(m, struct ip *); struct ifaddr *ifa; struct ifnet *ifp; struct in_ifaddr *ia; struct in_addr t; - struct mbuf *opts = 0; + struct nhop4_extended nh_ext; + struct mbuf *opts = NULL; int optlen = (ip->ip_hl << 2) - sizeof(struct ip); if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || @@ -707,8 +715,6 @@ icmp_reflect(struct mbuf *m) goto done; /* Ip_output() will check for broadcast */ } - m_addr_changed(m); - t = ip->ip_dst; ip->ip_dst = ip->ip_src; @@ -718,15 +724,15 @@ icmp_reflect(struct mbuf *m) * If the incoming packet was addressed directly to one of our * own addresses, use dst as the src for the reply. */ - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { t = IA_SIN(ia)->sin_addr; - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); goto match; } } - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* * If the incoming packet was addressed to one of our broadcast @@ -791,14 +797,12 @@ icmp_reflect(struct mbuf *m) * When we don't have a route back to the packet source, stop here * and drop the packet. */ - ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); - if (ia == NULL) { + if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh_ext) != 0) { m_freem(m); ICMPSTAT_INC(icps_noroute); goto done; } - t = IA_SIN(ia)->sin_addr; - ifa_free(&ia->ia_ifa); + t = nh_ext.nh_src; match: #ifdef MAC mac_netinet_icmp_replyinplace(m); @@ -816,8 +820,8 @@ match: * add on any record-route or timestamp options. */ cp = (u_char *) (ip + 1); - if ((opts = ip_srcroute(m)) == 0 && - (opts = m_gethdr(M_DONTWAIT, MT_DATA))) { + if ((opts = ip_srcroute(m)) == NULL && + (opts = m_gethdr(M_NOWAIT, MT_DATA))) { opts->m_len = sizeof(struct in_addr); mtod(opts, struct in_addr *)->s_addr = 0; } @@ -865,19 +869,7 @@ match: printf("%d\n", opts->m_len); #endif } - /* - * Now strip out original options by copying rest of first - * mbuf's data back, and adjust the IP length. 
- */ - ip->ip_len -= optlen; - ip->ip_v = IPVERSION; - ip->ip_hl = 5; - m->m_len -= optlen; - if (m->m_flags & M_PKTHDR) - m->m_pkthdr.len -= optlen; - optlen += sizeof(struct ip); - bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), - (unsigned)(m->m_len - sizeof(struct ip))); + ip_stripoptions(m); } m_tag_delete_nonpersistent(m); m->m_flags &= ~(M_BCAST|M_MCAST); @@ -903,7 +895,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts) m->m_len -= hlen; icp = mtod(m, struct icmp *); icp->icmp_cksum = 0; - icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - hlen); m->m_data -= hlen; m->m_len += hlen; m->m_pkthdr.rcvif = (struct ifnet *)0; @@ -919,7 +911,7 @@ icmp_send(struct mbuf *m, struct mbuf *opts) } /* - * Return milliseconds since 00:00 GMT in network format. + * Return milliseconds since 00:00 UTC in network format. */ uint32_t iptime(void) diff --git a/freebsd/sys/netinet/ip_icmp.h b/freebsd/sys/netinet/ip_icmp.h index 9cabdb58..64db0064 100644 --- a/freebsd/sys/netinet/ip_icmp.h +++ b/freebsd/sys/netinet/ip_icmp.h @@ -99,7 +99,7 @@ struct icmp { struct id_ts { /* ICMP Timestamp */ /* * The next 3 fields are in network format, - * milliseconds since 00:00 GMT + * milliseconds since 00:00 UTC */ uint32_t its_otime; /* Originate */ uint32_t its_rtime; /* Receive */ @@ -136,6 +136,14 @@ struct icmp { #define ICMP_ADVLENMIN (8 + sizeof (struct ip) + 8) /* min */ #define ICMP_ADVLEN(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8) /* N.B.: must separately check that ip_hl >= 5 */ + /* This is the minimum length required by RFC 792. */ +/* + * ICMP_ADVLENPREF is the preferred number of bytes which should be contiguous. + * SCTP needs additional 12 bytes to be able to access the initiate tag + * in packets containing an INIT chunk. For also supporting SCTP/UDP, + * additional 8 bytes are needed. + */ +#define ICMP_ADVLENPREF(p) (8 + ((p)->icmp_ip.ip_hl << 2) + 8 + 8 + 12) /* * Definition of type and code field values. @@ -207,7 +215,7 @@ struct icmp { #ifdef _KERNEL void icmp_error(struct mbuf *, int, int, uint32_t, int); -void icmp_input(struct mbuf *, int); +int icmp_input(struct mbuf **, int *, int); int ip_next_mtu(int, int); #endif diff --git a/freebsd/sys/netinet/ip_id.c b/freebsd/sys/netinet/ip_id.c index a76c7b78..17352cfb 100644 --- a/freebsd/sys/netinet/ip_id.c +++ b/freebsd/sys/netinet/ip_id.c @@ -76,119 +76,149 @@ __FBSDID("$FreeBSD$"); * enabled. */ -#include -#include #include -#include +#include +#include #include -#include +#include #include #include #include -#include +#include #include +#include + +#include + #include +#include #include -#include +/* + * By default we generate IP ID only for non-atomic datagrams, as + * suggested by RFC6864. We use per-CPU counter for that, or if + * user wants to, we can turn on random ID generation. + */ +static VNET_DEFINE(int, ip_rfc6864) = 1; +static VNET_DEFINE(int, ip_do_randomid) = 0; +#define V_ip_rfc6864 VNET(ip_rfc6864) +#define V_ip_do_randomid VNET(ip_do_randomid) + +/* + * Random ID state engine. 
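In brief, a hedged summary of the code below: id_array is a FIFO of the last array_size IDs handed out, and id_bits is a 64K-entry bitmap marking IDs still inside that window; a freshly drawn random ID is rejected while it remains in the window, which bounds how soon any ID can recur.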
+ */ static MALLOC_DEFINE(M_IPID, "ipid", "randomized ip id state"); +static VNET_DEFINE(uint16_t *, id_array); +static VNET_DEFINE(bitstr_t *, id_bits); +static VNET_DEFINE(int, array_ptr); +static VNET_DEFINE(int, array_size); +static VNET_DEFINE(int, random_id_collisions); +static VNET_DEFINE(int, random_id_total); +static VNET_DEFINE(struct mtx, ip_id_mtx); +#define V_id_array VNET(id_array) +#define V_id_bits VNET(id_bits) +#define V_array_ptr VNET(array_ptr) +#define V_array_size VNET(array_size) +#define V_random_id_collisions VNET(random_id_collisions) +#define V_random_id_total VNET(random_id_total) +#define V_ip_id_mtx VNET(ip_id_mtx) -static u_int16_t *id_array = NULL; -static bitstr_t *id_bits = NULL; -static int array_ptr = 0; -static int array_size = 8192; -static int random_id_collisions = 0; -static int random_id_total = 0; -static struct mtx ip_id_mtx; +/* + * Non-random ID state engine is simply a per-cpu counter. + */ +static VNET_DEFINE(counter_u64_t, ip_id); +#define V_ip_id VNET(ip_id) -static void ip_initid(void); +static int sysctl_ip_randomid(SYSCTL_HANDLER_ARGS); static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS); - -MTX_SYSINIT(ip_id_mtx, &ip_id_mtx, "ip_id_mtx", MTX_DEF); +static void ip_initid(int); +static uint16_t ip_randomid(void); +static void ipid_sysinit(void); +static void ipid_sysuninit(void); SYSCTL_DECL(_net_inet_ip); -SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period, CTLTYPE_INT|CTLFLAG_RW, - &array_size, 0, sysctl_ip_id_change, "IU", "IP ID Array size"); -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions, CTLFLAG_RD, - &random_id_collisions, 0, "Count of IP ID collisions"); -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD, - &random_id_total, 0, "Count of IP IDs created"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id, + CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(ip_do_randomid), 0, sysctl_ip_randomid, "IU", + "Assign random ip_id values"); +SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(ip_rfc6864), 0, + "Use constant IP ID for atomic datagrams"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, + &VNET_NAME(array_size), 0, sysctl_ip_id_change, "IU", "IP ID Array size"); +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions, + CTLFLAG_RD | CTLFLAG_VNET, + &VNET_NAME(random_id_collisions), 0, "Count of IP ID collisions"); +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD | CTLFLAG_VNET, + &VNET_NAME(random_id_total), 0, "Count of IP IDs created"); + +static int +sysctl_ip_randomid(SYSCTL_HANDLER_ARGS) +{ + int error, new; + + new = V_ip_do_randomid; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || req->newptr == NULL) + return (error); + if (new != 0 && new != 1) + return (EINVAL); + if (new == V_ip_do_randomid) + return (0); + if (new == 1 && V_ip_do_randomid == 0) + ip_initid(8192); + /* We don't free memory when turning random ID off, due to race. */ + V_ip_do_randomid = new; + return (0); +} static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS) { int error, new; - new = array_size; + new = V_array_size; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { - if (new >= 512 && new <= 32768) { - mtx_lock(&ip_id_mtx); - array_size = new; - ip_initid(); - mtx_unlock(&ip_id_mtx); - } else + if (new >= 512 && new <= 32768) + ip_initid(new); + else error = EINVAL; } return (error); } -/* - * ip_initid() runs with a mutex held and may execute in a network context. 
- * As a result, it uses M_NOWAIT. Ideally, we would always do this - * allocation from the sysctl contact and have it be an invariant that if - * this random ID allocation mode is selected, the buffers are present. This - * would also avoid potential network context failures of IP ID generation. - */ static void -ip_initid(void) +ip_initid(int new_size) { + uint16_t *new_array; + bitstr_t *new_bits; - mtx_assert(&ip_id_mtx, MA_OWNED); + new_array = malloc(new_size * sizeof(uint16_t), M_IPID, + M_WAITOK | M_ZERO); + new_bits = malloc(bitstr_size(65536), M_IPID, M_WAITOK | M_ZERO); - if (id_array != NULL) { - free(id_array, M_IPID); - free(id_bits, M_IPID); - } - random_id_collisions = 0; - random_id_total = 0; - array_ptr = 0; - id_array = (u_int16_t *) malloc(array_size * sizeof(u_int16_t), - M_IPID, M_NOWAIT | M_ZERO); - id_bits = (bitstr_t *) malloc(bitstr_size(65536), M_IPID, - M_NOWAIT | M_ZERO); - if (id_array == NULL || id_bits == NULL) { - /* Neither or both. */ - if (id_array != NULL) { - free(id_array, M_IPID); - id_array = NULL; - } - if (id_bits != NULL) { - free(id_bits, M_IPID); - id_bits = NULL; - } + mtx_lock(&V_ip_id_mtx); + if (V_id_array != NULL) { + free(V_id_array, M_IPID); + free(V_id_bits, M_IPID); } + V_id_array = new_array; + V_id_bits = new_bits; + V_array_size = new_size; + V_array_ptr = 0; + V_random_id_collisions = 0; + V_random_id_total = 0; + mtx_unlock(&V_ip_id_mtx); } -u_int16_t +static uint16_t ip_randomid(void) { - u_int16_t new_id; - - mtx_lock(&ip_id_mtx); - if (id_array == NULL) - ip_initid(); - - /* - * Fail gracefully; return a fixed id if memory allocation failed; - * ideally we wouldn't do allocation in this context in order to - * avoid the possibility of this failure mode. - */ - if (id_array == NULL) { - mtx_unlock(&ip_id_mtx); - return (1); - } + uint16_t new_id; + mtx_lock(&V_ip_id_mtx); /* * To avoid a conflict with the zeros that the array is initially * filled with, we never hand out an id of zero. @@ -196,16 +226,76 @@ ip_randomid(void) new_id = 0; do { if (new_id != 0) - random_id_collisions++; + V_random_id_collisions++; arc4rand(&new_id, sizeof(new_id), 0); - } while (bit_test(id_bits, new_id) || new_id == 0); - bit_clear(id_bits, id_array[array_ptr]); - bit_set(id_bits, new_id); - id_array[array_ptr] = new_id; - array_ptr++; - if (array_ptr == array_size) - array_ptr = 0; - random_id_total++; - mtx_unlock(&ip_id_mtx); + } while (bit_test(V_id_bits, new_id) || new_id == 0); + bit_clear(V_id_bits, V_id_array[V_array_ptr]); + bit_set(V_id_bits, new_id); + V_id_array[V_array_ptr] = new_id; + V_array_ptr++; + if (V_array_ptr == V_array_size) + V_array_ptr = 0; + V_random_id_total++; + mtx_unlock(&V_ip_id_mtx); return (new_id); } + +void +ip_fillid(struct ip *ip) +{ + + /* + * Per RFC6864 Section 4 + * + * o Atomic datagrams: (DF==1) && (MF==0) && (frag_offset==0) + * o Non-atomic datagrams: (DF==0) || (MF==1) || (frag_offset>0) + */ + if (V_ip_rfc6864 && (ip->ip_off & htons(IP_DF)) == htons(IP_DF)) + ip->ip_id = 0; + else if (V_ip_do_randomid) + ip->ip_id = ip_randomid(); + else { + counter_u64_add(V_ip_id, 1); + /* + * There are two issues about this trick, to be kept in mind. + * 1) We can migrate between counter_u64_add() and next + * line, and grab counter from other CPU, resulting in too + * quick ID reuse. This is tolerable in our particular case, + * since probability of such event is much lower than reuse + * of ID due to legitimate overflow, that at modern Internet + * speeds happens all the time.
+ * 2) We are relying on the fact that counter(9) is based on + * the UMA_ZONE_PCPU uma(9) zone. We also take only the last + * sixteen bits of the counter, so we don't care that machines + * with a 32-bit word do not update their counters atomically. + */ + ip->ip_id = htons((*(uint64_t *)zpcpu_get(V_ip_id)) & 0xffff); + } +} + +static void +ipid_sysinit(void) +{ + int i; + + mtx_init(&V_ip_id_mtx, "ip_id_mtx", NULL, MTX_DEF); + V_ip_id = counter_u64_alloc(M_WAITOK); + + CPU_FOREACH(i) + arc4rand(zpcpu_get_cpu(V_ip_id, i), sizeof(uint64_t), 0); +} +VNET_SYSINIT(ip_id, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipid_sysinit, NULL); + +static void +ipid_sysuninit(void) +{ + + if (V_id_array != NULL) { + free(V_id_array, M_IPID); + free(V_id_bits, M_IPID); + } + counter_u64_free(V_ip_id); + mtx_destroy(&V_ip_id_mtx); +} +VNET_SYSUNINIT(ip_id, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ipid_sysuninit, NULL);
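The new ip_id machinery above has two generation modes: the sliding-window allocator in ip_randomid(), which guarantees an ID is not handed out again until at least random_id_period other IDs have been issued, and the RFC 6864 shortcut in ip_fillid(). What follows is an editor's userspace sketch of that logic, not patch content: random() stands in for arc4rand(), the window is fixed at the default 8192, and the mutex, VNET indirection, and per-CPU counter are simplified away (a plain global counter here).

#include <arpa/inet.h>
#include <stdint.h>
#include <stdlib.h>

#define SKETCH_IP_DF	0x4000		/* IP_DF, host order */
#define SKETCH_WINDOW	8192		/* default random_id_period */

static uint8_t	sk_bits[65536 / 8];	/* IDs still inside the window */
static uint16_t	sk_log[SKETCH_WINDOW];	/* circular log of recent IDs */
static int	sk_ptr;
static uint64_t	sk_counter;		/* stand-in for the pcpu counter */

static uint16_t
sketch_randomid(void)
{
	uint16_t id;

	/* Draw until we get a non-zero ID that is not in the window. */
	do
		id = (uint16_t)random();	/* arc4rand() in the kernel */
	while (id == 0 || (sk_bits[id >> 3] & (1 << (id & 7))));

	/* Evict the oldest ID from the bitmap and log the new one. */
	sk_bits[sk_log[sk_ptr] >> 3] &= ~(1 << (sk_log[sk_ptr] & 7));
	sk_bits[id >> 3] |= 1 << (id & 7);
	sk_log[sk_ptr] = id;
	sk_ptr = (sk_ptr + 1) % SKETCH_WINDOW;
	return (id);
}

uint16_t
sketch_fillid(uint16_t ip_off, int rfc6864, int do_randomid)
{
	/* RFC 6864: atomic datagrams (DF set) may carry a constant ID. */
	if (rfc6864 && (ip_off & htons(SKETCH_IP_DF)) == htons(SKETCH_IP_DF))
		return (0);
	if (do_randomid)
		return (sketch_randomid());	/* already random, no swap */
	return (htons((uint16_t)(++sk_counter & 0xffff)));
}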
diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c index 24002aac..425dbc1f 100644 --- a/freebsd/sys/netinet/ip_input.c +++ b/freebsd/sys/netinet/ip_input.c @@ -35,13 +35,14 @@ __FBSDID("$FreeBSD$"); #include -#include #include #include #include +#include #include #include +#include #include #include #include @@ -50,7 +51,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #include @@ -61,10 +64,11 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include -#include #include +#include #include #include #include @@ -77,7 +81,10 @@ __FBSDID("$FreeBSD$"); #include #ifdef IPSEC #include +#include +#include #endif /* IPSEC */ +#include #include @@ -87,39 +94,30 @@ __FBSDID("$FreeBSD$"); CTASSERT(sizeof(struct ip) == 20); #endif -struct rwlock in_ifaddr_lock; -RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); +/* IP reassembly functions are defined in ip_reass.c. */ +extern void ipreass_init(void); +extern void ipreass_drain(void); +extern void ipreass_slowtimo(void); +#ifdef VIMAGE +extern void ipreass_destroy(void); +#endif + +struct rmlock in_ifaddr_lock; +RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); -static VNET_DEFINE(int, ip_keepfaith); -#define V_ip_keepfaith VNET(ip_keepfaith) -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, - &VNET_NAME(ip_keepfaith), 0, - "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); - -static VNET_DEFINE(int, ip_sendsourcequench); -#define V_ip_sendsourcequench VNET(ip_sendsourcequench) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, - &VNET_NAME(ip_sendsourcequench), 0, - "Enable the transmission of source quench packets"); - -VNET_DEFINE(int, ip_do_randomid); -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, - &VNET_NAME(ip_do_randomid), 0, - "Assign random ip_id values"); - /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -135,7 +133,7 @@ SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, */ static VNET_DEFINE(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); @@ -145,8 +143,32 @@ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, +#ifdef RSS + .nh_m2cpuid = rss_soft_m2cpuid_v4, + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_HYBRID, +#else .nh_policy = NETISR_POLICY_FLOW, +#endif +}; + +#ifdef RSS +/* + * Directly dispatched frames are currently assumed + * to have a flowid already calculated. + * + * It should likely have something that asserts that it + * actually has valid flow details.
+ */ +static struct netisr_handler ip_direct_nh = { + .nh_name = "ip_direct", + .nh_handler = ip_direct_input, + .nh_proto = NETISR_IP_DIRECT, + .nh_m2cpuid = rss_soft_m2cpuid_v4, + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_HYBRID, }; +#endif extern struct domain inetdomain; extern struct protosw inetsw[]; @@ -155,41 +177,6 @@ VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ -VNET_DEFINE(struct ipstat, ipstat); -SYSCTL_VNET_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(ipstat), ipstat, - "IP statistics (struct ipstat, netinet/ip_var.h)"); - -static VNET_DEFINE(uma_zone_t, ipq_zone); -static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]); -static struct mtx ipqlock; - -#define V_ipq_zone VNET(ipq_zone) -#define V_ipq VNET(ipq) - -#define IPQ_LOCK() mtx_lock(&ipqlock) -#define IPQ_UNLOCK() mtx_unlock(&ipqlock) -#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) -#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) - -static void maxnipq_update(void); -static void ipq_zone_change(void *); -static void ip_drain_locked(void); - -static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */ -static VNET_DEFINE(int, nipq); /* Total # of reass queues */ -#define V_maxnipq VNET(maxnipq) -#define V_nipq VNET(nipq) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, - &VNET_NAME(nipq), 0, - "Current number of IPv4 fragment reassembly queue entries"); - -static VNET_DEFINE(int, maxfragsperpacket); -#define V_maxfragsperpacket VNET(maxfragsperpacket) -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, - &VNET_NAME(maxfragsperpacket), 0, - "Maximum number of IPv4 fragments allowed per packet"); - #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); @@ -197,42 +184,39 @@ SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif -#ifdef FLOWTABLE -static VNET_DEFINE(int, ip_output_flowtable_size) = 2048; -VNET_DEFINE(struct flowtable *, ip_ft); -#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size) - -SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, - &VNET_NAME(ip_output_flowtable_size), 2048, - "number of entries in the per-cpu output flow caches"); -#endif +/* + * IP statistics are stored in the "array" of counter(9)s. + */ +VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); +VNET_PCPUSTAT_SYSINIT(ipstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, + "IP statistics (struct ipstat, netinet/ip_var.h)"); -static void ip_freef(struct ipqhead *, struct ipq *); +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(ipstat); +#endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index - * into ipstat treated as an array of u_long. While this encodes the general - * layout of ipstat into the caller, it doesn't encode its location, so that - * future changes to add, for example, per-CPU stats support won't cause - * binary compatibility problems for kernel modules. + * into ipstat treated as an array. 
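Context for the counter(9) conversion visible above, and for the kmod_ipstat_*() bodies that follow: struct ipstat is now an array of per-CPU 64-bit counters, and a statistic is addressed purely by its index. A hedged userspace analogue of that indexing scheme; the fixed four-"CPU" array stands in for UMA_ZONE_PCPU storage, and the current-CPU lookup is faked:

#include <stdint.h>

#define SK_NCPU		4	/* stand-in for mp_ncpus */
#define SK_NSTATS	64	/* enough slots for the example */

/* One row per statistic, one 64-bit slot per CPU. */
static uint64_t sk_stats[SK_NSTATS][SK_NCPU];

static int sk_curcpu(void) { return (0); }	/* kernel: curcpu */

/* counter_u64_add() analogue: touch only the current CPU's slot. */
void
sk_stat_add(int statnum, int64_t val)
{
	sk_stats[statnum][sk_curcpu()] += (uint64_t)val;
}

/* Reads must sum across CPUs, as counter_u64_fetch() does. */
uint64_t
sk_stat_fetch(int statnum)
{
	uint64_t sum = 0;
	int cpu;

	for (cpu = 0; cpu < SK_NCPU; cpu++)
		sum += sk_stats[statnum][cpu];
	return (sum);
}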
*/ void kmod_ipstat_inc(int statnum) { - (*((u_long *)&V_ipstat + statnum))++; + counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { - (*((u_long *)&V_ipstat + statnum))--; + counter_u64_add(VNET(ipstat)[statnum], -1); } static int @@ -273,6 +257,46 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); +#ifdef RSS +static int +sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) +{ + int error, qlimit; + + netisr_getqlimit(&ip_direct_nh, &qlimit); + error = sysctl_handle_int(oidp, &qlimit, 0, req); + if (error || !req->newptr) + return (error); + if (qlimit < 1) + return (EINVAL); + return (netisr_setqlimit(&ip_direct_nh, qlimit)); +} +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", + "Maximum size of the IP direct input queue"); + +static int +sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) +{ + u_int64_t qdrops_long; + int error, qdrops; + + netisr_getqdrops(&ip_direct_nh, &qdrops_long); + qdrops = qdrops_long; + error = sysctl_handle_int(oidp, &qdrops, 0, req); + if (error || !req->newptr) + return (error); + if (qdrops != 0) + return (EINVAL); + netisr_clearqdrops(&ip_direct_nh); + return (0); +} + +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, + CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", + "Number of packets dropped from the IP direct input queue"); +#endif /* RSS */ + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -283,19 +307,11 @@ ip_init(void) struct protosw *pr; int i; - V_ip_id = time_second & 0xffff; - TAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ - for (i = 0; i < IPREASS_NHASH; i++) - TAILQ_INIT(&V_ipq[i]); - V_maxnipq = nmbclusters / 32; - V_maxfragsperpacket = 16; - V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, - NULL, UMA_ALIGN_PTR, 0); - maxnipq_update(); + ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; @@ -304,27 +320,27 @@ ip_init(void) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); -#ifdef FLOWTABLE - if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size", - &V_ip_output_flowtable_size)) { - if (V_ip_output_flowtable_size < 256) - V_ip_output_flowtable_size = 256; - if (!powerof2(V_ip_output_flowtable_size)) { - printf("flowtable must be power of 2 size\n"); - V_ip_output_flowtable_size = 2048; - } - } else { - /* - * round up to the next power of 2 - */ - V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1); - } - V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU); -#endif + if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, + &V_ipsec_hhh_in[HHOOK_IPSEC_INET], + HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register input helper hook\n", + __func__); + if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, + &V_ipsec_hhh_out[HHOOK_IPSEC_INET], + HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) + printf("%s: WARNING: unable to register output helper hook\n", + __func__); /* Skip initialization of globals for non-default instances. 
*/ - if (!IS_DEFAULT_VNET(curvnet)) +#ifdef VIMAGE + if (!IS_DEFAULT_VNET(curvnet)) { + netisr_register_vnet(&ip_nh); +#ifdef RSS + netisr_register_vnet(&ip_direct_nh); +#endif return; + } +#endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) @@ -346,27 +362,79 @@ ip_init(void) ip_protox[pr->pr_protocol] = pr - inetsw; } - EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, - NULL, EVENTHANDLER_PRI_ANY); - - /* Initialize various other remaining things. */ - IPQ_LOCK_INIT(); netisr_register(&ip_nh); +#ifdef RSS + netisr_register(&ip_direct_nh); +#endif } #ifdef VIMAGE -void -ip_destroy(void) +static void +ip_destroy(void *unused __unused) { + struct ifnet *ifp; + int error; + +#ifdef RSS + netisr_unregister_vnet(&ip_direct_nh); +#endif + netisr_unregister_vnet(&ip_nh); + + if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) + printf("%s: WARNING: unable to unregister pfil hook, " + "error %d\n", __func__, error); + + error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); + if (error != 0) { + printf("%s: WARNING: unable to deregister input helper hook " + "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " + "error %d returned\n", __func__, error); + } + error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); + if (error != 0) { + printf("%s: WARNING: unable to deregister output helper hook " + "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " + "error %d returned\n", __func__, error); + } + + /* Remove the IPv4 addresses from all interfaces. */ + in_ifscrub_all(); + + /* Make sure the IPv4 routes are gone as well. */ + IFNET_RLOCK(); + TAILQ_FOREACH(ifp, &V_ifnet, if_link) + rt_flushifroutes_af(ifp, AF_INET); + IFNET_RUNLOCK(); + + /* Destroy IP reassembly queue. */ + ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); +} - IPQ_LOCK(); - ip_drain_locked(); - IPQ_UNLOCK(); +VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); +#endif - uma_zdestroy(V_ipq_zone); +#ifdef RSS +/* + * IP direct input routine. + * + * This is called when reinjecting completed fragments where + * all of the previous checking and book-keeping has been done. + */ +void +ip_direct_input(struct mbuf *m) +{ + struct ip *ip; + int hlen; + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + + IPSTAT_INC(ips_delivered); + (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); + return; } #endif @@ -382,21 +450,18 @@ ip_input(struct mbuf *m) struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; - u_short sum; + uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { - /* - * Firewall or NAT changed destination to local. - * We expect ip_len and ip_off to be in host byte order. - */ m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; + ip_len = ntohs(ip->ip_len); goto ours; } @@ -430,6 +495,8 @@ ip_input(struct mbuf *m) ip = mtod(m, struct ip *); } + IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); + /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || @@ -460,15 +527,11 @@ ip_input(struct mbuf *m) return; #endif - /* - * Convert fields to host representation. 
- */ - ip->ip_len = ntohs(ip->ip_len); - if (ip->ip_len < hlen) { + ip_len = ntohs(ip->ip_len); + if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } - ip->ip_off = ntohs(ip->ip_off); /* * Check that the amount of data in the buffers @@ -476,24 +539,35 @@ ip_input(struct mbuf *m) * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ - if (m->m_pkthdr.len < ip->ip_len) { + if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } - if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { - m->m_len = ip->ip_len; - m->m_pkthdr.len = ip->ip_len; + m->m_len = ip_len; + m->m_pkthdr.len = ip_len; } else - m_adj(m, ip->ip_len - m->m_pkthdr.len); + m_adj(m, ip_len - m->m_pkthdr.len); } + + /* Try to forward the packet, but if we fail continue */ #ifdef IPSEC + /* For now we do not handle IPSEC in tryforward. */ + if (!key_havesp(IPSEC_DIR_INBOUND) && !key_havesp(IPSEC_DIR_OUTBOUND) && + (V_ipforwarding == 1)) + if (ip_tryforward(m) == NULL) + return; /* * Bypass packet filtering for packets previously handled by IPsec. */ if (ip_ipsec_filtertunnel(m)) goto passin; +#else + if (V_ipforwarding == 1) + if (ip_tryforward(m) == NULL) + return; #endif /* IPSEC */ /* @@ -523,8 +597,7 @@ tooshort: goto ours; } if (m->m_flags & M_IP_NEXTHOP) { - dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL); - if (dchg != 0) { + if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us @@ -535,6 +608,7 @@ tooshort: } } passin: + /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an @@ -597,7 +671,9 @@ passin: */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { - ifa_ref(&ia->ia_ifa); + counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); + counter_u64_add(ia->ia_ifa.ifa_ibytes, + m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } @@ -620,13 +696,17 @@ passin: ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { - ifa_ref(ifa); + counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); + counter_u64_add(ia->ia_ifa.ifa_ibytes, + m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { - ifa_ref(ifa); + counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); + counter_u64_add(ia->ia_ifa.ifa_ibytes, + m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } @@ -678,18 +758,6 @@ passin: if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; - /* - * FAITH(Firewall Aided Internet Translator) - */ - if (ifp && ifp->if_type == IFT_FAITH) { - if (V_ip_keepfaith) { - if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) - goto ours; - } - m_freem(m); - return; - } - /* * Not for us; forward if possible and desirable. */ @@ -697,10 +765,6 @@ passin: IPSTAT_INC(ips_cantforward); m_freem(m); } else { -#ifdef IPSEC - if (ip_ipsec_fwd(m)) - goto bad; -#endif /* IPSEC */ ip_forward(m, dchg); } return; @@ -711,25 +775,16 @@ ours: * IPSTEALTH: Process non-routing options only * if the packet is destined for us. 
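A convention worth spelling out for the hunks above and below: ip_len and ip_off now stay in network byte order for the whole input path, so flag tests compare against htons() of the constant instead of byte-swapping the field. For pure "any of these bits set" tests the two forms are equivalent, because the byte swap only permutes where the bits sit. A self-checking editor's sketch:

#include <arpa/inet.h>
#include <assert.h>
#include <stdint.h>

#define SK_IP_MF	0x2000	/* more fragments */
#define SK_IP_OFFMASK	0x1fff	/* fragment offset */

int
main(void)
{
	uint16_t off_host = 0x2004;		/* MF set, offset 4 */
	uint16_t off_net = htons(off_host);	/* as seen on the wire */

	/* Old style: swap the field, then test in host order. */
	int frag_old = (ntohs(off_net) & (SK_IP_MF | SK_IP_OFFMASK)) != 0;
	/* New style: leave the field alone, swap the constant once. */
	int frag_new = (off_net & htons(SK_IP_MF | SK_IP_OFFMASK)) != 0;

	assert(frag_old == frag_new);	/* same answer, one swap saved */
	return (0);
}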
*/ - if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) { - if (ia != NULL) - ifa_free(&ia->ia_ifa); + if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; - } #endif /* IPSTEALTH */ - /* Count the packet in the ip address stats */ - if (ia != NULL) { - ia->ia_ifa.if_ipackets++; - ia->ia_ifa.if_ibytes += m->m_pkthdr.len; - ifa_free(&ia->ia_ifa); - } - /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ - if (ip->ip_off & (IP_MF | IP_OFFMASK)) { + if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { + /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; @@ -738,19 +793,13 @@ ours: hlen = ip->ip_hl << 2; } - /* - * Further protocols expect the packet length to be w/o the - * IP header. - */ - ip->ip_len -= hlen; - #ifdef IPSEC /* * enforce IPsec policy checking if we are seeing last header. * note that we do not visit this with protocols with pcb layer * code - like udp/tcp/raw ip. */ - if (ip_ipsec_input(m)) + if (ip_ipsec_input(m, ip->ip_p) != 0) goto bad; #endif /* IPSEC */ @@ -759,418 +808,12 @@ ours: */ IPSTAT_INC(ips_delivered); - (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } -/* - * After maxnipq has been updated, propagate the change to UMA. The UMA zone - * max has slightly different semantics than the sysctl, for historical - * reasons. - */ -static void -maxnipq_update(void) -{ - - /* - * -1 for unlimited allocation. - */ - if (V_maxnipq < 0) - uma_zone_set_max(V_ipq_zone, 0); - /* - * Positive number for specific bound. - */ - if (V_maxnipq > 0) - uma_zone_set_max(V_ipq_zone, V_maxnipq); - /* - * Zero specifies no further fragment queue allocation -- set the - * bound very low, but rely on implementation elsewhere to actually - * prevent allocation and reclaim current queues. - */ - if (V_maxnipq == 0) - uma_zone_set_max(V_ipq_zone, 1); -} - -static void -ipq_zone_change(void *tag) -{ - - if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { - V_maxnipq = nmbclusters / 32; - maxnipq_update(); - } -} - -static int -sysctl_maxnipq(SYSCTL_HANDLER_ARGS) -{ - int error, i; - - i = V_maxnipq; - error = sysctl_handle_int(oidp, &i, 0, req); - if (error || !req->newptr) - return (error); - - /* - * XXXRW: Might be a good idea to sanity check the argument and place - * an extreme upper bound. - */ - if (i < -1) - return (EINVAL); - V_maxnipq = i; - maxnipq_update(); - return (0); -} - -SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, - NULL, 0, sysctl_maxnipq, "I", - "Maximum number of IPv4 fragment reassembly queue entries"); - -/* - * Take incoming datagram fragment and try to reassemble it into - * whole datagram. If the argument is the first fragment or one - * in between the function will return NULL and store the mbuf - * in the fragment chain. If the argument is the last fragment - * the packet will be reassembled and the pointer to the new - * mbuf returned for further processing. Only m_tags attached - * to the first packet/fragment are preserved. - * The IP header is *NOT* adjusted out of iplen. - */ -struct mbuf * -ip_reass(struct mbuf *m) -{ - struct ip *ip; - struct mbuf *p, *q, *nq, *t; - struct ipq *fp = NULL; - struct ipqhead *head; - int i, hlen, next; - u_int8_t ecn, ecn0; - u_short hash; - - /* If maxnipq or maxfragsperpacket are 0, never accept fragments. 
*/ - if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { - IPSTAT_INC(ips_fragments); - IPSTAT_INC(ips_fragdropped); - m_freem(m); - return (NULL); - } - - ip = mtod(m, struct ip *); - hlen = ip->ip_hl << 2; - - hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); - head = &V_ipq[hash]; - IPQ_LOCK(); - - /* - * Look for queue of fragments - * of this datagram. - */ - TAILQ_FOREACH(fp, head, ipq_list) - if (ip->ip_id == fp->ipq_id && - ip->ip_src.s_addr == fp->ipq_src.s_addr && - ip->ip_dst.s_addr == fp->ipq_dst.s_addr && -#ifdef MAC - mac_ipq_match(m, fp) && -#endif - ip->ip_p == fp->ipq_p) - goto found; - - fp = NULL; - - /* - * Attempt to trim the number of allocated fragment queues if it - * exceeds the administrative limit. - */ - if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { - /* - * drop something from the tail of the current queue - * before proceeding further - */ - struct ipq *q = TAILQ_LAST(head, ipqhead); - if (q == NULL) { /* gak */ - for (i = 0; i < IPREASS_NHASH; i++) { - struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); - if (r) { - IPSTAT_ADD(ips_fragtimeout, - r->ipq_nfrags); - ip_freef(&V_ipq[i], r); - break; - } - } - } else { - IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); - ip_freef(head, q); - } - } - -found: - /* - * Adjust ip_len to not reflect header, - * convert offset of this to bytes. - */ - ip->ip_len -= hlen; - if (ip->ip_off & IP_MF) { - /* - * Make sure that fragments have a data length - * that's a non-zero multiple of 8 bytes. - */ - if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { - IPSTAT_INC(ips_toosmall); /* XXX */ - goto dropfrag; - } - m->m_flags |= M_FRAG; - } else - m->m_flags &= ~M_FRAG; - ip->ip_off <<= 3; - - - /* - * Attempt reassembly; if it succeeds, proceed. - * ip_reass() will return a different mbuf. - */ - IPSTAT_INC(ips_fragments); - m->m_pkthdr.header = ip; - - /* Previous ip_reass() started here. */ - /* - * Presence of header sizes in mbufs - * would confuse code below. - */ - m->m_data += hlen; - m->m_len -= hlen; - - /* - * If first fragment to arrive, create a reassembly queue. - */ - if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); - if (fp == NULL) - goto dropfrag; -#ifdef MAC - if (mac_ipq_init(fp, M_NOWAIT) != 0) { - uma_zfree(V_ipq_zone, fp); - fp = NULL; - goto dropfrag; - } - mac_ipq_create(m, fp); -#endif - TAILQ_INSERT_HEAD(head, fp, ipq_list); - V_nipq++; - fp->ipq_nfrags = 1; - fp->ipq_ttl = IPFRAGTTL; - fp->ipq_p = ip->ip_p; - fp->ipq_id = ip->ip_id; - fp->ipq_src = ip->ip_src; - fp->ipq_dst = ip->ip_dst; - fp->ipq_frags = m; - m->m_nextpkt = NULL; - goto done; - } else { - fp->ipq_nfrags++; -#ifdef MAC - mac_ipq_update(m, fp); -#endif - } - -#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) - - /* - * Handle ECN by comparing this segment with the first one; - * if CE is set, do not lose CE. - * drop if CE and not-ECT are mixed for the same packet. - */ - ecn = ip->ip_tos & IPTOS_ECN_MASK; - ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; - if (ecn == IPTOS_ECN_CE) { - if (ecn0 == IPTOS_ECN_NOTECT) - goto dropfrag; - if (ecn0 != IPTOS_ECN_CE) - GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; - } - if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) - goto dropfrag; - - /* - * Find a segment which begins after this one does. - */ - for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) - if (GETIP(q)->ip_off > ip->ip_off) - break; - - /* - * If there is a preceding segment, it may provide some of - * our data already. If so, drop the data from the incoming - * segment. 
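The trimming rule just described (implemented in the removed code below, which lives on in ip_reass.c) reduces to simple offset/length arithmetic. An editor's sketch with plain integers, assuming host byte order and fragment offsets already converted to bytes:

#include <stdio.h>

struct sk_frag { int off, len; };	/* byte offset, payload length */

/*
 * Trim the front of 'cur' so it does not duplicate data already held
 * by 'prev'. Returns 0 on success, -1 if 'prev' covers 'cur' entirely
 * (in which case the kernel drops the new fragment).
 */
static int
sk_trim(const struct sk_frag *prev, struct sk_frag *cur)
{
	int i = prev->off + prev->len - cur->off;	/* overlap, bytes */

	if (i <= 0)
		return (0);			/* no overlap */
	if (i >= cur->len)
		return (-1);			/* fully covered: drop */
	cur->off += i;				/* m_adj(m, i) in-kernel */
	cur->len -= i;
	return (0);
}

int
main(void)
{
	struct sk_frag prev = { 0, 24 }, cur = { 16, 16 };

	if (sk_trim(&prev, &cur) == 0)
		printf("kept: off=%d len=%d\n", cur.off, cur.len); /* 24 8 */
	return (0);
}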
If it provides all of our data, drop us, otherwise - * stick new segment in the proper place. - * - * If some of the data is dropped from the preceding - * segment, then it's checksum is invalidated. - */ - if (p) { - i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; - if (i > 0) { - if (i >= ip->ip_len) - goto dropfrag; - m_adj(m, i); - m->m_pkthdr.csum_flags = 0; - ip->ip_off += i; - ip->ip_len -= i; - } - m->m_nextpkt = p->m_nextpkt; - p->m_nextpkt = m; - } else { - m->m_nextpkt = fp->ipq_frags; - fp->ipq_frags = m; - } - - /* - * While we overlap succeeding segments trim them or, - * if they are completely covered, dequeue them. - */ - for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; - q = nq) { - i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; - if (i < GETIP(q)->ip_len) { - GETIP(q)->ip_len -= i; - GETIP(q)->ip_off += i; - m_adj(q, i); - q->m_pkthdr.csum_flags = 0; - break; - } - nq = q->m_nextpkt; - m->m_nextpkt = nq; - IPSTAT_INC(ips_fragdropped); - fp->ipq_nfrags--; - m_freem(q); - } - - /* - * Check for complete reassembly and perform frag per packet - * limiting. - * - * Frag limiting is performed here so that the nth frag has - * a chance to complete the packet before we drop the packet. - * As a result, n+1 frags are actually allowed per packet, but - * only n will ever be stored. (n = maxfragsperpacket.) - * - */ - next = 0; - for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { - if (GETIP(q)->ip_off != next) { - if (fp->ipq_nfrags > V_maxfragsperpacket) { - IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ip_freef(head, fp); - } - goto done; - } - next += GETIP(q)->ip_len; - } - /* Make sure the last packet didn't have the IP_MF flag */ - if (p->m_flags & M_FRAG) { - if (fp->ipq_nfrags > V_maxfragsperpacket) { - IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ip_freef(head, fp); - } - goto done; - } - - /* - * Reassembly is complete. Make sure the packet is a sane size. - */ - q = fp->ipq_frags; - ip = GETIP(q); - if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { - IPSTAT_INC(ips_toolong); - IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ip_freef(head, fp); - goto done; - } - - /* - * Concatenate fragments. - */ - m = q; - t = m->m_next; - m->m_next = NULL; - m_cat(m, t); - nq = q->m_nextpkt; - q->m_nextpkt = NULL; - for (q = nq; q != NULL; q = nq) { - nq = q->m_nextpkt; - q->m_nextpkt = NULL; - m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; - m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; - m_cat(m, q); - } - /* - * In order to do checksumming faster we do 'end-around carry' here - * (and not in for{} loop), though it implies we are not going to - * reassemble more than 64k fragments. - */ - while (m->m_pkthdr.csum_data & 0xffff0000) - m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + - (m->m_pkthdr.csum_data >> 16); -#ifdef MAC - mac_ipq_reassemble(fp, m); - mac_ipq_destroy(fp); -#endif - - /* - * Create header for new ip packet by modifying header of first - * packet; dequeue and discard fragment reassembly header. - * Make header visible. 
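The "end-around carry" loop above (kept verbatim in ip_reass.c) is the standard fold of a 32-bit one's-complement accumulator back into 16 bits, applied after the fragments' csum_data values have been summed. Isolated, with a worked value:

#include <assert.h>
#include <stdint.h>

/* Fold a 32-bit one's-complement accumulator into 16 bits. */
static uint16_t
sk_csum_fold(uint32_t sum)
{
	while (sum & 0xffff0000)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}

int
main(void)
{
	/* Two fragment sums whose total overflows 16 bits. */
	uint32_t acc = 0xfff0u + 0x0020u;	/* 0x10010 */

	assert(sk_csum_fold(acc) == 0x0011);	/* 0x0010 + carry 1 */
	return (0);
}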
- */ - ip->ip_len = (ip->ip_hl << 2) + next; - ip->ip_src = fp->ipq_src; - ip->ip_dst = fp->ipq_dst; - TAILQ_REMOVE(head, fp, ipq_list); - V_nipq--; - uma_zfree(V_ipq_zone, fp); - m->m_len += (ip->ip_hl << 2); - m->m_data -= (ip->ip_hl << 2); - /* some debugging cruft by sklower, below, will go away soon */ - if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ - m_fixhdr(m); - IPSTAT_INC(ips_reassembled); - IPQ_UNLOCK(); - return (m); - -dropfrag: - IPSTAT_INC(ips_fragdropped); - if (fp != NULL) - fp->ipq_nfrags--; - m_freem(m); -done: - IPQ_UNLOCK(); - return (NULL); - -#undef GETIP -} - -/* - * Free a fragment reassembly header and all - * associated datagrams. - */ -static void -ip_freef(struct ipqhead *fhp, struct ipq *fp) -{ - struct mbuf *q; - - IPQ_LOCK_ASSERT(); - - while (fp->ipq_frags) { - q = fp->ipq_frags; - fp->ipq_frags = q->m_nextpkt; - m_freem(q); - } - TAILQ_REMOVE(fhp, fp, ipq_list); - uma_zfree(V_ipq_zone, fp); - V_nipq--; -} - /* * IP timer processing; * if a timer expires on a reassembly @@ -1180,82 +823,28 @@ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - struct ipq *fp; - int i; VNET_LIST_RLOCK_NOSLEEP(); - IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - for (i = 0; i < IPREASS_NHASH; i++) { - for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { - struct ipq *fpp; - - fpp = fp; - fp = TAILQ_NEXT(fp, ipq_list); - if(--fpp->ipq_ttl == 0) { - IPSTAT_ADD(ips_fragtimeout, - fpp->ipq_nfrags); - ip_freef(&V_ipq[i], fpp); - } - } - } - /* - * If we are over the maximum number of fragments - * (due to the limit being lowered), drain off - * enough to get down to the new limit. - */ - if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { - for (i = 0; i < IPREASS_NHASH; i++) { - while (V_nipq > V_maxnipq && - !TAILQ_EMPTY(&V_ipq[i])) { - IPSTAT_ADD(ips_fragdropped, - TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); - ip_freef(&V_ipq[i], - TAILQ_FIRST(&V_ipq[i])); - } - } - } + ipreass_slowtimo(); CURVNET_RESTORE(); } - IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); } -/* - * Drain off all datagram fragments. - */ -static void -ip_drain_locked(void) -{ - int i; - - IPQ_LOCK_ASSERT(); - - for (i = 0; i < IPREASS_NHASH; i++) { - while(!TAILQ_EMPTY(&V_ipq[i])) { - IPSTAT_ADD(ips_fragdropped, - TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); - ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); - } - } -} - void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); - IPQ_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - ip_drain_locked(); + ipreass_drain(); CURVNET_RESTORE(); } - IPQ_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); - in_rtqdrain(); } /* @@ -1314,33 +903,6 @@ ipproto_unregister(short ipproto) return (0); } -/* - * Given address of next destination (final or next hop), return (referenced) - * internet address info of interface to be used to get there. 
- */ -struct in_ifaddr * -ip_rtaddr(struct in_addr dst, u_int fibnum) -{ - struct route sro; - struct sockaddr_in *sin; - struct in_ifaddr *ia; - - bzero(&sro, sizeof(sro)); - sin = (struct sockaddr_in *)&sro.ro_dst; - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = dst; - in_rtalloc_ign(&sro, 0, fibnum); - - if (sro.ro_rt == NULL) - return (NULL); - - ia = ifatoia(sro.ro_rt->rt_ifa); - ifa_ref(&ia->ia_ifa); - RTFREE(sro.ro_rt); - return (ia); -} - u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, @@ -1370,6 +932,7 @@ ip_forward(struct mbuf *m, int srcrt) struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; + struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; @@ -1379,6 +942,13 @@ ip_forward(struct mbuf *m, int srcrt) m_freem(m); return; } +#ifdef IPSEC + if (ip_ipsec_fwd(m) != 0) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + return; + } +#endif /* IPSEC */ #ifdef IPSTEALTH if (!V_ipstealth) { #endif @@ -1391,7 +961,23 @@ ip_forward(struct mbuf *m, int srcrt) } #endif - ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); + bzero(&ro, sizeof(ro)); + sin = (struct sockaddr_in *)&ro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; +#ifdef RADIX_MPATH + rtalloc_mpath_fib(&ro, + ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), + M_GETFIB(m)); +#else + in_rtalloc_ign(&ro, 0, M_GETFIB(m)); +#endif + if (ro.ro_rt != NULL) { + ia = ifatoia(ro.ro_rt->rt_ifa); + ifa_ref(&ia->ia_ifa); + } else + ia = NULL; #ifndef IPSEC /* * 'ia' may be NULL if there is no route for this destination. @@ -1400,6 +986,7 @@ ip_forward(struct mbuf *m, int srcrt) */ if (!srcrt && ia == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); + RO_RTFREE(&ro); return; } #endif @@ -1420,8 +1007,8 @@ ip_forward(struct mbuf *m, int srcrt) * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ - MGETHDR(mcopy, M_DONTWAIT, m->m_type); - if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { + mcopy = m_gethdr(M_NOWAIT, m->m_type); + if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now @@ -1432,7 +1019,7 @@ ip_forward(struct mbuf *m, int srcrt) mcopy = NULL; } if (mcopy != NULL) { - mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); + mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } @@ -1456,16 +1043,8 @@ ip_forward(struct mbuf *m, int srcrt) dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { - struct sockaddr_in *sin; struct rtentry *rt; - bzero(&ro, sizeof(ro)); - sin = (struct sockaddr_in *)&ro.ro_dst; - sin->sin_family = AF_INET; - sin->sin_len = sizeof(*sin); - sin->sin_addr = ip->ip_dst; - in_rtalloc_ign(&ro, 0, M_GETFIB(m)); - rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && @@ -1484,20 +1063,12 @@ ip_forward(struct mbuf *m, int srcrt) code = ICMP_REDIRECT_HOST; } } - if (rt) - RTFREE(rt); } - /* - * Try to cache the route MTU from ip_output so we can consider it for - * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. 
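When the route carries no usable MTU, the forwarding code below falls back to ip_next_mtu(), which steps down through a table of common MTU plateaus in the spirit of RFC 1191. A sketch of that selection; the plateau values here are the classic published ones and only approximate the kernel's actual table in ip_icmp.c:

#include <stdio.h>

/* Common MTU plateaus from RFC 1191, descending. */
static const int sk_plateaus[] = {
	65535, 32000, 17914, 8166, 4352, 2002, 1492,
	1006, 508, 296, 68
};

/* Largest plateau strictly below the size of the packet that failed. */
static int
sk_next_mtu(int failed_len)
{
	int i, n = (int)(sizeof(sk_plateaus) / sizeof(sk_plateaus[0]));

	for (i = 0; i < n; i++)
		if (sk_plateaus[i] < failed_len)
			return (sk_plateaus[i]);
	return (68);			/* IPv4 minimum as a floor */
}

int
main(void)
{
	printf("%d\n", sk_next_mtu(1500));	/* prints 1492 */
	return (0);
}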
- */ - bzero(&ro, sizeof(ro)); - error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) - mtu = ro.ro_rt->rt_rmx.rmx_mtu; + mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) @@ -1560,31 +1131,12 @@ ip_forward(struct mbuf *m, int srcrt) if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else - mtu = ip_next_mtu(ip->ip_len, 0); + mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: - /* - * A router should not generate ICMP_SOURCEQUENCH as - * required in RFC1812 Requirements for IP Version 4 Routers. - * Source quench could be a big problem under DoS attacks, - * or if the underlying interface is rate-limited. - * Those who need source quench packets may re-enable them - * via the net.inet.ip.sendsourcequench sysctl. - */ - if (V_ip_sendsourcequench == 0) { - m_freem(mcopy); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - return; - } else { - type = ICMP_SOURCEQUENCH; - code = 0; - } - break; - case EACCES: /* ipfw denied packet */ m_freem(mcopy); if (ia != NULL) @@ -1606,8 +1158,8 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, bintime(&bt); if (inp->inp_socket->so_options & SO_BINTIME) { - *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), - SCM_BINTIME, SOL_SOCKET); + *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), + SCM_BINTIME, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } @@ -1615,20 +1167,20 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct timeval tv; bintime2timeval(&bt, &tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); + *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); if (*mp) mp = &(*mp)->m_next; } } if (inp->inp_flags & INP_RECVDSTADDR) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, + *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; @@ -1640,14 +1192,14 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { - *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { - *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), + *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; @@ -1662,36 +1214,73 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; - if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= V_if_index))) { + if ((ifp = m->m_pkthdr.rcvif) && + ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. 
*/ - if ((sdp->sdl_family != AF_LINK) - || (sdp->sdl_len > sizeof(sdlbuf))) { + if (sdp->sdl_family != AF_LINK || + sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: - sdl2->sdl_len - = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl2->sdl_len = + offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } - *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, - IP_RECVIF, IPPROTO_IP); + *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_tos, + *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } + + if (inp->inp_flags2 & INP_RECVFLOWID) { + uint32_t flowid, flow_type; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + /* + * XXX should handle the failure of one or the + * other - don't populate both? + */ + *mp = sbcreatecontrol((caddr_t) &flowid, + sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + *mp = sbcreatecontrol((caddr_t) &flow_type, + sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + +#ifdef RSS + if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { + uint32_t flowid, flow_type; + uint32_t rss_bucketid; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { + *mp = sbcreatecontrol((caddr_t) &rss_bucketid, + sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + } +#endif } /* @@ -1745,13 +1334,18 @@ ip_rsvp_done(void) return 0; } -void -rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ +int +rsvp_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m; + + m = *mp; + *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ - rsvp_input_p(m, off); - return; + *mp = m; + rsvp_input_p(mp, offp, proto); + return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member @@ -1761,13 +1355,15 @@ rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ if (!V_rsvp_on) { m_freem(m); - return; + return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { - rip_input(m, off); - return; + *mp = m; + rip_input(mp, offp, proto); + return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); + return (IPPROTO_DONE); } diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h index 2870c114..f499b740 100644 --- a/freebsd/sys/netinet/ip_ipsec.h +++ b/freebsd/sys/netinet/ip_ipsec.h @@ -34,7 +34,7 @@ int ip_ipsec_filtertunnel(struct mbuf *); int ip_ipsec_fwd(struct mbuf *); -int ip_ipsec_input(struct mbuf *); +int ip_ipsec_input(struct mbuf *, int); int ip_ipsec_mtu(struct mbuf *, int); -int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *); +int ip_ipsec_output(struct mbuf **, struct inpcb *, int *); #endif diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c index f4aeed24..f8b14735 100644 --- a/freebsd/sys/netinet/ip_mroute.c +++ b/freebsd/sys/netinet/ip_mroute.c @@ -79,6 +79,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -95,8 +96,10 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #include #include @@ -121,7 +124,6 @@ __FBSDID("$FreeBSD$"); #endif #define VIFI_INVALID 
((vifi_t) -1) -#define M_HASCL(m) ((m)->m_flags & M_EXT) static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ #define V_last_tv_sec VNET(last_tv_sec) @@ -147,11 +149,11 @@ static struct mtx mrouter_mtx; static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ -static VNET_DEFINE(struct mrtstat, mrtstat); -#define V_mrtstat VNET(mrtstat) -SYSCTL_VNET_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, - &VNET_NAME(mrtstat), mrtstat, - "IPv4 Multicast Forwarding Statistics (struct mrtstat, " +static VNET_PCPUSTAT_DEFINE(struct mrtstat, mrtstat); +VNET_PCPUSTAT_SYSINIT(mrtstat); +VNET_PCPUSTAT_SYSUNINIT(mrtstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat, + mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, " "netinet/ip_mroute.h)"); static VNET_DEFINE(u_long, mfchash); @@ -179,7 +181,7 @@ static VNET_DEFINE(vifi_t, numvifs); #define V_numvifs VNET(numvifs) static VNET_DEFINE(struct vif, viftable[MAXVIFS]); #define V_viftable VNET(viftable) -SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, +SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); @@ -227,13 +229,13 @@ static VNET_DEFINE(struct callout, bw_upcalls_ch); #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ -static VNET_DEFINE(struct pimstat, pimstat); -#define V_pimstat VNET(pimstat) +static VNET_PCPUSTAT_DEFINE(struct pimstat, pimstat); +VNET_PCPUSTAT_SYSINIT(pimstat); +VNET_PCPUSTAT_SYSUNINIT(pimstat); SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); -SYSCTL_VNET_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, - &VNET_NAME(pimstat), pimstat, - "PIM Statistics (struct pimstat, netinet/pim_var.h)"); +SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat, + pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)"); static u_long pim_squelch_wholepkt = 0; SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, @@ -247,7 +249,7 @@ static const struct protosw in_pim_protosw = { .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = pim_input, - .pr_output = (pr_output_t*)rip_output, + .pr_output = rip_output, .pr_ctloutput = rip_ctloutput, .pr_usrreqs = &rip_usrreqs }; @@ -538,7 +540,7 @@ X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) int error = 0; /* - * Currently the only function calling this ioctl routine is rtioctl(). + * Currently the only function calling this ioctl routine is rtioctl_fib(). 
* Typically, only root can create the raw socket in order to execute * this ioctl method, however the request might be coming from a prison */ @@ -635,8 +637,8 @@ if_detached_event(void *arg __unused, struct ifnet *ifp) continue; for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; - for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { - nrt = LIST_NEXT(rt, mfc_hash); + + LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (rt->mfc_parent == vifi) { expire_mfc(rt); } @@ -754,8 +756,8 @@ X_ip_mrouter_done(void) */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; - for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { - nrt = LIST_NEXT(rt, mfc_hash); + + LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { expire_mfc(rt); } } @@ -1303,8 +1305,8 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, return ENOBUFS; } - mb0 = m_copypacket(m, M_DONTWAIT); - if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_copypacket(m, M_NOWAIT); + if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); @@ -1446,9 +1448,7 @@ expire_upcalls(void *arg) if (V_nexpire[i] == 0) continue; - for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { - nrt = LIST_NEXT(rt, mfc_hash); - + LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { if (TAILQ_EMPTY(&rt->mfc_stall)) continue; @@ -1490,7 +1490,7 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) { struct ip *ip = mtod(m, struct ip *); vifi_t vifi; - int plen = ip->ip_len; + int plen = ntohs(ip->ip_len); VIF_LOCK_ASSERT(); @@ -1546,7 +1546,7 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) int hlen = ip->ip_hl << 2; struct mbuf *mm = m_copy(m, 0, hlen); - if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen)) mm = m_pullup(mm, hlen); if (mm == NULL) return ENOBUFS; @@ -1666,8 +1666,8 @@ phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) * the IP header is actually copied, not just referenced, * so that ip_output() only scribbles on the copy. */ - mb_copy = m_copypacket(m, M_DONTWAIT); - if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_copypacket(m, M_NOWAIT); + if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen)) mb_copy = m_pullup(mb_copy, hlen); if (mb_copy == NULL) return; @@ -1720,12 +1720,16 @@ X_ip_rsvp_force_done(struct socket *so __unused) } -static void -X_rsvp_input(struct mbuf *m, int off __unused) +static int +X_rsvp_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m; + m = *mp; + *mp = NULL; if (!V_rsvp_on) m_freem(m); + return (IPPROTO_DONE); } /* @@ -2080,13 +2084,12 @@ bw_upcalls_send(void) * Allocate a new mbuf, initialize it with the header and * the payload for the pending calls. */ - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); return; } - m->m_len = m->m_pkthdr.len = 0; m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); @@ -2381,7 +2384,7 @@ pim_register_prepare(struct ip *ip, struct mbuf *m) * Copy the old packet & pullup its IP header into the * new mbuf so we can modify it. 
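The loop rewrites above replace hand-rolled "remember the next element before freeing the current one" iteration with LIST_FOREACH_SAFE. The _SAFE variant snapshots the successor up front, so unlinking and freeing the current node mid-walk is legal. A minimal userspace demonstration; it assumes a BSD sys/queue.h that provides the _SAFE macros (glibc's older copy lacks them), with malloc'd nodes standing in for struct mfc:

#include <stdlib.h>
#include <sys/queue.h>

struct sk_node {
	int			val;
	LIST_ENTRY(sk_node)	link;
};
static LIST_HEAD(, sk_node) sk_head = LIST_HEAD_INITIALIZER(sk_head);

int
main(void)
{
	struct sk_node *n, *tmp;

	for (int i = 0; i < 4; i++) {
		if ((n = malloc(sizeof(*n))) == NULL)
			return (1);
		n->val = i;
		LIST_INSERT_HEAD(&sk_head, n, link);
	}

	/* Safe to unlink+free 'n': 'tmp' already holds the successor. */
	LIST_FOREACH_SAFE(n, &sk_head, link, tmp) {
		if (n->val % 2 == 0) {
			LIST_REMOVE(n, link);
			free(n);
		}
	}
	return (0);
}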
*/ - mb_copy = m_copypacket(m, M_DONTWAIT); + mb_copy = m_copypacket(m, M_NOWAIT); if (mb_copy == NULL) return NULL; mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); @@ -2395,15 +2398,14 @@ pim_register_prepare(struct ip *ip, struct mbuf *m) /* Compute the MTU after the PIM Register encapsulation */ mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); - if (ip->ip_len <= mtu) { + if (ntohs(ip->ip_len) <= mtu) { /* Turn the IP header into a valid one */ - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); } else { /* Fragment the packet */ - if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { + mb_copy->m_pkthdr.csum_flags |= CSUM_IP; + if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) { m_freem(mb_copy); return NULL; } @@ -2428,7 +2430,7 @@ pim_register_send_upcall(struct ip *ip, struct vif *vifp, /* * Add a new mbuf with an upcall header */ - MGETHDR(mb_first, M_DONTWAIT, MT_DATA); + mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; @@ -2486,7 +2488,7 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, /* * Add a new mbuf with the encapsulating header */ - MGETHDR(mb_first, M_DONTWAIT, MT_DATA); + mb_first = m_gethdr(M_NOWAIT, MT_DATA); if (mb_first == NULL) { m_freem(mb_copy); return ENOBUFS; @@ -2502,8 +2504,8 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, */ ip_outer = mtod(mb_first, struct ip *); *ip_outer = pim_encap_iphdr; - ip_outer->ip_id = ip_newid(); - ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); + ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) + + sizeof(pim_encap_pimhdr)); ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; ip_outer->ip_dst = rt->mfc_rp; /* @@ -2511,8 +2513,9 @@ pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, * IP_DF bit. */ ip_outer->ip_tos = ip->ip_tos; - if (ntohs(ip->ip_off) & IP_DF) - ip_outer->ip_off |= IP_DF; + if (ip->ip_off & htons(IP_DF)) + ip_outer->ip_off |= htons(IP_DF); + ip_fillid(ip_outer); pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + sizeof(pim_encap_iphdr)); *pimhdr = pim_encap_pimhdr; @@ -2559,15 +2562,18 @@ pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) * (used by PIM-SM): the PIM header is stripped off, and the inner packet * is passed to if_simloop(). */ -void -pim_input(struct mbuf *m, int off) +int +pim_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); struct pim *pim; + int iphlen = *offp; int minlen; - int datalen = ip->ip_len; + int datalen = ntohs(ip->ip_len) - iphlen; int ip_tos; - int iphlen = off; + + *mp = NULL; /* Keep statistics */ PIMSTAT_INC(pims_rcv_total_msgs); @@ -2581,7 +2587,7 @@ pim_input(struct mbuf *m, int off) CTR3(KTR_IPMF, "%s: short packet (%d) from %s", __func__, datalen, inet_ntoa(ip->ip_src)); m_freem(m); - return; + return (IPPROTO_DONE); } /* @@ -2597,10 +2603,9 @@ pim_input(struct mbuf *m, int off) * Get the IP and PIM headers in contiguous memory, and * possibly the PIM REGISTER header. */ - if ((m->m_flags & M_EXT || m->m_len < minlen) && - (m = m_pullup(m, minlen)) == 0) { + if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) { CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); - return; + return (IPPROTO_DONE); } /* m_pullup() may have given us a new mbuf so reset ip. 
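The MTU computed above in pim_register_prepare() is simply the maximum IPv4 packet size minus the outer encapsulation; anything larger than that budget must be fragmented before the PIM Register header is prepended. The arithmetic in isolation, with the 20- and 8-byte header sizes stated here as assumptions rather than taken from the real structs:

#include <stdio.h>

#define SK_IP_MAXPACKET	0xffff	/* maximum IPv4 total length */
#define SK_ENCAP_IPHDR	20	/* sizeof(pim_encap_iphdr), assumed */
#define SK_ENCAP_PIMHDR	8	/* sizeof(pim_encap_pimhdr), assumed */

int
main(void)
{
	/* Payload budget left after PIM Register encapsulation. */
	int mtu = SK_IP_MAXPACKET - SK_ENCAP_IPHDR - SK_ENCAP_PIMHDR;

	printf("budget=%d; 9000-byte inner packet %s; 65530-byte one %s\n",
	    mtu, 9000 <= mtu ? "fits" : "must fragment",
	    65530 <= mtu ? "fits" : "must fragment");
	return (0);
}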
*/ @@ -2625,7 +2630,7 @@ pim_input(struct mbuf *m, int off) PIMSTAT_INC(pims_rcv_badsum); CTR1(KTR_IPMF, "%s: invalid checksum", __func__); m_freem(m); - return; + return (IPPROTO_DONE); } /* PIM version check */ @@ -2634,7 +2639,7 @@ pim_input(struct mbuf *m, int off) CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); m_freem(m); - return; + return (IPPROTO_DONE); } /* restore mbuf back to the outer IP */ @@ -2659,7 +2664,7 @@ pim_input(struct mbuf *m, int off) CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); - return; + return (IPPROTO_DONE); } /* XXX need refcnt? */ vifp = V_viftable[V_reg_vif_num].v_ifp; @@ -2673,7 +2678,7 @@ pim_input(struct mbuf *m, int off) PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: register packet size too small", __func__); m_freem(m); - return; + return (IPPROTO_DONE); } reghdr = (u_int32_t *)(pim + 1); @@ -2687,7 +2692,7 @@ pim_input(struct mbuf *m, int off) PIMSTAT_INC(pims_rcv_badregisters); CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); m_freem(m); - return; + return (IPPROTO_DONE); } /* verify the inner packet is destined to a mcast group */ @@ -2696,7 +2701,7 @@ pim_input(struct mbuf *m, int off) CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, inet_ntoa(encap_ip->ip_dst)); m_freem(m); - return; + return (IPPROTO_DONE); } /* If a NULL_REGISTER, pass it to the daemon */ @@ -2735,7 +2740,7 @@ pim_input(struct mbuf *m, int off) if (mcp == NULL) { CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); m_freem(m); - return; + return (IPPROTO_DONE); } /* Keep statistics */ @@ -2771,9 +2776,10 @@ pim_input_to_daemon: * XXX: the outer IP header pkt size of a Register is not adjust to * reflect the fact that the inner multicast data is truncated. */ - rip_input(m, iphlen); + *mp = m; + rip_input(mp, offp, proto); - return; + return (IPPROTO_DONE); } static int @@ -2813,12 +2819,12 @@ vnet_mroute_init(const void *unused __unused) MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); - callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE); - callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE); - callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE); + callout_init(&V_expire_upcalls_ch, 1); + callout_init(&V_bw_upcalls_ch, 1); + callout_init(&V_bw_meter_ch, 1); } -VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_mroute_init, +VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, NULL); static void @@ -2829,7 +2835,7 @@ vnet_mroute_uninit(const void *unused __unused) V_nexpire = NULL; } -VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, +VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, vnet_mroute_uninit, NULL); static int @@ -2944,4 +2950,4 @@ static moduledata_t ip_mroutemod = { 0 }; -DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE); +DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE); diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h index e945b92c..65f7d83c 100644 --- a/freebsd/sys/netinet/ip_mroute.h +++ b/freebsd/sys/netinet/ip_mroute.h @@ -206,23 +206,24 @@ struct bw_upcall { * The kernel's multicast routing statistics. */ struct mrtstat { - u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ - u_long mrts_mfc_misses; /* # forw. 
cache hash table misses */ - u_long mrts_upcalls; /* # calls to multicast routing daemon */ - u_long mrts_no_route; /* no route for packet's origin */ - u_long mrts_bad_tunnel; /* malformed tunnel options */ - u_long mrts_cant_tunnel; /* no room for tunnel options */ - u_long mrts_wrong_if; /* arrived on wrong interface */ - u_long mrts_upq_ovflw; /* upcall Q overflow */ - u_long mrts_cache_cleanups; /* # entries with no upcalls */ - u_long mrts_drop_sel; /* pkts dropped selectively */ - u_long mrts_q_overflow; /* pkts dropped - Q overflow */ - u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ - u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ + uint64_t mrts_mfc_lookups; /* # forw. cache hash table hits */ + uint64_t mrts_mfc_misses; /* # forw. cache hash table misses */ + uint64_t mrts_upcalls; /* # calls to multicast routing daemon */ + uint64_t mrts_no_route; /* no route for packet's origin */ + uint64_t mrts_bad_tunnel; /* malformed tunnel options */ + uint64_t mrts_cant_tunnel; /* no room for tunnel options */ + uint64_t mrts_wrong_if; /* arrived on wrong interface */ + uint64_t mrts_upq_ovflw; /* upcall Q overflow */ + uint64_t mrts_cache_cleanups; /* # entries with no upcalls */ + uint64_t mrts_drop_sel; /* pkts dropped selectively */ + uint64_t mrts_q_overflow; /* pkts dropped - Q overflow */ + uint64_t mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + uint64_t mrts_upq_sockfull; /* upcalls dropped - socket full */ }; #ifdef _KERNEL -#define MRTSTAT_ADD(name, val) V_mrtstat.name += (val) +#define MRTSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct mrtstat, mrtstat, name, (val)) #define MRTSTAT_INC(name) MRTSTAT_ADD(name, 1) #endif diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c index 6431aaa1..134479c9 100644 --- a/freebsd/sys/netinet/ip_options.c +++ b/freebsd/sys/netinet/ip_options.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -67,18 +68,21 @@ __FBSDID("$FreeBSD$"); #include -static int ip_dosourceroute = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, - &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); +static VNET_DEFINE(int, ip_dosourceroute); +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_dosourceroute), 0, + "Enable forwarding source routed IP packets"); +#define V_ip_dosourceroute VNET(ip_dosourceroute) -static int ip_acceptsourceroute = 0; +static VNET_DEFINE(int, ip_acceptsourceroute); SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, - CTLFLAG_RW, &ip_acceptsourceroute, 0, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_acceptsourceroute), 0, "Enable accepting source routed IP packets"); +#define V_ip_acceptsourceroute VNET(ip_acceptsourceroute) -int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */ -SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW, - &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)"); +VNET_DEFINE(int, ip_doopts) = 1; /* 0 = ignore, 1 = process, 2 = reject */ +SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(ip_doopts), 0, "Enable IP options processing ([LS]SRR, RR, TS)"); static void save_rte(struct mbuf *m, u_char *, struct in_addr); @@ -103,12 +107,13 @@ ip_dooptions(struct mbuf *m, int pass) int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; struct in_addr *sin, dst; uint32_t ntime; + struct nhop4_extended nh_ext; struct 
sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; /* Ignore or reject packets with IP options. */ - if (ip_doopts == 0) + if (V_ip_doopts == 0) return 0; - else if (ip_doopts == 2) { + else if (V_ip_doopts == 2) { type = ICMP_UNREACH; code = ICMP_UNREACH_FILTER_PROHIB; goto bad; @@ -169,7 +174,7 @@ ip_dooptions(struct mbuf *m, int pass) code = ICMP_UNREACH_SRCFAIL; goto bad; } - if (!ip_dosourceroute) + if (!V_ip_dosourceroute) goto nosourcerouting; /* * Loose routing, and not at next destination @@ -182,7 +187,7 @@ ip_dooptions(struct mbuf *m, int pass) /* * End of source route. Should be for us. */ - if (!ip_acceptsourceroute) + if (!V_ip_acceptsourceroute) goto nosourcerouting; save_rte(m, cp, ip->ip_src); break; @@ -191,7 +196,7 @@ ip_dooptions(struct mbuf *m, int pass) if (V_ipstealth) goto dropit; #endif - if (!ip_dosourceroute) { + if (!V_ip_dosourceroute) { if (V_ipforwarding) { char buf[16]; /* aaa.bbb.ccc.ddd\0 */ /* @@ -226,23 +231,34 @@ dropit: (void)memcpy(&ipaddr.sin_addr, cp + off, sizeof(ipaddr.sin_addr)); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + if (opt == IPOPT_SSRR) { #define INA struct in_ifaddr * #define SA struct sockaddr * - if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL) - ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0); - } else -/* XXX MRT 0 for routing */ - ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m)); - if (ia == NULL) { - type = ICMP_UNREACH; - code = ICMP_UNREACH_SRCFAIL; - goto bad; + ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr, + RT_ALL_FIBS); + if (ia == NULL) + ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0, + RT_ALL_FIBS); + if (ia == NULL) + goto bad; + + memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + ifa_free(&ia->ia_ifa); + } else { + /* XXX MRT 0 for routing */ + if (fib4_lookup_nh_ext(M_GETFIB(m), + ipaddr.sin_addr, 0, 0, &nh_ext) != 0) + goto bad; + + memcpy(cp + off, &nh_ext.nh_src, + sizeof(struct in_addr)); } + ip->ip_dst = ipaddr.sin_addr; - (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), - sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* * Let ip_intr's mcast routing check handle mcast pkts @@ -276,15 +292,19 @@ dropit: * destination, use the incoming interface (should be * same). */ - if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL && - (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) { + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) != NULL) { + memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + ifa_free(&ia->ia_ifa); + } else if (fib4_lookup_nh_ext(M_GETFIB(m), + ipaddr.sin_addr, 0, 0, &nh_ext) == 0) { + memcpy(cp + off, &nh_ext.nh_src, + sizeof(struct in_addr)); + } else { type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; goto bad; } - (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), - sizeof(struct in_addr)); - ifa_free(&ia->ia_ifa); cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; @@ -413,7 +433,7 @@ ip_srcroute(struct mbuf *m0) if (opts->ip_nhops == 0) return (NULL); - m = m_get(M_DONTWAIT, MT_DATA); + m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); @@ -455,29 +475,23 @@ ip_srcroute(struct mbuf *m0) } /* - * Strip out IP options, at higher level protocol in the kernel. Second - * argument is buffer to which options will be moved, and return value is - * their length. - * - * XXX should be deleted; last arg currently ignored. + * Strip out IP options, at higher level protocol in the kernel. 
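[The SSRR/LSRR hunks above swap the old ip_rtaddr() interface-address lookup for the new in_fib KPI. A minimal sketch of that lookup pattern follows; the wrapper function, its name, and the error handling are illustrative assumptions, while fib4_lookup_nh_ext() and struct nhop4_extended are taken from the patch itself:

	/* Hypothetical helper: find the address to record in the option. */
	static int
	srcroute_next_hop(uint32_t fibnum, struct in_addr dst, struct in_addr *rec)
	{
		struct nhop4_extended nh_ext;

		/* Flags and flowid zero: a plain lookup in the given FIB. */
		if (fib4_lookup_nh_ext(fibnum, dst, 0, 0, &nh_ext) != 0)
			return (EHOSTUNREACH);	/* caller replies ICMP_UNREACH_SRCFAIL */
		*rec = nh_ext.nh_src;		/* our address on the egress path */
		return (0);
	}
]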
*/ void -ip_stripoptions(struct mbuf *m, struct mbuf *mopt) +ip_stripoptions(struct mbuf *m) { - int i; struct ip *ip = mtod(m, struct ip *); - caddr_t opts; int olen; - olen = (ip->ip_hl << 2) - sizeof (struct ip); - opts = (caddr_t)(ip + 1); - i = m->m_len - (sizeof (struct ip) + olen); - bcopy(opts + olen, opts, (unsigned)i); + olen = (ip->ip_hl << 2) - sizeof(struct ip); m->m_len -= olen; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len -= olen; - ip->ip_v = IPVERSION; + ip->ip_len = htons(ntohs(ip->ip_len) - olen); ip->ip_hl = sizeof(struct ip) >> 2; + + bcopy((char *)ip + sizeof(struct ip) + olen, (ip + 1), + (size_t )(m->m_len - sizeof(struct ip))); } /* @@ -496,19 +510,19 @@ ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) unsigned optlen; optlen = opt->m_len - sizeof(p->ipopt_dst); - if (optlen + ip->ip_len > IP_MAXPACKET) { + if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) { *phlen = 0; return (m); /* XXX should fail */ } if (p->ipopt_dst.s_addr) ip->ip_dst = p->ipopt_dst; - if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { - MGETHDR(n, M_DONTWAIT, MT_DATA); + if (!M_WRITABLE(m) || M_LEADINGSPACE(m) < optlen) { + n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { *phlen = 0; return (m); } - M_MOVE_PKTHDR(n, m); + m_move_pkthdr(n, m); n->m_pkthdr.rcvif = NULL; n->m_pkthdr.len += optlen; m->m_len -= sizeof(struct ip); @@ -529,7 +543,7 @@ ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) *phlen = sizeof(struct ip) + optlen; ip->ip_v = IPVERSION; ip->ip_hl = *phlen >> 2; - ip->ip_len += optlen; + ip->ip_len = htons(ntohs(ip->ip_len) + optlen); return (m); } @@ -596,7 +610,7 @@ ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) /* turn off any old options */ if (*pcbopt) (void)m_free(*pcbopt); - *pcbopt = 0; + *pcbopt = NULL; if (m == NULL || m->m_len == 0) { /* * Only turning off any previous options. @@ -694,7 +708,7 @@ bad: * may change in future. * Router alert options SHOULD be passed if running in IPSTEALTH mode and * we are not the endpoint. - * Length checks on individual options should already have been peformed + * Length checks on individual options should already have been performed * by ip_dooptions() therefore they are folded under INVARIANTS here. * * Return zero if not present or options are invalid, non-zero if present. 
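[With the hunk above, ip_stripoptions() drops its long-ignored second argument and now also rewrites ip_len to account for the removed options. A hedged usage sketch from a transport input path — the surrounding context is assumed, not part of this patch:

	struct ip *ip = mtod(m, struct ip *);

	/* Options are present when the header exceeds the fixed 20 bytes. */
	if ((ip->ip_hl << 2) > sizeof(struct ip))
		ip_stripoptions(m);	/* adjusts m_len, m_pkthdr.len and ip_len */
]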
diff --git a/freebsd/sys/netinet/ip_options.h b/freebsd/sys/netinet/ip_options.h index 7ba5ae64..4a6ea420 100644 --- a/freebsd/sys/netinet/ip_options.h +++ b/freebsd/sys/netinet/ip_options.h @@ -47,14 +47,15 @@ struct ipopt_tag { struct ipoptrt ip_srcrt; }; -extern int ip_doopts; /* process or ignore IP options */ +VNET_DECLARE(int, ip_doopts); /* process or ignore IP options */ +#define V_ip_doopts VNET(ip_doopts) int ip_checkrouteralert(struct mbuf *); int ip_dooptions(struct mbuf *, int); struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); int ip_optcopy(struct ip *, struct ip *); int ip_pcbopts(struct inpcb *, int, struct mbuf *); -void ip_stripoptions(struct mbuf *, struct mbuf *); +void ip_stripoptions(struct mbuf *); struct mbuf *ip_srcroute(struct mbuf *); #endif /* !_NETINET_IP_OPTIONS_H_ */ diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c index a06fed68..81e7b123 100644 --- a/freebsd/sys/netinet/ip_output.c +++ b/freebsd/sys/netinet/ip_output.c @@ -34,27 +34,32 @@ #include __FBSDID("$FreeBSD$"); -#include +#include #include -#include #include #include +#include #include +#include #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -63,12 +68,15 @@ __FBSDID("$FreeBSD$"); #ifdef RADIX_MPATH #include #endif +#include #include #include +#include #include #include #include +#include #include #include #include @@ -86,25 +94,112 @@ __FBSDID("$FreeBSD$"); #include -VNET_DEFINE(u_short, ip_id); - #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif -static void ip_mloopback - (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); +static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; +static inline int +ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, + struct sockaddr_in *dst, int *fibnum, int *error) +{ + struct m_tag *fwd_tag = NULL; + struct mbuf *m; + struct in_addr odst; + struct ip *ip; + + m = *mp; + ip = mtod(m, struct ip *); + + /* Run through list of hooks for output packets. */ + odst.s_addr = ip->ip_dst.s_addr; + *error = pfil_run_hooks(&V_inet_pfil_hook, mp, ifp, PFIL_OUT, inp); + m = *mp; + if ((*error) != 0 || m == NULL) + return 1; /* Finished */ + + ip = mtod(m, struct ip *); + + /* See if destination IP address was changed by packet filter. */ + if (odst.s_addr != ip->ip_dst.s_addr) { + m->m_flags |= M_SKIP_FIREWALL; + /* If destination is now ourself drop to ip_input(). */ + if (in_localip(ip->ip_dst)) { + m->m_flags |= M_FASTFWD_OURS; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + *error = netisr_queue(NETISR_IP, m); + return 1; /* Finished */ + } + + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + + return -1; /* Reloop */ + } + /* See if fib was changed by packet filter. 
*/ + if ((*fibnum) != M_GETFIB(m)) { + m->m_flags |= M_SKIP_FIREWALL; + *fibnum = M_GETFIB(m); + return -1; /* Reloop for FIB change */ + } + + /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ + if (m->m_flags & M_FASTFWD_OURS) { + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + + *error = netisr_queue(NETISR_IP, m); + return 1; /* Finished */ + } + /* Or forward to some other address? */ + if ((m->m_flags & M_IP_NEXTHOP) && + ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { + bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); + m->m_flags |= M_SKIP_FIREWALL; + m->m_flags &= ~M_IP_NEXTHOP; + m_tag_delete(m, fwd_tag); + + return -1; /* Reloop for CHANGE of dst */ + } + + return 0; +} + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). - * ip_len and ip_off are in host format. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be @@ -118,20 +213,22 @@ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { + struct rm_priotracker in_ifa_tracker; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; - int n; /* scratchpad */ int error = 0; struct sockaddr_in *dst; + const struct sockaddr_in *gw; struct in_ifaddr *ia; - int isbroadcast, sw_csum; + int isbroadcast; + uint16_t ip_len, ip_off; struct route iproute; struct rtentry *rte; /* cache for ro->ro_rt */ - struct in_addr odst; - struct m_tag *fwd_tag = NULL; + uint32_t fibnum; + int have_ia_ref; #ifdef IPSEC int no_route_but_check_spd = 0; #endif @@ -140,31 +237,21 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, inp->inp_flowtype); } } if (ro == NULL) { ro = &iproute; bzero(ro, sizeof (*ro)); - } + } else + ro->ro_flags |= RT_LLE_CACHE; #ifdef FLOWTABLE - if (ro->ro_rt == NULL) { - struct flentry *fle; - - /* - * The flow table returns route entries valid for up to 30 - * seconds; we rely on the remainder of ip_output() taking no - * longer than that long for the stability of ro_rt. The - * flow ID assignment must have happened before this point. - */ - fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET); - if (fle != NULL) - flow_to_route(fle, ro); - } + if (ro->ro_rt == NULL) + (void )flowtable_lookup(AF_INET, m, ro); #endif if (opt) { @@ -174,37 +261,49 @@ ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); + ip_len = ntohs(ip->ip_len); + ip_off = ntohs(ip->ip_off); - /* - * Fill in IP header. If we are not allowing fragmentation, - * then the ip_id field is meaningless, but we don't set it - * to zero. 
Doing so causes various problems when devices along - * the path (routers, load balancers, firewalls, etc.) illegally - * disable DF on our packet. Note that a 16-bit counter - * will wrap around in less than 10 seconds at 100 Mbit/s on a - * medium with MTU 1500. See Steven M. Bellovin, "A Technique - * for Counting NATted Hosts", Proc. IMW'02, available at - * . - */ if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; - ip->ip_id = ip_newid(); + ip_fillid(ip); IPSTAT_INC(ips_localout); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } + /* + * dst/gw handling: + * + * dst can be rewritten but always points to &ro->ro_dst. + * gw is readonly but can point either to dst OR rt_gateway, + * therefore we need restore gw if we're redoing lookup. + */ + gw = dst = (struct sockaddr_in *)&ro->ro_dst; + fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); + rte = ro->ro_rt; + if (rte == NULL) { + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } again: - dst = (struct sockaddr_in *)&ro->ro_dst; - ia = NULL; + /* + * Validate route against routing table additions; + * a better/more specific route might have been added. + */ + if (inp) + RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. + * Also check whether routing cache needs invalidation. */ rte = ro->ro_rt; if (rte && ((rte->rt_flags & RTF_UP) == 0 || @@ -212,16 +311,14 @@ again: !RT_LINK_IS_UP(rte->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { - RO_RTFREE(ro); - ro->ro_lle = NULL; - rte = NULL; - } - if (rte == NULL && fwd_tag == NULL) { - bzero(dst, sizeof(*dst)); - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = ip->ip_dst; + RTFREE(rte); + rte = ro->ro_rt = (struct rtentry *)NULL; + if (ro->ro_lle) + LLE_FREE(ro->ro_lle); /* zeros ro_lle */ + ro->ro_lle = (struct llentry *)NULL; } + ia = NULL; + have_ia_ref = 0; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an @@ -229,27 +326,33 @@ again: * or the destination address of a ptp interface. 
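[The comment block deleted above accompanied the old inline ID selection; that policy now lives behind ip_fillid(), declared later in this diff in ip_var.h. A sketch of the before/after at an originating call site — that the random-versus-sequential choice is driven by the V_ip_do_randomid knob is inferred from the macro removed from ip_var.h further down:

	/* Before: policy open-coded wherever a datagram originates. */
	ip->ip_id = ip_newid();	/* randomid ? ip_randomid() : htons(V_ip_id++) */

	/* After: one call, policy centralised in ip_fillid(). */
	ip_fillid(ip);
]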
*/ if (flags & IP_SENDONES) { - if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { + if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), + M_GETFIB(m)))) == NULL && + (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } + have_ia_ref = 1; ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = 1; } else if (flags & IP_ROUTETOIF) { - if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && - (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), + M_GETFIB(m)))) == NULL && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, + M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } + have_ia_ref = 1; ifp = ia->ia_ifp; ip->ip_ttl = 1; - isbroadcast = in_broadcast(dst->sin_addr, ifp); + isbroadcast = in_ifaddr_broadcast(dst->sin_addr, ia); } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* @@ -257,7 +360,9 @@ again: * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; - IFP_TO_IA(ifp, ia); + IFP_TO_IA(ifp, ia, &in_ifa_tracker); + if (ia) + have_ia_ref = 1; isbroadcast = 0; /* fool gcc */ } else { /* @@ -269,14 +374,14 @@ again: #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), - inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); + fibnum); #else - in_rtalloc_ign(ro, 0, - inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); + in_rtalloc_ign(ro, 0, fibnum); #endif rte = ro->ro_rt; } if (rte == NULL || + (rte->rt_flags & RTF_UP) == 0 || rte->rt_ifp == NULL || !RT_LINK_IS_UP(rte->rt_ifp)) { #ifdef IPSEC @@ -293,45 +398,37 @@ again: goto bad; } ia = ifatoia(rte->rt_ifa); - ifa_ref(&ia->ia_ifa); ifp = rte->rt_ifp; - rte->rt_rmx.rmx_pksent++; + counter_u64_add(rte->rt_pksent, 1); + rt_update_ro_flags(ro); if (rte->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in *)rte->rt_gateway; + gw = (struct sockaddr_in *)rte->rt_gateway; if (rte->rt_flags & RTF_HOST) isbroadcast = (rte->rt_flags & RTF_BROADCAST); else - isbroadcast = in_broadcast(dst->sin_addr, ifp); + isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); } + /* * Calculate MTU. If we have a route that is up, use that, * otherwise use the interface's MTU. */ - if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) { - /* - * This case can happen if the user changed the MTU - * of an interface after enabling IP on it. Because - * most netifs don't keep track of routes pointing to - * them, there is no way for one to update all its - * routes when the MTU is changed. - */ - if (rte->rt_rmx.rmx_mtu > ifp->if_mtu) - rte->rt_rmx.rmx_mtu = ifp->if_mtu; - mtu = rte->rt_rmx.rmx_mtu; - } else { + if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) + mtu = rte->rt_mtu; + else mtu = ifp->if_mtu; - } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, rte=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, rte, (rte != NULL) ? rte->rt_flags : 0, ifp)); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* - * IP destination address is multicast. Make sure "dst" + * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) 
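[In the hunks above, the ifa_ifwith*() lookups gain a FIB argument and return a held reference, which the new have_ia_ref flag tracks for release on the done/bad paths. A compressed sketch of the convention — the standalone fragment is illustrative, while sintosa(), RT_ALL_FIBS and ifa_free() come from the patch and its surrounding headers:

	struct ifaddr *ifa;

	ifa = ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m));	/* p2p peer, mbuf's FIB */
	if (ifa == NULL)
		ifa = ifa_ifwithnet(sintosa(dst), 0, RT_ALL_FIBS);	/* any FIB */
	if (ifa != NULL)
		ifa_free(ifa);		/* drop the reference the lookup took */
]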
*/ - dst = (struct sockaddr_in *)&ro->ro_dst; + gw = dst; /* * See if the caller provided any multicast options */ @@ -373,7 +470,7 @@ again: * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ - ip_mloopback(ifp, m, dst, hlen); + ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform @@ -432,23 +529,6 @@ again: } } - /* - * Verify that we have any chance at all of being able to queue the - * packet or packet fragments, unless ALTQ is enabled on the given - * interface in which case packetdrop should be done by queueing. - */ - n = ip->ip_len / mtu + 1; /* how many fragments ? */ - if ( -#ifdef ALTQ - (!ALTQ_IS_ENABLED(&ifp->if_snd)) && -#endif /* ALTQ */ - (ifp->if_snd.ifq_len + n) >= ifp->if_snd.ifq_maxlen ) { - error = ENOBUFS; - IPSTAT_INC(ips_odropped); - ifp->if_snd.ifq_drops += n; - goto bad; - } - /* * Look for broadcast address and * verify user is allowed to send @@ -464,7 +544,7 @@ again: goto bad; } /* don't allow broadcast messages to be fragmented */ - if (ip->ip_len > mtu) { + if (ip_len > mtu) { error = EMSGSIZE; goto bad; } @@ -475,7 +555,7 @@ again: sendit: #ifdef IPSEC - switch(ip_ipsec_output(&m, inp, &flags, &error)) { + switch(ip_ipsec_output(&m, inp, &error)) { case 1: goto bad; case -1: @@ -498,78 +578,29 @@ sendit: #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ - if (!PFIL_HOOKED(&V_inet_pfil_hook)) - goto passout; - - /* Run through list of hooks for output packets. */ - odst.s_addr = ip->ip_dst.s_addr; - error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); - if (error != 0 || m == NULL) - goto done; + if (PFIL_HOOKED(&V_inet_pfil_hook)) { + switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { + case 1: /* Finished */ + goto done; - ip = mtod(m, struct ip *); + case 0: /* Continue normally */ + ip = mtod(m, struct ip *); + break; - /* See if destination IP address was changed by packet filter. */ - if (odst.s_addr != ip->ip_dst.s_addr) { - m->m_flags |= M_SKIP_FIREWALL; - /* If destination is now ourself drop to ip_input(). */ - if (in_localip(ip->ip_dst)) { - m->m_flags |= M_FASTFWD_OURS; - if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = V_loif; - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; - } - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; -#ifdef SCTP - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; -#endif - error = netisr_queue(NETISR_IP, m); - goto done; - } else { - if (ia != NULL) + case -1: /* Need to try again */ + /* Reset everything for a new round */ + RO_RTFREE(ro); + if (have_ia_ref) ifa_free(&ia->ia_ifa); - goto again; /* Redo the routing table lookup. */ - } - } + ro->ro_prepend = NULL; + rte = NULL; + gw = dst; + ip = mtod(m, struct ip *); + goto again; - /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ - if (m->m_flags & M_FASTFWD_OURS) { - if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = V_loif; - if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { - m->m_pkthdr.csum_flags |= - CSUM_DATA_VALID | CSUM_PSEUDO_HDR; - m->m_pkthdr.csum_data = 0xffff; } -#ifdef SCTP - if (m->m_pkthdr.csum_flags & CSUM_SCTP) - m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; -#endif - m->m_pkthdr.csum_flags |= - CSUM_IP_CHECKED | CSUM_IP_VALID; - - error = netisr_queue(NETISR_IP, m); - goto done; - } - /* Or forward to some other address? 
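[All PFIL_OUT processing is now concentrated in ip_output_pfil(), which reports back through a three-way return value. Restated compactly from the function body earlier in this diff; the cleanup of the stale route that precedes the real goto again is elided here:

	switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) {
	case 1:		/* finished: dropped, or re-queued to the netisr */
		goto done;
	case 0:		/* continue: packet may still have been rewritten */
		ip = mtod(m, struct ip *);
		break;
	case -1:	/* destination, FIB or nexthop changed: redo routing */
		goto again;
	}
]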
*/ - if ((m->m_flags & M_IP_NEXTHOP) && - (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { - dst = (struct sockaddr_in *)&ro->ro_dst; - bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); - m->m_flags |= M_SKIP_FIREWALL; - m->m_flags &= ~M_IP_NEXTHOP; - m_tag_delete(m, fwd_tag); - if (ia != NULL) - ifa_free(&ia->ia_ifa); - goto again; } -passout: /* 127/8 must not appear on wire - RFC1122. */ if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { @@ -581,31 +612,28 @@ passout: } m->m_pkthdr.csum_flags |= CSUM_IP; - sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; - if (sw_csum & CSUM_DELAY_DATA) { + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); - sw_csum &= ~CSUM_DELAY_DATA; + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP - if (sw_csum & CSUM_SCTP) { + if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); - sw_csum &= ~CSUM_SCTP; + m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif - m->m_pkthdr.csum_flags &= ifp->if_hwassist; /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ - if (ip->ip_len <= mtu || - (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || - ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); + if (ip_len <= mtu || + (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); + m->m_pkthdr.csum_flags &= ~CSUM_IP; + } /* * Record statistics for this interface address. @@ -615,28 +643,30 @@ passout: */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) - ia->ia_ifa.if_opackets += - m->m_pkthdr.len / m->m_pkthdr.tso_segsz; + counter_u64_add(ia->ia_ifa.ifa_opackets, + m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; + counter_u64_add(ia->ia_ifa.ifa_opackets, 1); + + counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) - m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); + m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, - (struct sockaddr *)dst, ro); + (const struct sockaddr *)gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ - if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { + if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; @@ -646,7 +676,7 @@ passout: * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ - error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); + error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { @@ -655,17 +685,19 @@ passout: if (error == 0) { /* Record statistics for this interface address. 
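[The checksum hunks above retire the sw_csum scratch variable: pending software work is now tested directly by masking csum_flags against the interface's offload capabilities. The idiom, as used for delayed data checksums:

	/* Bit set in csum_flags but absent from if_hwassist: do it here. */
	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
		in_delayed_cksum(m);
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
	}
]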
*/ if (ia != NULL) { - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; + counter_u64_add(ia->ia_ifa.ifa_opackets, 1); + counter_u64_add(ia->ia_ifa.ifa_obytes, + m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ - m->m_flags &= ~(M_PROTOFLAGS); + m_clrprotoflags(m); + IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); error = (*ifp->if_output)(ifp, m, - (struct sockaddr *)dst, ro); + (const struct sockaddr *)gw, ro); } else m_freem(m); } @@ -674,9 +706,20 @@ passout: IPSTAT_INC(ips_fragmented); done: - if (ro == &iproute) + /* + * Release the route if using our private route, or if + * (with flowtable) we don't have our own reference. + */ + if (ro == &iproute || ro->ro_flags & RT_NORTREF) RO_RTFREE(ro); - if (ia != NULL) + else if (rte == NULL) + /* + * If the caller supplied a route but somehow the reference + * to it has been released need to prevent the caller + * calling RTFREE on it again. + */ + ro->ro_rt = NULL; + if (have_ia_ref) ifa_free(&ia->ia_ifa); return (error); bad: @@ -691,11 +734,10 @@ bad: * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) - * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, - u_long if_hwassist_flags, int sw_csum) + u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; @@ -705,8 +747,12 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, int firstlen; struct mbuf **mnext; int nfrags; + uint16_t ip_len, ip_off; + + ip_len = ntohs(ip->ip_len); + ip_off = ntohs(ip->ip_off); - if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ + if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } @@ -732,10 +778,10 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, } #endif if (len > PAGE_SIZE) { - /* - * Fragment large datagrams such that each segment - * contains a multiple of PAGE_SIZE amount of data, - * plus headers. This enables a receiver to perform + /* + * Fragment large datagrams such that each segment + * contains a multiple of PAGE_SIZE amount of data, + * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver @@ -747,7 +793,7 @@ ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, off = MIN(mtu, m0->m_pkthdr.len); /* - * firstlen (off - hlen) must be aligned on an + * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) @@ -776,22 +822,30 @@ smart_frag_failure: * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ - for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { + for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } - /* copy multicast and flowid flag, if any */ - m->m_flags |= (m0->m_flags & (M_FLOWID | M_MCAST)) | M_FRAG; - /* make sure the flowid is the same for the fragmented mbufs */ - M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); - m->m_pkthdr.flowid = m0->m_pkthdr.flowid; + /* + * Make sure the complete packet header gets copied + * from the originating mbuf to the newly created + * mbuf. 
This also ensures that existing firewall + * classification(s), VLAN tags and so on get copied + * to the resulting fragmented packet(s): + */ + if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { + m_free(m); + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload @@ -806,15 +860,14 @@ smart_frag_failure: mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; - /* XXX do we need to add ip->ip_off below ? */ - mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; - if (off + len >= ip->ip_len) { /* last fragment */ - len = ip->ip_len - off; - m->m_flags |= M_LASTFRAG; - } else + /* XXX do we need to add ip_off below ? */ + mhip->ip_off = ((off - hlen) >> 3) + ip_off; + if (off + len >= ip_len) + len = ip_len - off; + else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); - m->m_next = m_copym(m0, off, len, M_DONTWAIT); + m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ @@ -822,36 +875,33 @@ smart_frag_failure: goto done; } m->m_pkthdr.len = mhlen + len; - m->m_pkthdr.rcvif = NULL; #ifdef MAC mac_netinet_fragment(m0, m); #endif - m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); + m->m_pkthdr.csum_flags &= ~CSUM_IP; + } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); - /* set first marker for fragment chain */ - m0->m_flags |= M_FIRSTFRAG | M_FRAG; - m0->m_pkthdr.csum_data = nfrags; - /* * Update first fragment by trimming what's been copied out * and updating header. */ - m_adj(m0, hlen + firstlen - ip->ip_len); + m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); - ip->ip_off |= IP_MF; - ip->ip_off = htons(ip->ip_off); + ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; - if (sw_csum & CSUM_DELAY_IP) + if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); + m0->m_pkthdr.csum_flags &= ~CSUM_IP; + } done: *m_frag = m0; @@ -862,11 +912,12 @@ void in_delayed_cksum(struct mbuf *m) { struct ip *ip; - u_short csum, offset; + uint16_t csum, offset, ip_len; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; - csum = in_cksum_skip(m, ip->ip_len, offset); + ip_len = ntohs(ip->ip_len); + csum = in_cksum_skip(m, ip_len, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ @@ -889,6 +940,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; +#ifdef RSS + uint32_t rss_bucket; + int retval; +#endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { @@ -941,7 +996,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) error = EMSGSIZE; break; } - MGET(m, sopt->sopt_td ? M_WAIT : M_DONTWAIT, MT_DATA); + m = m_get(sopt->sopt_td ? 
M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; @@ -967,6 +1022,10 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; } /* FALLTHROUGH */ + case IP_BINDMULTI: +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: +#endif case IP_TOS: case IP_TTL: case IP_MINTTL: @@ -975,10 +1034,13 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: - case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RECVRSSBUCKETID: +#endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1009,6 +1071,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); \ } while (0) +#define OPTSET2(bit, val) do { \ + INP_WLOCK(inp); \ + if (val) \ + inp->inp_flags2 |= bit; \ + else \ + inp->inp_flags2 &= ~bit; \ + INP_WUNLOCK(inp); \ +} while (0) + case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; @@ -1029,10 +1100,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) OPTSET(INP_RECVIF); break; - case IP_FAITH: - OPTSET(INP_FAITH); - break; - case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; @@ -1045,9 +1112,30 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTOS: OPTSET(INP_RECVTOS); break; + case IP_BINDMULTI: + OPTSET2(INP_BINDMULTI, optval); + break; + case IP_RECVFLOWID: + OPTSET2(INP_RECVFLOWID, optval); + break; +#ifdef RSS + case IP_RSS_LISTEN_BUCKET: + if ((optval >= 0) && + (optval < rss_getnumbuckets())) { + inp->inp_rss_listen_bucket = optval; + OPTSET2(INP_RSS_BUCKET_SET, 1); + } else { + error = EINVAL; + } + break; + case IP_RECVRSSBUCKETID: + OPTSET2(INP_RECVRSSBUCKETID, optval); + break; +#endif } break; #undef OPTSET +#undef OPTSET2 /* * Multicast socket options are processed by the in_mcast @@ -1133,7 +1221,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_OPTIONS: case IP_RETOPTS: if (inp->inp_options) - error = sooptcopyout(sopt, + error = sooptcopyout(sopt, mtod(inp->inp_options, char *), inp->inp_options->m_len); @@ -1150,11 +1238,18 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: - case IP_FAITH: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: + case IP_BINDMULTI: + case IP_FLOWID: + case IP_FLOWTYPE: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RSSBUCKETID: + case IP_RECVRSSBUCKETID: +#endif switch (sopt->sopt_name) { case IP_TOS: @@ -1170,6 +1265,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) +#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 
1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); @@ -1200,10 +1296,6 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) optval = 0; break; - case IP_FAITH: - optval = OPTBIT(INP_FAITH); - break; - case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; @@ -1216,6 +1308,32 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; + case IP_FLOWID: + optval = inp->inp_flowid; + break; + case IP_FLOWTYPE: + optval = inp->inp_flowtype; + break; + case IP_RECVFLOWID: + optval = OPTBIT2(INP_RECVFLOWID); + break; +#ifdef RSS + case IP_RSSBUCKETID: + retval = rss_hash2bucket(inp->inp_flowid, + inp->inp_flowtype, + &rss_bucket); + if (retval == 0) + optval = rss_bucket; + else + error = EINVAL; + break; + case IP_RECVRSSBUCKETID: + optval = OPTBIT2(INP_RECVRSSBUCKETID); + break; +#endif + case IP_BINDMULTI: + optval = OPTBIT2(INP_BINDMULTI); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; @@ -1239,7 +1357,7 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) caddr_t req = NULL; size_t len = 0; - if (m != 0) { + if (m != NULL) { req = mtod(m, caddr_t); len = m->m_len; } @@ -1269,18 +1387,17 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt) * replicating that code here. */ static void -ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, - int hlen) +ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { - register struct ip *ip; + struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ - copym = m_dup(m, M_DONTWAIT); - if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_dup(m, M_NOWAIT); + if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ @@ -1296,17 +1413,8 @@ ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); - ip->ip_len = htons(ip->ip_len); - ip->ip_off = htons(ip->ip_off); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); -#if 1 /* XXX */ - if (dst->sin_family != AF_INET) { - printf("ip_mloopback: bad address family %d\n", - dst->sin_family); - dst->sin_family = AF_INET; - } -#endif - if_simloop(ifp, copym, dst->sin_family, 0); + if_simloop(ifp, copym, AF_INET, 0); } } diff --git a/freebsd/sys/netinet/ip_reass.c b/freebsd/sys/netinet/ip_reass.c new file mode 100644 index 00000000..aae24b9d --- /dev/null +++ b/freebsd/sys/netinet/ip_reass.c @@ -0,0 +1,660 @@ +#include + +/*- + * Copyright (c) 2015 Gleb Smirnoff + * Copyright (c) 2015 Adrian Chadd + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#ifdef MAC +#include +#endif + +SYSCTL_DECL(_net_inet_ip); + +/* + * Reassembly headers are stored in hash buckets. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) + +struct ipqbucket { + TAILQ_HEAD(ipqhead, ipq) head; + struct mtx lock; +}; + +static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); +#define V_ipq VNET(ipq) +static VNET_DEFINE(uint32_t, ipq_hashseed); +#define V_ipq_hashseed VNET(ipq_hashseed) + +#define IPQ_LOCK(i) mtx_lock(&V_ipq[i].lock) +#define IPQ_TRYLOCK(i) mtx_trylock(&V_ipq[i].lock) +#define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) +#define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) + +void ipreass_init(void); +void ipreass_drain(void); +void ipreass_slowtimo(void); +#ifdef VIMAGE +void ipreass_destroy(void); +#endif +static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); +static void ipreass_zone_change(void *); +static void ipreass_drain_tomax(void); +static void ipq_free(struct ipqhead *, struct ipq *); +static struct ipq * ipq_reuse(int); + +static inline void +ipq_timeout(struct ipqhead *head, struct ipq *fp) +{ + + IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); + ipq_free(head, fp); +} + +static inline void +ipq_drop(struct ipqhead *head, struct ipq *fp) +{ + + IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); + ipq_free(head, fp); +} + +static VNET_DEFINE(uma_zone_t, ipq_zone); +#define V_ipq_zone VNET(ipq_zone) +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET | + CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I", + "Maximum number of IPv4 fragment reassembly queue entries"); +SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET, + &VNET_NAME(ipq_zone), + "Current number of IPv4 fragment reassembly queue entries"); + +static VNET_DEFINE(int, noreass); +#define V_noreass VNET(noreass) + +static VNET_DEFINE(int, maxfragsperpacket); +#define V_maxfragsperpacket VNET(maxfragsperpacket) +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(maxfragsperpacket), 0, + "Maximum number of IPv4 fragments allowed per packet"); + +/* + * Take incoming datagram fragment and try to reassemble it into + * whole datagram. 
If the argument is the first fragment or one + * in between the function will return NULL and store the mbuf + * in the fragment chain. If the argument is the last fragment + * the packet will be reassembled and the pointer to the new + * mbuf returned for further processing. Only m_tags attached + * to the first packet/fragment are preserved. + * The IP header is *NOT* adjusted out of iplen. + */ +#define M_IP_FRAG M_PROTO9 +struct mbuf * +ip_reass(struct mbuf *m) +{ + struct ip *ip; + struct mbuf *p, *q, *nq, *t; + struct ipq *fp; + struct ipqhead *head; + int i, hlen, next; + u_int8_t ecn, ecn0; + uint32_t hash; +#ifdef RSS + uint32_t rss_hash, rss_type; +#endif + + /* + * If no reassembling or maxfragsperpacket are 0, + * never accept fragments. + */ + if (V_noreass == 1 || V_maxfragsperpacket == 0) { + IPSTAT_INC(ips_fragments); + IPSTAT_INC(ips_fragdropped); + m_freem(m); + return (NULL); + } + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + + /* + * Adjust ip_len to not reflect header, + * convert offset of this to bytes. + */ + ip->ip_len = htons(ntohs(ip->ip_len) - hlen); + if (ip->ip_off & htons(IP_MF)) { + /* + * Make sure that fragments have a data length + * that's a non-zero multiple of 8 bytes. + */ + if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) { + IPSTAT_INC(ips_toosmall); /* XXX */ + IPSTAT_INC(ips_fragdropped); + m_freem(m); + return (NULL); + } + m->m_flags |= M_IP_FRAG; + } else + m->m_flags &= ~M_IP_FRAG; + ip->ip_off = htons(ntohs(ip->ip_off) << 3); + + /* + * Attempt reassembly; if it succeeds, proceed. + * ip_reass() will return a different mbuf. + */ + IPSTAT_INC(ips_fragments); + m->m_pkthdr.PH_loc.ptr = ip; + + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + hash = ip->ip_src.s_addr ^ ip->ip_id; + hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK; + head = &V_ipq[hash].head; + IPQ_LOCK(hash); + + /* + * Look for queue of fragments + * of this datagram. + */ + TAILQ_FOREACH(fp, head, ipq_list) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && +#ifdef MAC + mac_ipq_match(m, fp) && +#endif + ip->ip_p == fp->ipq_p) + break; + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == NULL) { + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (fp == NULL) + fp = ipq_reuse(hash); +#ifdef MAC + if (mac_ipq_init(fp, M_NOWAIT) != 0) { + uma_zfree(V_ipq_zone, fp); + fp = NULL; + goto dropfrag; + } + mac_ipq_create(m, fp); +#endif + TAILQ_INSERT_HEAD(head, fp, ipq_list); + fp->ipq_nfrags = 1; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_src = ip->ip_src; + fp->ipq_dst = ip->ip_dst; + fp->ipq_frags = m; + m->m_nextpkt = NULL; + goto done; + } else { + fp->ipq_nfrags++; +#ifdef MAC + mac_ipq_update(m, fp); +#endif + } + +#define GETIP(m) ((struct ip*)((m)->m_pkthdr.PH_loc.ptr)) + + /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. + */ + ecn = ip->ip_tos & IPTOS_ECN_MASK; + ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) + goto dropfrag; + if (ecn0 != IPTOS_ECN_CE) + GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) + goto dropfrag; + + /* + * Find a segment which begins after this one does. 
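[Each datagram's fragments are steered into one of IPREASS_NHASH (64) independently locked buckets. The mapping, restated from the code above — the rationale, that a random per-VNET seed keeps remote senders from aiming all their fragments at a single bucket and lock, is an inference, not stated in the patch:

	uint32_t hash;

	hash = ip->ip_src.s_addr ^ ip->ip_id;
	hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);		/* per-bucket mtx, not one global reassembly lock */
]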
+ */ + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) + if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off)) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us, otherwise + * stick new segment in the proper place. + * + * If some of the data is dropped from the preceding + * segment, then it's checksum is invalidated. + */ + if (p) { + i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) - + ntohs(ip->ip_off); + if (i > 0) { + if (i >= ntohs(ip->ip_len)) + goto dropfrag; + m_adj(m, i); + m->m_pkthdr.csum_flags = 0; + ip->ip_off = htons(ntohs(ip->ip_off) + i); + ip->ip_len = htons(ntohs(ip->ip_len) - i); + } + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } else { + m->m_nextpkt = fp->ipq_frags; + fp->ipq_frags = m; + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) > + ntohs(GETIP(q)->ip_off); q = nq) { + i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) - + ntohs(GETIP(q)->ip_off); + if (i < ntohs(GETIP(q)->ip_len)) { + GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i); + GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i); + m_adj(q, i); + q->m_pkthdr.csum_flags = 0; + break; + } + nq = q->m_nextpkt; + m->m_nextpkt = nq; + IPSTAT_INC(ips_fragdropped); + fp->ipq_nfrags--; + m_freem(q); + } + + /* + * Check for complete reassembly and perform frag per packet + * limiting. + * + * Frag limiting is performed here so that the nth frag has + * a chance to complete the packet before we drop the packet. + * As a result, n+1 frags are actually allowed per packet, but + * only n will ever be stored. (n = maxfragsperpacket.) + * + */ + next = 0; + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { + if (ntohs(GETIP(q)->ip_off) != next) { + if (fp->ipq_nfrags > V_maxfragsperpacket) + ipq_drop(head, fp); + goto done; + } + next += ntohs(GETIP(q)->ip_len); + } + /* Make sure the last packet didn't have the IP_MF flag */ + if (p->m_flags & M_IP_FRAG) { + if (fp->ipq_nfrags > V_maxfragsperpacket) + ipq_drop(head, fp); + goto done; + } + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ + q = fp->ipq_frags; + ip = GETIP(q); + if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { + IPSTAT_INC(ips_toolong); + ipq_drop(head, fp); + goto done; + } + + /* + * Concatenate fragments. + */ + m = q; + t = m->m_next; + m->m_next = NULL; + m_cat(m, t); + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + for (q = nq; q != NULL; q = nq) { + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; + m_cat(m, q); + } + /* + * In order to do checksumming faster we do 'end-around carry' here + * (and not in for{} loop), though it implies we are not going to + * reassemble more than 64k fragments. + */ + while (m->m_pkthdr.csum_data & 0xffff0000) + m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) + + (m->m_pkthdr.csum_data >> 16); +#ifdef MAC + mac_ipq_reassemble(fp, m); + mac_ipq_destroy(fp); +#endif + + /* + * Create header for new ip packet by modifying header of first + * packet; dequeue and discard fragment reassembly header. + * Make header visible. 
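[The "end-around carry" folding near the end of the hunk above merits a worked example. Concatenation sums the fragments' 16-bit partial checksums into a 32-bit accumulator, which can overflow its low half:

	/*
	 * Example: csum_data == 0x0001fffe after summing two fragments.
	 * First pass: (0x0001fffe & 0xffff) + (0x0001fffe >> 16)
	 *           = 0xfffe + 0x0001 = 0xffff; upper half now 0, done.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
]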
+ */ + ip->ip_len = htons((ip->ip_hl << 2) + next); + ip->ip_src = fp->ipq_src; + ip->ip_dst = fp->ipq_dst; + TAILQ_REMOVE(head, fp, ipq_list); + uma_zfree(V_ipq_zone, fp); + m->m_len += (ip->ip_hl << 2); + m->m_data -= (ip->ip_hl << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ + m_fixhdr(m); + IPSTAT_INC(ips_reassembled); + IPQ_UNLOCK(hash); + +#ifdef RSS + /* + * Query the RSS layer for the flowid / flowtype for the + * mbuf payload. + * + * For now, just assume we have to calculate a new one. + * Later on we should check to see if the assigned flowid matches + * what RSS wants for the given IP protocol and if so, just keep it. + * + * We then queue into the relevant netisr so it can be dispatched + * to the correct CPU. + * + * Note - this may return 1, which means the flowid in the mbuf + * is correct for the configured RSS hash types and can be used. + */ + if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { + m->m_pkthdr.flowid = rss_hash; + M_HASHTYPE_SET(m, rss_type); + } + + /* + * Queue/dispatch for reprocessing. + * + * Note: this is much slower than just handling the frame in the + * current receive context. It's likely worth investigating + * why this is. + */ + netisr_dispatch(NETISR_IP_DIRECT, m); + return (NULL); +#endif + + /* Handle in-line */ + return (m); + +dropfrag: + IPSTAT_INC(ips_fragdropped); + if (fp != NULL) + fp->ipq_nfrags--; + m_freem(m); +done: + IPQ_UNLOCK(hash); + return (NULL); + +#undef GETIP +} + +/* + * Initialize IP reassembly structures. + */ +void +ipreass_init(void) +{ + + for (int i = 0; i < IPREASS_NHASH; i++) { + TAILQ_INIT(&V_ipq[i].head); + mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, + MTX_DEF | MTX_DUPOK); + } + V_ipq_hashseed = arc4random(); + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + + if (IS_DEFAULT_VNET(curvnet)) + EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change, + NULL, EVENTHANDLER_PRI_ANY); +} + +/* + * If a timer expires on a reassembly queue, discard it. + */ +void +ipreass_slowtimo(void) +{ + struct ipq *fp, *tmp; + + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp) + if (--fp->ipq_ttl == 0) + ipq_timeout(&V_ipq[i].head, fp); + IPQ_UNLOCK(i); + } +} + +/* + * Drain off all datagram fragments. + */ +void +ipreass_drain(void) +{ + + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + while(!TAILQ_EMPTY(&V_ipq[i].head)) + ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head)); + IPQ_UNLOCK(i); + } +} + +#ifdef VIMAGE +/* + * Destroy IP reassembly structures. + */ +void +ipreass_destroy(void) +{ + + ipreass_drain(); + uma_zdestroy(V_ipq_zone); + for (int i = 0; i < IPREASS_NHASH; i++) + mtx_destroy(&V_ipq[i].lock); +} +#endif + +/* + * After maxnipq has been updated, propagate the change to UMA. The UMA zone + * max has slightly different semantics than the sysctl, for historical + * reasons. + */ +static void +ipreass_drain_tomax(void) +{ + int target; + + /* + * If we are over the maximum number of fragments, + * drain off enough to get down to the new limit, + * stripping off last elements on queues. Every + * run we strip the oldest element from each bucket. 
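[ipreass_slowtimo() above implements the classic ager: every queue starts with ipq_ttl = IPFRAGTTL and each slow-timeout pass decrements it under the bucket lock. The core of the sweep — that ip_slowtimo() in ip_input.c is the caller, on the usual pr_slowtimo cadence, is an assumption from context:

	for (int i = 0; i < IPREASS_NHASH; i++) {
		IPQ_LOCK(i);
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
			if (--fp->ipq_ttl == 0)
				ipq_timeout(&V_ipq[i].head, fp);	/* ips_fragtimeout++ */
		IPQ_UNLOCK(i);
	}
]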
+ */ + target = uma_zone_get_max(V_ipq_zone); + while (uma_zone_get_cur(V_ipq_zone) > target) { + struct ipq *fp; + + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + if (fp != NULL) + ipq_timeout(&V_ipq[i].head, fp); + IPQ_UNLOCK(i); + } + } +} + +static void +ipreass_zone_change(void *tag) +{ + + uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + ipreass_drain_tomax(); +} + +/* + * Change the limit on the UMA zone, or disable the fragment allocation + * at all. Since 0 and -1 is a special values here, we need our own handler, + * instead of sysctl_handle_uma_zone_max(). + */ +static int +sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) +{ + int error, max; + + if (V_noreass == 0) { + max = uma_zone_get_max(V_ipq_zone); + if (max == 0) + max = -1; + } else + max = 0; + error = sysctl_handle_int(oidp, &max, 0, req); + if (error || !req->newptr) + return (error); + if (max > 0) { + /* + * XXXRW: Might be a good idea to sanity check the argument + * and place an extreme upper bound. + */ + max = uma_zone_set_max(V_ipq_zone, max); + ipreass_drain_tomax(); + V_noreass = 0; + } else if (max == 0) { + V_noreass = 1; + ipreass_drain(); + } else if (max == -1) { + V_noreass = 0; + uma_zone_set_max(V_ipq_zone, 0); + } else + return (EINVAL); + return (0); +} + +/* + * Seek for old fragment queue header that can be reused. Try to + * reuse a header from currently locked hash bucket. + */ +static struct ipq * +ipq_reuse(int start) +{ + struct ipq *fp; + int i; + + IPQ_LOCK_ASSERT(start); + + for (i = start;; i++) { + if (i == IPREASS_NHASH) + i = 0; + if (i != start && IPQ_TRYLOCK(i) == 0) + continue; + fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + if (fp) { + struct mbuf *m; + + IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); + while (fp->ipq_frags) { + m = fp->ipq_frags; + fp->ipq_frags = m->m_nextpkt; + m_freem(m); + } + TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list); + if (i != start) + IPQ_UNLOCK(i); + IPQ_LOCK_ASSERT(start); + return (fp); + } + if (i != start) + IPQ_UNLOCK(i); + } +} + +/* + * Free a fragment reassembly header and all associated datagrams. 
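[sysctl_maxfragpackets() overloads its value space: a positive value caps the UMA zone, 0 disables reassembly altogether (V_noreass), and -1 removes the limit. A hedged userland sketch of driving the knob, assuming only the stock sysctl(3) interface:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <err.h>

	int v = 0;	/* 0: drop all fragments; -1: unlimited; >0: queue cap */

	if (sysctlbyname("net.inet.ip.maxfragpackets", NULL, NULL,
	    &v, sizeof(v)) == -1)
		err(1, "net.inet.ip.maxfragpackets");
]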
+ */ +static void +ipq_free(struct ipqhead *fhp, struct ipq *fp) +{ + struct mbuf *q; + + while (fp->ipq_frags) { + q = fp->ipq_frags; + fp->ipq_frags = q->m_nextpkt; + m_freem(q); + } + TAILQ_REMOVE(fhp, fp, ipq_list); + uma_zfree(V_ipq_zone, fp); +} diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h index b07ef162..847704fd 100644 --- a/freebsd/sys/netinet/ip_var.h +++ b/freebsd/sys/netinet/ip_var.h @@ -93,50 +93,54 @@ struct ip_moptions { u_short imo_max_memberships; /* max memberships this socket */ struct in_multi **imo_membership; /* group memberships */ struct in_mfilter *imo_mfilters; /* source filters */ + STAILQ_ENTRY(ip_moptions) imo_link; }; struct ipstat { - u_long ips_total; /* total packets received */ - u_long ips_badsum; /* checksum bad */ - u_long ips_tooshort; /* packet too short */ - u_long ips_toosmall; /* not enough data */ - u_long ips_badhlen; /* ip header length < data size */ - u_long ips_badlen; /* ip length < ip header length */ - u_long ips_fragments; /* fragments received */ - u_long ips_fragdropped; /* frags dropped (dups, out of space) */ - u_long ips_fragtimeout; /* fragments timed out */ - u_long ips_forward; /* packets forwarded */ - u_long ips_fastforward; /* packets fast forwarded */ - u_long ips_cantforward; /* packets rcvd for unreachable dest */ - u_long ips_redirectsent; /* packets forwarded on same net */ - u_long ips_noproto; /* unknown or unsupported protocol */ - u_long ips_delivered; /* datagrams delivered to upper level*/ - u_long ips_localout; /* total ip packets generated here */ - u_long ips_odropped; /* lost packets due to nobufs, etc. */ - u_long ips_reassembled; /* total packets reassembled ok */ - u_long ips_fragmented; /* datagrams successfully fragmented */ - u_long ips_ofragments; /* output fragments created */ - u_long ips_cantfrag; /* don't fragment flag was set, etc. */ - u_long ips_badoptions; /* error in option processing */ - u_long ips_noroute; /* packets discarded due to no route */ - u_long ips_badvers; /* ip version != 4 */ - u_long ips_rawout; /* total raw ip packets generated */ - u_long ips_toolong; /* ip length > max ip packet size */ - u_long ips_notmember; /* multicasts for unregistered grps */ - u_long ips_nogif; /* no match gif found */ - u_long ips_badaddr; /* invalid address on header */ + uint64_t ips_total; /* total packets received */ + uint64_t ips_badsum; /* checksum bad */ + uint64_t ips_tooshort; /* packet too short */ + uint64_t ips_toosmall; /* not enough data */ + uint64_t ips_badhlen; /* ip header length < data size */ + uint64_t ips_badlen; /* ip length < ip header length */ + uint64_t ips_fragments; /* fragments received */ + uint64_t ips_fragdropped; /* frags dropped (dups, out of space) */ + uint64_t ips_fragtimeout; /* fragments timed out */ + uint64_t ips_forward; /* packets forwarded */ + uint64_t ips_fastforward; /* packets fast forwarded */ + uint64_t ips_cantforward; /* packets rcvd for unreachable dest */ + uint64_t ips_redirectsent; /* packets forwarded on same net */ + uint64_t ips_noproto; /* unknown or unsupported protocol */ + uint64_t ips_delivered; /* datagrams delivered to upper level*/ + uint64_t ips_localout; /* total ip packets generated here */ + uint64_t ips_odropped; /* lost packets due to nobufs, etc. */ + uint64_t ips_reassembled; /* total packets reassembled ok */ + uint64_t ips_fragmented; /* datagrams successfully fragmented */ + uint64_t ips_ofragments; /* output fragments created */ + uint64_t ips_cantfrag; /* don't fragment flag was set, etc. 
*/ + uint64_t ips_badoptions; /* error in option processing */ + uint64_t ips_noroute; /* packets discarded due to no route */ + uint64_t ips_badvers; /* ip version != 4 */ + uint64_t ips_rawout; /* total raw ip packets generated */ + uint64_t ips_toolong; /* ip length > max ip packet size */ + uint64_t ips_notmember; /* multicasts for unregistered grps */ + uint64_t ips_nogif; /* no match gif found */ + uint64_t ips_badaddr; /* invalid address on header */ }; #ifdef _KERNEL +#include #include +VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define IPSTAT_ADD(name, val) V_ipstat.name += (val) -#define IPSTAT_SUB(name, val) V_ipstat.name -= (val) +#define IPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val)) +#define IPSTAT_SUB(name, val) IPSTAT_ADD(name, -(val)) #define IPSTAT_INC(name) IPSTAT_ADD(name, 1) #define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) @@ -144,11 +148,11 @@ struct ipstat { * Kernel module consumers must use this accessor macro. */ void kmod_ipstat_inc(int statnum); -#define KMOD_IPSTAT_INC(name) \ - kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(u_long)) +#define KMOD_IPSTAT_INC(name) \ + kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t)) void kmod_ipstat_dec(int statnum); -#define KMOD_IPSTAT_DEC(name) \ - kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(u_long)) +#define KMOD_IPSTAT_DEC(name) \ + kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t)) /* flags passed to ip_output as last parameter */ #define IP_FORWARDING 0x1 /* most of ip header exists */ @@ -157,12 +161,7 @@ void kmod_ipstat_dec(int statnum); #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ - -/* - * mbuf flag used by ip_fastfwd - */ -#define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */ -#define M_IP_NEXTHOP M_PROTO2 /* explicit ip nexthop */ +#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 @@ -175,8 +174,6 @@ struct inpcb; struct route; struct sockopt; -VNET_DECLARE(struct ipstat, ipstat); -VNET_DECLARE(u_short, ip_id); /* ip packet ctr, for ids */ VNET_DECLARE(int, ip_defttl); /* default IP ttl */ VNET_DECLARE(int, ipforwarding); /* ip forwarding */ #ifdef IPSTEALTH @@ -191,7 +188,6 @@ VNET_DECLARE(int, rsvp_on); VNET_DECLARE(int, drop_redirect); extern struct pr_usrreqs rip_usrreqs; -#define V_ipstat VNET(ipstat) #define V_ip_id VNET(ip_id) #define V_ip_defttl VNET(ip_defttl) #define V_ipforwarding VNET(ipforwarding) @@ -210,12 +206,9 @@ int inp_setmoptions(struct inpcb *, struct sockopt *); int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, - u_long if_hwassist_flags, int sw_csum); + u_long if_hwassist_flags); void ip_forward(struct mbuf *m, int srcrt); void ip_init(void); -#ifdef VIMAGE -void ip_destroy(void); -#endif extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); @@ -226,27 +219,22 @@ int ipproto_register(short); int ipproto_unregister(short); struct mbuf * ip_reass(struct mbuf *); -struct in_ifaddr * - ip_rtaddr(struct in_addr, u_int fibnum); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); -u_int16_t ip_randomid(void); +void 
ip_fillid(struct ip *); int rip_ctloutput(struct socket *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); -#ifdef VIMAGE -void rip_destroy(void); -#endif -void rip_input(struct mbuf *, int); -int rip_output(struct mbuf *, struct socket *, u_long); -void ipip_input(struct mbuf *, int); -void rsvp_input(struct mbuf *, int); +int rip_input(struct mbuf **, int *, int); +int rip_output(struct mbuf *, struct socket *, ...); +int ipip_input(struct mbuf **, int *, int); +int rsvp_input(struct mbuf **, int *, int); int ip_rsvp_init(struct socket *); int ip_rsvp_done(void); extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); extern void (*ip_rsvp_force_done)(struct socket *); -extern void (*rsvp_input_p)(struct mbuf *m, int off); +extern int (*rsvp_input_p)(struct mbuf **, int *, int); VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */ #define V_inet_pfil_hook VNET(inet_pfil_hook) @@ -285,7 +273,7 @@ enum { IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, - IPFW_IS_PIPE = 0x08000000, /* pip1=1, queue = 0 */ + IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ @@ -294,9 +282,7 @@ enum { struct ip_fw_args; typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args); typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); -VNET_DECLARE(ip_fw_chk_ptr_t, ip_fw_chk_ptr); VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); -#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) /* Divert hooks. */ @@ -307,12 +293,6 @@ extern int (*ng_ipfw_input_p)(struct mbuf **, int, extern int (*ip_dn_ctl_ptr)(struct sockopt *); extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); - -VNET_DECLARE(int, ip_do_randomid); -#define V_ip_do_randomid VNET(ip_do_randomid) -#define ip_newid() ((V_ip_do_randomid != 0) ? ip_randomid() : \ - htons(V_ip_id++)) - #endif /* _KERNEL */ #endif /* !_NETINET_IP_VAR_H_ */ diff --git a/freebsd/sys/netinet/libalias/alias.c b/freebsd/sys/netinet/libalias/alias.c index 9e975122..a2cd987c 100644 --- a/freebsd/sys/netinet/libalias/alias.c +++ b/freebsd/sys/netinet/libalias/alias.c @@ -1724,7 +1724,7 @@ LibAliasUnLoadAllModule(void) /* Unload all modules then reload everything. */ while ((p = first_handler()) != NULL) { - detach_handler(p); + LibAliasDetachHandlers(p); } while ((t = walk_dll_chain()) != NULL) { dlclose(t->handle); @@ -1751,40 +1751,22 @@ LibAliasUnLoadAllModule(void) struct mbuf * m_megapullup(struct mbuf *m, int len) { struct mbuf *mcl; - + if (len > m->m_pkthdr.len) goto bad; - - /* Do not reallocate packet if it is sequentional, - * writable and has some extra space for expansion. - * XXX: Constant 100bytes is completely empirical. 
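
The size ladder deleted in this hunk chose a cluster class by hand; its replacement, m_get2(9), performs the same walk internally, which also retires the empirical RESERVE slack (and, in stock FreeBSD, m_get2() tops out at a page-sized cluster and returns NULL for larger requests). A rough illustration of the selection logic, with the class sizes hard-coded purely for the sketch:

#include <stdio.h>

/* Stand-ins for MCLBYTES, MJUMPAGESIZE, MJUM9BYTES and MJUM16BYTES. */
static const int classes[] = { 2048, 4096, 9216, 16384 };

static int
pick_cluster(int len)
{
	/* Take the first (smallest) class the request fits into. */
	for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		if (len <= classes[i])
			return (classes[i]);
	return (-1);		/* too big for any cluster class */
}

int
main(void)
{
	printf("%d\n", pick_cluster(3000));	/* prints 4096 */
	return (0);
}
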
*/ -#define RESERVE 100 - if (m->m_next == NULL && M_WRITABLE(m) && M_TRAILINGSPACE(m) >= RESERVE) + + if (m->m_next == NULL && M_WRITABLE(m)) return (m); - if (len <= MCLBYTES - RESERVE) { - mcl = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); - } else if (len < MJUM16BYTES) { - int size; - if (len <= MJUMPAGESIZE - RESERVE) { - size = MJUMPAGESIZE; - } else if (len <= MJUM9BYTES - RESERVE) { - size = MJUM9BYTES; - } else { - size = MJUM16BYTES; - }; - mcl = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size); - } else { - goto bad; - } + mcl = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); if (mcl == NULL) goto bad; - + m_align(mcl, len); m_move_pkthdr(mcl, m); m_copydata(m, 0, len, mtod(mcl, caddr_t)); mcl->m_len = mcl->m_pkthdr.len = len; m_freem(m); - + return (mcl); bad: m_freem(m); diff --git a/freebsd/sys/netinet/libalias/alias_cuseeme.c b/freebsd/sys/netinet/libalias/alias_cuseeme.c index 1bdb7c4a..d6c9520c 100644 --- a/freebsd/sys/netinet/libalias/alias_cuseeme.c +++ b/freebsd/sys/netinet/libalias/alias_cuseeme.c @@ -58,14 +58,14 @@ __FBSDID("$FreeBSD$"); #define CUSEEME_PORT_NUMBER 7648 static void -AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip, +AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip, struct alias_link *lnk); static void -AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip, +AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip, struct in_addr original_addr); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { @@ -76,7 +76,7 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -84,7 +84,7 @@ protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) return (0); } -static int +static int protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -94,20 +94,20 @@ protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) /* Kernel module definition. 
*/ struct proto_handler handlers[] = { - { - .pri = 120, - .dir = OUT, - .proto = UDP, - .fingerprint = &fingerprint, + { + .pri = 120, + .dir = OUT, + .proto = UDP, + .fingerprint = &fingerprint, .protohandler = &protohandlerout - }, + }, { - .pri = 120, - .dir = IN, - .proto = UDP, - .fingerprint = &fingerprint, + .pri = 120, + .dir = IN, + .proto = UDP, + .fingerprint = &fingerprint, .protohandler = &protohandlerin - }, + }, { EOH } }; @@ -132,9 +132,9 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif -moduledata_t +moduledata_t alias_mod = { "alias_cuseeme", mod_handler, NULL }; diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c index fabe586e..219d5d34 100644 --- a/freebsd/sys/netinet/libalias/alias_db.c +++ b/freebsd/sys/netinet/libalias/alias_db.c @@ -148,6 +148,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -350,24 +351,16 @@ MODULE_VERSION(libalias, 1); static int alias_mod_handler(module_t mod, int type, void *data) { - int error; switch (type) { - case MOD_LOAD: - error = 0; - handler_chain_init(); - break; case MOD_QUIESCE: case MOD_UNLOAD: - handler_chain_destroy(); finishoff(); - error = 0; - break; + case MOD_LOAD: + return (0); default: - error = EINVAL; + return (EINVAL); } - - return (error); } static moduledata_t alias_mod = { @@ -793,9 +786,9 @@ FindNewPortGroup(struct libalias *la, struct alias_link *search_result; for (j = 0; j < port_count; j++) - if (0 != (search_result = FindLinkIn(la, dst_addr, alias_addr, - dst_port, htons(port_sys + j), - link_type, 0))) + if ((search_result = FindLinkIn(la, dst_addr, + alias_addr, dst_port, htons(port_sys + j), + link_type, 0)) != NULL) break; /* Found a good range, return base */ diff --git a/freebsd/sys/netinet/libalias/alias_dummy.c b/freebsd/sys/netinet/libalias/alias_dummy.c index eacfac86..b4c00c20 100644 --- a/freebsd/sys/netinet/libalias/alias_dummy.c +++ b/freebsd/sys/netinet/libalias/alias_dummy.c @@ -29,7 +29,7 @@ #include __FBSDID("$FreeBSD$"); -/* +/* * Alias_dummy is just an empty skeleton used to demostrate how to write * a module for libalias, that will run unalterated in userland or in * kernel land. @@ -61,19 +61,19 @@ __FBSDID("$FreeBSD$"); static void AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { - /* - * Check here all the data that will be used later, if any field + /* + * Check here all the data that will be used later, if any field * is empy/NULL, return a -1 value. */ - if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->maxpktsize == 0) return (-1); - /* - * Fingerprint the incoming packet, if it matches any conditions + /* + * Fingerprint the incoming packet, if it matches any conditions * return an OK value. */ if (ntohs(*ah->dport) == 123 @@ -82,12 +82,12 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); /* I don't recognize this packet. */ } -/* - * Wrap in this general purpose function, the real function used to alias the +/* + * Wrap in this general purpose function, the real function used to alias the * packets. 
*/ -static int +static int protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -95,22 +95,22 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) return (0); } -/* - * NOTA BENE: the next variable MUST NOT be renamed in any case if you want - * your module to work in userland, cause it's used to find and use all +/* + * NOTA BENE: the next variable MUST NOT be renamed in any case if you want + * your module to work in userland, cause it's used to find and use all * the protocol handlers present in every module. - * So WATCH OUT, your module needs this variables and it needs it with + * So WATCH OUT, your module needs this variables and it needs it with * ITS EXACT NAME: handlers. */ struct proto_handler handlers [] = { - { - .pri = 666, - .dir = IN|OUT, - .proto = UDP|TCP, - .fingerprint = &fingerprint, + { + .pri = 666, + .dir = IN|OUT, + .proto = UDP|TCP, + .fingerprint = &fingerprint, .protohandler = &protohandler - }, + }, { EOH } }; @@ -119,7 +119,7 @@ mod_handler(module_t mod, int type, void *data) { int error; - switch (type) { + switch (type) { case MOD_LOAD: error = 0; LibAliasAttachHandlers(handlers); diff --git a/freebsd/sys/netinet/libalias/alias_irc.c b/freebsd/sys/netinet/libalias/alias_irc.c index 880d897e..44ff6d92 100644 --- a/freebsd/sys/netinet/libalias/alias_irc.c +++ b/freebsd/sys/netinet/libalias/alias_irc.c @@ -46,7 +46,7 @@ __FBSDID("$FreeBSD$"); Version 2.1: May, 1997 (cjm) Very minor changes to conform with local/global/function naming conventions - withing the packet alising module. + within the packet alising module. */ /* Includes */ @@ -94,11 +94,11 @@ static void AliasHandleIrcOut(struct libalias *, struct ip *, struct alias_link *, int maxpacketsize); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { - if (ah->dport == NULL || ah->dport == NULL || ah->lnk == NULL || + if (ah->dport == NULL || ah->dport == NULL || ah->lnk == NULL || ah->maxpktsize == 0) return (-1); if (ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_1 @@ -107,7 +107,7 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -120,13 +120,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) } struct proto_handler handlers[] = { - { - .pri = 90, - .dir = OUT, - .proto = TCP, - .fingerprint = &fingerprint, + { + .pri = 90, + .dir = OUT, + .proto = TCP, + .fingerprint = &fingerprint, .protohandler = &protohandler - }, + }, { EOH } }; @@ -151,7 +151,7 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif moduledata_t alias_mod = { "alias_irc", mod_handler, NULL @@ -484,7 +484,7 @@ lPACKET_DONE: which will generate a type-error on all but 32-bit machines. [Note 2] This routine really ought to be replaced with one that - creates a transparent proxy on the aliasing host, to allow arbitary + creates a transparent proxy on the aliasing host, to allow arbitrary changes in the TCP stream. This should not be too difficult given this base; I (ee) will try to do this some time later. 
*/ diff --git a/freebsd/sys/netinet/libalias/alias_local.h b/freebsd/sys/netinet/libalias/alias_local.h index a7b3fe19..3010be84 100644 --- a/freebsd/sys/netinet/libalias/alias_local.h +++ b/freebsd/sys/netinet/libalias/alias_local.h @@ -357,7 +357,7 @@ void PunchFWHole(struct alias_link *_lnk); /* Housekeeping function */ void HouseKeeping(struct libalias *); -/* Tcp specfic routines */ +/* Tcp specific routines */ /* lint -save -library Suppress flexelint warnings */ /* Transparent proxy routines */ diff --git a/freebsd/sys/netinet/libalias/alias_mod.c b/freebsd/sys/netinet/libalias/alias_mod.c index 0e0bd56a..6acbbee6 100644 --- a/freebsd/sys/netinet/libalias/alias_mod.c +++ b/freebsd/sys/netinet/libalias/alias_mod.c @@ -54,201 +54,82 @@ __FBSDID("$FreeBSD$"); #endif /* Protocol and userland module handlers chains. */ -LIST_HEAD(handler_chain, proto_handler) handler_chain = LIST_HEAD_INITIALIZER(handler_chain); -#ifdef _KERNEL -struct rwlock handler_rw; -#endif -SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain); - -#ifdef _KERNEL - -#define LIBALIAS_RWLOCK_INIT() \ - rw_init(&handler_rw, "Libalias_modules_rwlock") -#define LIBALIAS_RWLOCK_DESTROY() rw_destroy(&handler_rw) -#define LIBALIAS_WLOCK_ASSERT() \ - rw_assert(&handler_rw, RA_WLOCKED) - -static __inline void -LIBALIAS_RLOCK(void) -{ - rw_rlock(&handler_rw); -} - -static __inline void -LIBALIAS_RUNLOCK(void) -{ - rw_runlock(&handler_rw); -} - -static __inline void -LIBALIAS_WLOCK(void) -{ - rw_wlock(&handler_rw); -} - -static __inline void -LIBALIAS_WUNLOCK(void) -{ - rw_wunlock(&handler_rw); -} - -static void -_handler_chain_init(void) -{ - - if (!rw_initialized(&handler_rw)) - LIBALIAS_RWLOCK_INIT(); -} - -static void -_handler_chain_destroy(void) -{ - - if (rw_initialized(&handler_rw)) - LIBALIAS_RWLOCK_DESTROY(); -} - -#else -#define LIBALIAS_RWLOCK_INIT() ; -#define LIBALIAS_RWLOCK_DESTROY() ; -#define LIBALIAS_WLOCK_ASSERT() ; -#define LIBALIAS_RLOCK() ; -#define LIBALIAS_RUNLOCK() ; -#define LIBALIAS_WLOCK() ; -#define LIBALIAS_WUNLOCK() ; -#define _handler_chain_init() ; -#define _handler_chain_destroy() ; -#endif - -void -handler_chain_init(void) -{ - _handler_chain_init(); -} - -void -handler_chain_destroy(void) -{ - _handler_chain_destroy(); -} +static TAILQ_HEAD(handler_chain, proto_handler) handler_chain = + TAILQ_HEAD_INITIALIZER(handler_chain); static int -_attach_handler(struct proto_handler *p) +attach_handler(struct proto_handler *p) { struct proto_handler *b; - LIBALIAS_WLOCK_ASSERT(); - b = NULL; - LIST_FOREACH(b, &handler_chain, entries) { - if ((b->pri == p->pri) && + TAILQ_FOREACH(b, &handler_chain, link) { + if ((b->pri == p->pri) && (b->dir == p->dir) && (b->proto == p->proto)) - return (EEXIST); /* Priority conflict. */ + return (EEXIST); if (b->pri > p->pri) { - LIST_INSERT_BEFORE(b, p, entries); + TAILQ_INSERT_BEFORE(b, p, link); return (0); } } - /* End of list or found right position, inserts here. */ - if (b) - LIST_INSERT_AFTER(b, p, entries); - else - LIST_INSERT_HEAD(&handler_chain, p, entries); - return (0); -} -static int -_detach_handler(struct proto_handler *p) -{ - struct proto_handler *b, *b_tmp; + TAILQ_INSERT_TAIL(&handler_chain, p, link); - LIBALIAS_WLOCK_ASSERT(); - LIST_FOREACH_SAFE(b, &handler_chain, entries, b_tmp) { - if (b == p) { - LIST_REMOVE(b, entries); - return (0); - } - } - return (ENOENT); /* Handler not found. 
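
The rewritten attach_handler() above keeps the chain sorted by ascending priority: scan for the first entry with a higher pri and insert before it, otherwise append at the tail. A standalone sketch of just that ordering logic (duplicate detection dropped, allocations unchecked):

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct handler {
	int pri;
	TAILQ_ENTRY(handler) link;
};

static TAILQ_HEAD(, handler) chain = TAILQ_HEAD_INITIALIZER(chain);

static void
attach(struct handler *p)
{
	struct handler *b;

	/* Stop at the first higher priority and slot in before it. */
	TAILQ_FOREACH(b, &chain, link)
		if (b->pri > p->pri) {
			TAILQ_INSERT_BEFORE(b, p, link);
			return;
		}
	TAILQ_INSERT_TAIL(&chain, p, link);
}

int
main(void)
{
	int pris[] = { 120, 90, 200, 110 };
	struct handler *b;

	for (size_t i = 0; i < sizeof(pris) / sizeof(pris[0]); i++) {
		struct handler *p = calloc(1, sizeof(*p));

		p->pri = pris[i];
		attach(p);
	}
	TAILQ_FOREACH(b, &chain, link)
		printf("%d ", b->pri);	/* prints: 90 110 120 200 */
	printf("\n");
	return (0);
}
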
*/ + return (0); } int -LibAliasAttachHandlers(struct proto_handler *_p) +LibAliasAttachHandlers(struct proto_handler *p) { - int i, error; + int error; - LIBALIAS_WLOCK(); - error = -1; - for (i = 0; 1; i++) { - if (*((int *)&_p[i]) == EOH) - break; - error = _attach_handler(&_p[i]); - if (error != 0) - break; + while (p->dir != NODIR) { + error = attach_handler(p); + if (error) + return (error); + p++; } - LIBALIAS_WUNLOCK(); - return (error); + + return (0); } +/* XXXGL: should be void, but no good reason to break ABI */ int -LibAliasDetachHandlers(struct proto_handler *_p) +LibAliasDetachHandlers(struct proto_handler *p) { - int i, error; - LIBALIAS_WLOCK(); - error = -1; - for (i = 0; 1; i++) { - if (*((int *)&_p[i]) == EOH) - break; - error = _detach_handler(&_p[i]); - if (error != 0) - break; + while (p->dir != NODIR) { + TAILQ_REMOVE(&handler_chain, p, link); + p++; } - LIBALIAS_WUNLOCK(); - return (error); -} - -int -detach_handler(struct proto_handler *_p) -{ - int error; - LIBALIAS_WLOCK(); - error = -1; - error = _detach_handler(_p); - LIBALIAS_WUNLOCK(); - return (error); + return (0); } int -find_handler(int8_t dir, int8_t proto, struct libalias *la, __unused struct ip *pip, +find_handler(int8_t dir, int8_t proto, struct libalias *la, struct ip *ip, struct alias_data *ad) { struct proto_handler *p; - int error; - LIBALIAS_RLOCK(); - error = ENOENT; - LIST_FOREACH(p, &handler_chain, entries) { - if ((p->dir & dir) && (p->proto & proto)) - if (p->fingerprint(la, ad) == 0) { - error = p->protohandler(la, pip, ad); - break; - } - } - LIBALIAS_RUNLOCK(); - return (error); + TAILQ_FOREACH(p, &handler_chain, link) + if ((p->dir & dir) && (p->proto & proto) && + p->fingerprint(la, ad) == 0) + return (p->protohandler(la, ip, ad)); + + return (ENOENT); } struct proto_handler * first_handler(void) { - - return (LIST_FIRST(&handler_chain)); + + return (TAILQ_FIRST(&handler_chain)); } +#ifndef _KERNEL /* Dll manipulation code - this code is not thread safe... */ - +SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain); int attach_dll(struct dll *p) { @@ -272,7 +153,7 @@ detach_dll(char *p) error = NULL; SLIST_FOREACH_SAFE(b, &dll_chain, next, b_tmp) if (!strncmp(b->name, p, DLL_LEN)) { - SLIST_REMOVE(&dll_chain, b, dll, next); + SLIST_REMOVE(&dll_chain, b, dll, next); error = b; break; } @@ -290,3 +171,4 @@ walk_dll_chain(void) SLIST_REMOVE_HEAD(&dll_chain, next); return (t); } +#endif /* !_KERNEL */ diff --git a/freebsd/sys/netinet/libalias/alias_mod.h b/freebsd/sys/netinet/libalias/alias_mod.h index 727df8e6..fd020c46 100644 --- a/freebsd/sys/netinet/libalias/alias_mod.h +++ b/freebsd/sys/netinet/libalias/alias_mod.h @@ -54,102 +54,92 @@ MALLOC_DECLARE(M_ALIAS); #endif #endif -/* Protocol handlers struct & function. */ +/* Packet flow direction flags. */ +#define IN 0x0001 +#define OUT 0x0002 +#define NODIR 0x4000 -/* Packet flow direction. */ -#define IN 1 -#define OUT 2 +/* Working protocol flags. */ +#define IP 0x01 +#define TCP 0x02 +#define UDP 0x04 -/* Working protocol. */ -#define IP 1 -#define TCP 2 -#define UDP 4 - -/* +/* * Data passed to protocol handler module, it must be filled * right before calling find_handler() to determine which * module is elegible to be called. */ +struct alias_data { + struct alias_link *lnk; + struct in_addr *oaddr; /* Original address. */ + struct in_addr *aaddr; /* Alias address. */ + uint16_t *aport; /* Alias port. */ + uint16_t *sport, *dport; /* Source & destination port */ + uint16_t maxpktsize; /* Max packet size. 
*/ +}; -struct alias_data { - struct alias_link *lnk; - struct in_addr *oaddr; /* Original address. */ - struct in_addr *aaddr; /* Alias address. */ - uint16_t *aport; /* Alias port. */ - uint16_t *sport, *dport; /* Source & destination port */ - uint16_t maxpktsize; /* Max packet size. */ -}; - -/* +/* * This structure contains all the information necessary to make * a protocol handler correctly work. */ - struct proto_handler { - u_int pri; /* Handler priority. */ - int16_t dir; /* Flow direction. */ - uint8_t proto; /* Working protocol. */ - int (*fingerprint)(struct libalias *, /* Fingerprint * function. */ - struct alias_data *); - int (*protohandler)(struct libalias *, /* Aliasing * function. */ - struct ip *, struct alias_data *); - LIST_ENTRY(proto_handler) entries; + u_int pri; /* Handler priority. */ + int16_t dir; /* Flow direction. */ + uint8_t proto; /* Working protocol. */ + /* Fingerprint * function. */ + int (*fingerprint)(struct libalias *, struct alias_data *); + /* Aliasing * function. */ + int (*protohandler)(struct libalias *, struct ip *, + struct alias_data *); + TAILQ_ENTRY(proto_handler) link; }; +/* End of handlers. */ +#define EOH .dir = NODIR -/* +/* Functions used with protocol handlers. */ +int LibAliasAttachHandlers(struct proto_handler *); +int LibAliasDetachHandlers(struct proto_handler *); +int find_handler(int8_t, int8_t, struct libalias *, struct ip *, + struct alias_data *); +struct proto_handler *first_handler(void); + +#ifndef _KERNEL +/* * Used only in userland when libalias needs to keep track of all * module loaded. In kernel land (kld mode) we don't need to care * care about libalias modules cause it's kld to do it for us. */ - -#define DLL_LEN 32 -struct dll { - char name[DLL_LEN]; /* Name of module. */ - void *handle; /* - * Ptr to shared obj obtained through - * dlopen() - use this ptr to get access - * to any symbols from a loaded module - * via dlsym(). - */ - SLIST_ENTRY(dll) next; +#define DLL_LEN 32 +struct dll { + char name[DLL_LEN]; /* Name of module. */ + void *handle; /* + * Ptr to shared obj obtained through + * dlopen() - use this ptr to get access + * to any symbols from a loaded module + * via dlsym(). + */ + SLIST_ENTRY(dll) next; }; -/* Functions used with protocol handlers. */ - -void handler_chain_init(void); -void handler_chain_destroy(void); -int LibAliasAttachHandlers(struct proto_handler *); -int LibAliasDetachHandlers(struct proto_handler *); -int detach_handler(struct proto_handler *); -int find_handler(int8_t, int8_t, struct libalias *, - struct ip *, struct alias_data *); -struct proto_handler *first_handler(void); - /* Functions used with dll module. */ +void dll_chain_init(void); +void dll_chain_destroy(void); +int attach_dll(struct dll *); +void *detach_dll(char *); +struct dll *walk_dll_chain(void); -void dll_chain_init(void); -void dll_chain_destroy(void); -int attach_dll(struct dll *); -void *detach_dll(char *); -struct dll *walk_dll_chain(void); - -/* End of handlers. */ -#define EOH -1 - -/* +/* * Some defines borrowed from sys/module.h used to compile a kld * in userland as a shared lib. 
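
In userland the fixed symbol name is the whole loading protocol: a module is built as a shared object and the host library pulls its handler table out by name with dlsym(3). A minimal sketch of that side of the contract (the module file name is made up; any libalias module built as a .so would do):

#include <dlfcn.h>
#include <stdio.h>

int
main(void)
{
	void *h, *handlers;

	h = dlopen("./alias_dummy.so", RTLD_NOW);
	if (h == NULL) {
		fprintf(stderr, "%s\n", dlerror());
		return (1);
	}
	/* "handlers" is the symbol every module must export verbatim. */
	handlers = dlsym(h, "handlers");
	printf("handlers @ %p\n", handlers);
	dlclose(h);
	return (0);
}
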
*/ - -#ifndef _KERNEL typedef enum modeventtype { - MOD_LOAD, - MOD_UNLOAD, - MOD_SHUTDOWN, - MOD_QUIESCE + MOD_LOAD, + MOD_UNLOAD, + MOD_SHUTDOWN, + MOD_QUIESCE } modeventtype_t; - + typedef struct module *module_t; typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); @@ -157,10 +147,10 @@ typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); * Struct for registering modules statically via SYSINIT. */ typedef struct moduledata { - const char *name; /* module name */ - modeventhand_t evhand; /* event handler */ - void *priv; /* extra data */ + const char *name; /* module name */ + modeventhand_t evhand; /* event handler */ + void *priv; /* extra data */ } moduledata_t; -#endif +#endif /* !_KERNEL */ -#endif /* !_ALIAS_MOD_H_ */ +#endif /* !_ALIAS_MOD_H_ */ diff --git a/freebsd/sys/netinet/libalias/alias_nbt.c b/freebsd/sys/netinet/libalias/alias_nbt.c index 5a917872..c10f9b48 100644 --- a/freebsd/sys/netinet/libalias/alias_nbt.c +++ b/freebsd/sys/netinet/libalias/alias_nbt.c @@ -72,17 +72,17 @@ __FBSDID("$FreeBSD$"); #define NETBIOS_DGM_PORT_NUMBER 138 static int -AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *, +AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *, struct in_addr *, u_short); static int AliasHandleUdpNbtNS(struct libalias *, struct ip *, struct alias_link *, struct in_addr *, u_short *, struct in_addr *, u_short *); -static int +static int fingerprint1(struct libalias *la, struct alias_data *ah) { - if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->aaddr == NULL || ah->aport == NULL) return (-1); if (ntohs(*ah->dport) == NETBIOS_DGM_PORT_NUMBER @@ -91,18 +91,18 @@ fingerprint1(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandler1(struct libalias *la, struct ip *pip, struct alias_data *ah) { return (AliasHandleUdpNbt(la, pip, ah->lnk, ah->aaddr, *ah->aport)); } -static int +static int fingerprint2(struct libalias *la, struct alias_data *ah) { - if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->aaddr == NULL || ah->aport == NULL) return (-1); if (ntohs(*ah->dport) == NETBIOS_NS_PORT_NUMBER @@ -111,7 +111,7 @@ fingerprint2(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -120,7 +120,7 @@ protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah) return (0); } -static int +static int protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -130,27 +130,27 @@ protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah) /* Kernel module definition. 
*/ struct proto_handler handlers[] = { - { - .pri = 130, - .dir = IN|OUT, - .proto = UDP, - .fingerprint = &fingerprint1, + { + .pri = 130, + .dir = IN|OUT, + .proto = UDP, + .fingerprint = &fingerprint1, .protohandler = &protohandler1 - }, - { - .pri = 140, - .dir = IN, - .proto = UDP, - .fingerprint = &fingerprint2, + }, + { + .pri = 140, + .dir = IN, + .proto = UDP, + .fingerprint = &fingerprint2, .protohandler = &protohandler2in - }, - { - .pri = 140, - .dir = OUT, - .proto = UDP, - .fingerprint = &fingerprint2, + }, + { + .pri = 140, + .dir = OUT, + .proto = UDP, + .fingerprint = &fingerprint2, .protohandler = &protohandler2out - }, + }, { EOH } }; @@ -175,7 +175,7 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif moduledata_t alias_mod = { "alias_nbt", mod_handler, NULL diff --git a/freebsd/sys/netinet/libalias/alias_pptp.c b/freebsd/sys/netinet/libalias/alias_pptp.c index e8205db0..39861c5c 100644 --- a/freebsd/sys/netinet/libalias/alias_pptp.c +++ b/freebsd/sys/netinet/libalias/alias_pptp.c @@ -80,7 +80,7 @@ AliasHandlePptpGreOut(struct libalias *, struct ip *); static int AliasHandlePptpGreIn(struct libalias *, struct ip *); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { @@ -92,14 +92,14 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int fingerprintgre(struct libalias *la, struct alias_data *ah) { return (0); } -static int +static int protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -107,7 +107,7 @@ protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) return (0); } -static int +static int protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -115,7 +115,7 @@ protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) return (0); } -static int +static int protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -125,7 +125,7 @@ protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah) return (-1); } -static int +static int protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -136,39 +136,39 @@ protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah) /* Kernel module definition. */ struct proto_handler handlers[] = { - { - .pri = 200, - .dir = IN, - .proto = TCP, - .fingerprint = &fingerprint, + { + .pri = 200, + .dir = IN, + .proto = TCP, + .fingerprint = &fingerprint, .protohandler = &protohandlerin }, - { - .pri = 210, - .dir = OUT, - .proto = TCP, - .fingerprint = &fingerprint, + { + .pri = 210, + .dir = OUT, + .proto = TCP, + .fingerprint = &fingerprint, .protohandler = &protohandlerout }, -/* - * WATCH OUT!!! these 2 handlers NEED a priority of INT_MAX (highest possible) +/* + * WATCH OUT!!! these 2 handlers NEED a priority of INT_MAX (highest possible) * cause they will ALWAYS process packets, so they must be the last one * in chain: look fingerprintgre() above. 
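
The reason is the sorted chain: fingerprintgre() claims every packet, so the GRE entries are usable only as a fallback, and INT_MAX is the one priority that guarantees they sort behind every protocol-specific fingerprint. A toy first-match dispatch showing the effect (port numbers and names are illustrative only):

#include <limits.h>
#include <stdio.h>

struct h {
	int pri;
	const char *name;
	int (*fingerprint)(int port);
};

static int match_pptp(int port) { return (port == 1723 ? 0 : -1); }
static int match_any(int port)  { return (0); }	/* like fingerprintgre() */

/* Kept sorted by pri; the catch-all must carry INT_MAX or it would
 * shadow every handler that sorts after it. */
static struct h chain[] = {
	{ 200,     "pptp", match_pptp },
	{ INT_MAX, "gre",  match_any },
};

int
main(void)
{
	int port = 80;

	for (size_t i = 0; i < sizeof(chain) / sizeof(chain[0]); i++)
		if (chain[i].fingerprint(port) == 0) {
			printf("handled by %s\n", chain[i].name);
			break;
		}
	return (0);
}
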
*/ - { - .pri = INT_MAX, - .dir = IN, - .proto = IP, - .fingerprint = &fingerprintgre, + { + .pri = INT_MAX, + .dir = IN, + .proto = IP, + .fingerprint = &fingerprintgre, .protohandler = &protohandlergrein }, - { - .pri = INT_MAX, - .dir = OUT, - .proto = IP, - .fingerprint = &fingerprintgre, + { + .pri = INT_MAX, + .dir = OUT, + .proto = IP, + .fingerprint = &fingerprintgre, .protohandler = &protohandlergreout - }, + }, { EOH } }; static int @@ -192,7 +192,7 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif moduledata_t alias_mod = { "alias_pptp", mod_handler, NULL diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h index 840917ad..99cceee4 100644 --- a/freebsd/sys/netinet/libalias/alias_sctp.h +++ b/freebsd/sys/netinet/libalias/alias_sctp.h @@ -92,7 +92,6 @@ #ifndef _KERNEL #include #include -#include #endif //#ifdef _KERNEL diff --git a/freebsd/sys/netinet/libalias/alias_skinny.c b/freebsd/sys/netinet/libalias/alias_skinny.c index 9f292916..b1f8f8c7 100644 --- a/freebsd/sys/netinet/libalias/alias_skinny.c +++ b/freebsd/sys/netinet/libalias/alias_skinny.c @@ -58,7 +58,7 @@ static void AliasHandleSkinny(struct libalias *, struct ip *, struct alias_link *); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { @@ -70,7 +70,7 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -79,13 +79,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) } struct proto_handler handlers[] = { - { - .pri = 110, - .dir = IN|OUT, - .proto = TCP, - .fingerprint = &fingerprint, + { + .pri = 110, + .dir = IN|OUT, + .proto = TCP, + .fingerprint = &fingerprint, .protohandler = &protohandler - }, + }, { EOH } }; @@ -110,7 +110,7 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif moduledata_t alias_mod = { "alias_skinny", mod_handler, NULL @@ -342,7 +342,7 @@ AliasHandleSkinny(struct libalias *la, struct ip *pip, struct alias_link *lnk) * through the packet using len to determine message boundaries. * This comes into play big time with port messages being in the * same packet as register messages. Also, open receive channel - * acks are usually buried in a pakcet some 400 bytes long. + * acks are usually buried in a packet some 400 bytes long. 
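
The walk the loop below performs can be modeled in isolation: consume back-to-back length-prefixed messages from one buffer, using each length field to find the next boundary. The header layout here is illustrative, not the Skinny wire format:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg_hdr {
	uint32_t len;	/* payload bytes following this header */
	uint32_t id;
};

int
main(void)
{
	uint8_t buf[64] = { 0 };
	struct msg_hdr a = { 4, 1 }, b = { 8, 2 };
	size_t total, off = 0;

	/* Two messages packed back to back; payloads stay zeroed since
	 * only the headers are examined. */
	memcpy(buf, &a, sizeof(a));
	memcpy(buf + sizeof(a) + a.len, &b, sizeof(b));
	total = sizeof(a) + a.len + sizeof(b) + b.len;

	while (total - off >= sizeof(struct msg_hdr)) {
		struct msg_hdr h;

		memcpy(&h, buf + off, sizeof(h));
		printf("msg id %" PRIu32 ", %" PRIu32 " payload bytes\n",
		    h.id, h.len);
		off += sizeof(h) + h.len;
	}
	return (0);
}
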
*/ while (dlen >= skinny_hdr_len) { len = (sd->len); diff --git a/freebsd/sys/netinet/libalias/alias_smedia.c b/freebsd/sys/netinet/libalias/alias_smedia.c index 47ae2748..9578a4af 100644 --- a/freebsd/sys/netinet/libalias/alias_smedia.c +++ b/freebsd/sys/netinet/libalias/alias_smedia.c @@ -133,14 +133,14 @@ __FBSDID("$FreeBSD$"); static void AliasHandleRtspOut(struct libalias *, struct ip *, struct alias_link *, int maxpacketsize); -static int +static int fingerprint(struct libalias *la, struct alias_data *ah) { if (ah->dport != NULL && ah->aport != NULL && ah->sport != NULL && ntohs(*ah->dport) == TFTP_PORT_NUMBER) return (0); - if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || ah->maxpktsize == 0) return (-1); if (ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_1 @@ -151,7 +151,7 @@ fingerprint(struct libalias *la, struct alias_data *ah) return (-1); } -static int +static int protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) { @@ -163,13 +163,13 @@ protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) } struct proto_handler handlers[] = { - { - .pri = 100, - .dir = OUT, + { + .pri = 100, + .dir = OUT, .proto = TCP|UDP, - .fingerprint = &fingerprint, + .fingerprint = &fingerprint, .protohandler = &protohandler - }, + }, { EOH } }; @@ -194,7 +194,7 @@ mod_handler(module_t mod, int type, void *data) } #ifdef _KERNEL -static +static #endif moduledata_t alias_mod = { "alias_smedia", mod_handler, NULL @@ -408,7 +408,7 @@ alias_rtsp_out(struct libalias *la, struct ip *pip, SetAckModified(lnk); tc = (struct tcphdr *)ip_next(pip); delta = GetDeltaSeqOut(tc->th_seq, lnk); - AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len, + AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len, tc->th_seq, tc->th_off); new_len = htons(hlen + new_dlen); @@ -520,7 +520,7 @@ AliasHandleRtspOut(struct libalias *la, struct ip *pip, struct alias_link *lnk, /* * When aliasing a server, check for the 200 reply - * Accomodate varying number of blanks between 200 & OK + * Accommodate varying number of blanks between 200 & OK */ if (dlen >= (int)strlen(str200)) { diff --git a/freebsd/sys/netinet/pim_var.h b/freebsd/sys/netinet/pim_var.h index 41657b61..ae876c94 100644 --- a/freebsd/sys/netinet/pim_var.h +++ b/freebsd/sys/netinet/pim_var.h @@ -46,38 +46,33 @@ * PIM statistics kept in the kernel */ struct pimstat { - u_quad_t pims_rcv_total_msgs; /* total PIM messages received */ - u_quad_t pims_rcv_total_bytes; /* total PIM bytes received */ - u_quad_t pims_rcv_tooshort; /* rcvd with too few bytes */ - u_quad_t pims_rcv_badsum; /* rcvd with bad checksum */ - u_quad_t pims_rcv_badversion; /* rcvd bad PIM version */ - u_quad_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */ - u_quad_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ - u_quad_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ - u_quad_t pims_rcv_badregisters; /* rcvd invalid registers */ - u_quad_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ - u_quad_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */ + uint64_t pims_rcv_total_msgs; /* total PIM messages received */ + uint64_t pims_rcv_total_bytes; /* total PIM bytes received */ + uint64_t pims_rcv_tooshort; /* rcvd with too few bytes */ + uint64_t pims_rcv_badsum; /* rcvd with bad checksum */ + uint64_t pims_rcv_badversion; /* rcvd bad PIM version */ + uint64_t pims_rcv_registers_msgs; /* rcvd regs. 
msgs (data only) */ + uint64_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ + uint64_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ + uint64_t pims_rcv_badregisters; /* rcvd invalid registers */ + uint64_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ + uint64_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */ }; #ifdef _KERNEL -#define PIMSTAT_ADD(name, val) V_pimstat.name += (val) +#define PIMSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct pimstat, pimstat, name, (val)) #define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1) #endif /* - * Names for PIM sysctl objects + * Identifiers for PIM sysctl nodes */ #define PIMCTL_STATS 1 /* statistics (read-only) */ -#define PIMCTL_MAXID 2 - -#define PIMCTL_NAMES { \ - { 0, 0 }, \ - { "stats", CTLTYPE_STRUCT }, \ -} #ifdef _KERNEL -void pim_input(struct mbuf *, int); +int pim_input(struct mbuf **, int *, int); SYSCTL_DECL(_net_inet_pim); #endif diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c index 827eca6e..a4679586 100644 --- a/freebsd/sys/netinet/raw_ip.c +++ b/freebsd/sys/netinet/raw_ip.c @@ -42,12 +42,14 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -59,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -70,15 +73,17 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #ifdef IPSEC #include #endif /*IPSEC*/ +#include #include VNET_DEFINE(int, ip_defttl) = IPDEFTTL; -SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); @@ -102,9 +107,6 @@ void (*ip_divert_ptr)(struct mbuf *, int); int (*ng_ipfw_input_p)(struct mbuf **, int, struct ip_fw_args *, int); -/* Hook for telling pf that the destination address changed */ -void (*m_addr_chg_pf_p)(struct mbuf *m); - #ifdef INET /* * Hooks for multicast routing. They all default to NULL, so leave them not @@ -128,11 +130,13 @@ int (*mrt_ioctl)(u_long, caddr_t, int); int (*legal_vif_num)(int); u_long (*ip_mcast_src)(int); -void (*rsvp_input_p)(struct mbuf *m, int off); +int (*rsvp_input_p)(struct mbuf **, int *, int); int (*ip_rsvp_vif)(struct socket *, struct sockopt *); void (*ip_rsvp_force_done)(struct socket *); #endif /* INET */ +extern struct protosw inetsw[]; + u_long rip_sendspace = 9216; SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); @@ -210,19 +214,19 @@ rip_init(void) { in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, NULL, UMA_ZONE_NOFREE, - IPI_HASHFIELDS_NONE); + 1, "ripcb", rip_inpcb_init, NULL, 0, IPI_HASHFIELDS_NONE); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } #ifdef VIMAGE -void -rip_destroy(void) +static void +rip_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_ripcbinfo); } +VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL); #endif #ifdef INET @@ -274,16 +278,18 @@ rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
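
pim_input() above and rip_input() below now follow the shared pr_input calling convention: the routine receives the mbuf chain by reference plus the payload offset, clears *mp when it takes ownership of the chain, and returns IPPROTO_DONE when dispatch should stop. A userland model of that contract (struct pkt and foo_input are stand-ins, not kernel code; 257 is the value netinet/in.h gives IPPROTO_DONE):

#include <stdio.h>
#include <stdlib.h>

#define IPPROTO_DONE	257

struct pkt { int len; };

static int
foo_input(struct pkt **mp, int *offp, int proto)
{
	struct pkt *m = *mp;

	*mp = NULL;	/* consumed: the caller must not touch it again */
	printf("proto %d, payload at offset %d, %d bytes total\n",
	    proto, *offp, m->len);
	free(m);
	return (IPPROTO_DONE);
}

int
main(void)
{
	struct pkt *m = malloc(sizeof(*m));
	int off = 20;

	m->len = 60;
	if (foo_input(&m, &off, 103) == IPPROTO_DONE && m == NULL)
		printf("chain consumed, dispatch finished\n");
	return (0);
}
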
*/ -void -rip_input(struct mbuf *m, int off) +int +rip_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; + struct mbuf *m = *mp; struct ip *ip = mtod(m, struct ip *); - int proto = ip->ip_p; struct inpcb *inp, *last; struct sockaddr_in ripsrc; int hash; + *mp = NULL; + bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; @@ -411,10 +417,15 @@ rip_input(struct mbuf *m, int off) IPSTAT_INC(ips_delivered); INP_RUNLOCK(last); } else { - m_freem(m); - IPSTAT_INC(ips_noproto); - IPSTAT_DEC(ips_delivered); + if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { + IPSTAT_INC(ips_noproto); + IPSTAT_DEC(ips_delivered); + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); + } else { + m_freem(m); + } } + return (IPPROTO_DONE); } /* @@ -422,14 +433,20 @@ rip_input(struct mbuf *m, int off) * have setup with control call. */ int -rip_output(struct mbuf *m, struct socket *so, u_long dst) +rip_output(struct mbuf *m, struct socket *so, ...) { struct ip *ip; int error; struct inpcb *inp = sotoinpcb(so); + va_list ap; + u_long dst; int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST; + va_start(ap, so); + dst = va_arg(ap, u_long); + va_end(ap); + /* * If the user handed us a complete IP packet, use it. Otherwise, * allocate an mbuf for a header and fill it in. @@ -439,7 +456,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst) m_freem(m); return(EMSGSIZE); } - M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + M_PREPEND(m, sizeof(struct ip), M_NOWAIT); if (m == NULL) return(ENOBUFS); @@ -447,32 +464,32 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst) ip = mtod(m, struct ip *); ip->ip_tos = inp->inp_ip_tos; if (inp->inp_flags & INP_DONTFRAG) - ip->ip_off = IP_DF; + ip->ip_off = htons(IP_DF); else - ip->ip_off = 0; + ip->ip_off = htons(0); ip->ip_p = inp->inp_ip_p; - ip->ip_len = m->m_pkthdr.len; + ip->ip_len = htons(m->m_pkthdr.len); ip->ip_src = inp->inp_laddr; + ip->ip_dst.s_addr = dst; if (jailed(inp->inp_cred)) { /* * prison_local_ip4() would be good enough but would * let a source of INADDR_ANY pass, which we do not - * want to see from jails. We do not go through the - * pain of in_pcbladdr() for raw sockets. + * want to see from jails. */ - if (ip->ip_src.s_addr == INADDR_ANY) - error = prison_get_ip4(inp->inp_cred, - &ip->ip_src); - else + if (ip->ip_src.s_addr == INADDR_ANY) { + error = in_pcbladdr(inp, &ip->ip_dst, &ip->ip_src, + inp->inp_cred); + } else { error = prison_local_ip4(inp->inp_cred, &ip->ip_src); + } if (error != 0) { INP_RUNLOCK(inp); m_freem(m); return (error); } } - ip->ip_dst.s_addr = dst; ip->ip_ttl = inp->inp_ip_ttl; } else { if (m->m_pkthdr.len > IP_MAXPACKET) { @@ -493,14 +510,18 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst) * and don't allow packet length sizes that will crash. */ if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) - || (ip->ip_len > m->m_pkthdr.len) - || (ip->ip_len < (ip->ip_hl << 2))) { + || (ntohs(ip->ip_len) > m->m_pkthdr.len) + || (ntohs(ip->ip_len) < (ip->ip_hl << 2))) { INP_RUNLOCK(inp); m_freem(m); return (EINVAL); } + /* + * This doesn't allow application to specify ID of zero, + * but we got this limitation from the beginning of history. + */ if (ip->ip_id == 0) - ip->ip_id = ip_newid(); + ip_fillid(ip); /* * XXX prevent ip_output from overwriting header fields. 
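
Worth noting in the hunk above: ip_len and ip_off are now carried in network byte order inside the stack, so rip_output() applies htons() when it assembles the header and ntohs() when it validates one supplied via IP_HDRINCL. Raw-socket code is expected to store both fields converted as well; a header built under that assumption might look like this (loopback addresses as placeholders, nothing is sent):

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct ip ip;

	memset(&ip, 0, sizeof(ip));
	ip.ip_v = IPVERSION;
	ip.ip_hl = sizeof(ip) >> 2;
	ip.ip_len = htons((uint16_t)sizeof(ip));	/* network order */
	ip.ip_off = htons(IP_DF);	/* same for the offset/flags word */
	ip.ip_ttl = 64;
	ip.ip_p = IPPROTO_UDP;
	ip.ip_src.s_addr = htonl(INADDR_LOOPBACK);
	ip.ip_dst.s_addr = htonl(INADDR_LOOPBACK);
	printf("ip_len as stored: 0x%04x\n", ip.ip_len);
	return (0);
}
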
@@ -539,6 +560,8 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst) * * When adding new socket options here, make sure to add access control * checks here as necessary. + * + * XXX-BZ inp locking? */ int rip_ctloutput(struct socket *so, struct sockopt *sopt) @@ -712,6 +735,7 @@ rip_ctloutput(struct socket *so, struct sockopt *sopt) void rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) { + struct rm_priotracker in_ifa_tracker; struct in_ifaddr *ia; struct ifnet *ifp; int err; @@ -719,16 +743,16 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) switch (cmd) { case PRC_IFDOWN: - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa && (ia->ia_flags & IFA_ROUTE)) { ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); /* - * in_ifscrub kills the interface route. + * in_scrubprefix() kills the interface route. */ - in_ifscrub(ia->ia_ifp, ia, 0); + in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing @@ -741,21 +765,21 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) } } if (ia == NULL) /* If ia matched, already unlocked. */ - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); break; case PRC_IFUP: - IN_IFADDR_RLOCK(); + IN_IFADDR_RLOCK(&in_ifa_tracker); TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifa.ifa_addr == sa) break; } if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); return; } ifa_ref(&ia->ia_ifa); - IN_IFADDR_RUNLOCK(); + IN_IFADDR_RUNLOCK(&in_ifa_tracker); flags = RTF_UP; ifp = ia->ia_ifa.ifa_ifp; @@ -764,16 +788,12 @@ rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) flags |= RTF_HOST; err = ifa_del_loopback_route((struct ifaddr *)ia, sa); - if (err == 0) - ia->ia_flags &= ~IFA_RTSELF; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); if (err == 0) ia->ia_flags |= IFA_ROUTE; err = ifa_add_loopback_route((struct ifaddr *)ia, sa); - if (err == 0) - ia->ia_flags |= IFA_RTSELF; ifa_free(&ia->ia_ifa); break; @@ -1036,7 +1056,7 @@ rip_pcblist(SYSCTL_HANDLER_ARGS) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_ripcbinfo); diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h index 4c5c03dc..ec42cffa 100644 --- a/freebsd/sys/netinet/sctp.h +++ b/freebsd/sys/netinet/sctp.h @@ -121,6 +121,14 @@ struct sctp_paramhdr { #define SCTP_DEFAULT_PRINFO 0x00000022 #define SCTP_PEER_ADDR_THLDS 0x00000023 #define SCTP_REMOTE_UDP_ENCAPS_PORT 0x00000024 +#define SCTP_ECN_SUPPORTED 0x00000025 +#define SCTP_PR_SUPPORTED 0x00000026 +#define SCTP_AUTH_SUPPORTED 0x00000027 +#define SCTP_ASCONF_SUPPORTED 0x00000028 +#define SCTP_RECONFIG_SUPPORTED 0x00000029 +#define SCTP_NRSACK_SUPPORTED 0x00000030 +#define SCTP_PKTDROP_SUPPORTED 0x00000031 +#define SCTP_MAX_CWND 0x00000032 /* * read-only options @@ -133,6 +141,8 @@ struct sctp_paramhdr { #define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */ #define SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */ #define SCTP_TIMEOUTS 0x00000106 +#define SCTP_PR_STREAM_STATUS 0x00000107 +#define SCTP_PR_ASSOC_STATUS 0x00000108 /* * user socket options: BSD implementation specific @@ -186,6 +196,9 @@ struct sctp_paramhdr { #define SCTP_SS_VALUE 0x00001204 #define SCTP_CC_OPTION 0x00001205 /* Options for CC * modules */ +/* For I-DATA */ +#define SCTP_INTERLEAVING_SUPPORTED 0x00001206 + /* read 
only */ #define SCTP_GET_SNDBUF_USE 0x00001101 #define SCTP_GET_STAT_LOG 0x00001103 @@ -378,33 +391,32 @@ struct sctp_error_cause { } SCTP_PACKED; struct sctp_error_invalid_stream { - struct sctp_error_cause cause; /* code=SCTP_ERROR_INVALID_STREAM */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_INVALID_STREAM */ uint16_t stream_id; /* stream id of the DATA in error */ uint16_t reserved; } SCTP_PACKED; struct sctp_error_missing_param { - struct sctp_error_cause cause; /* code=SCTP_ERROR_MISSING_PARAM */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_MISSING_PARAM */ uint32_t num_missing_params; /* number of missing parameters */ - /* uint16_t param_type's follow */ + uint16_t type[]; } SCTP_PACKED; struct sctp_error_stale_cookie { - struct sctp_error_cause cause; /* code=SCTP_ERROR_STALE_COOKIE */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_STALE_COOKIE */ uint32_t stale_time; /* time in usec of staleness */ } SCTP_PACKED; struct sctp_error_out_of_resource { - struct sctp_error_cause cause; /* code=SCTP_ERROR_OUT_OF_RESOURCES */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_OUT_OF_RESOURCES */ } SCTP_PACKED; struct sctp_error_unresolv_addr { - struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRESOLVABLE_ADDR */ - + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRESOLVABLE_ADDR */ } SCTP_PACKED; struct sctp_error_unrecognized_chunk { - struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRECOG_CHUNK */ + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNRECOG_CHUNK */ struct sctp_chunkhdr ch;/* header from chunk in error */ } SCTP_PACKED; @@ -413,6 +425,11 @@ struct sctp_error_no_user_data { uint32_t tsn; /* TSN of the empty data chunk */ } SCTP_PACKED; +struct sctp_error_auth_invalid_hmac { + struct sctp_error_cause cause; /* code=SCTP_CAUSE_UNSUPPORTED_HMACID */ + uint16_t hmac_id; +} SCTP_PACKED; + /* * Main SCTP chunk types we place these here so natd and f/w's in user land * can find them. 
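
Among the changes above, sctp_error_missing_param now spells out its trailing list of parameter types as a C99 flexible array member instead of a comment. That makes the variable part addressable and the allocation size explicit; a trimmed-down illustration with struct names local to the sketch (SCTP_PACKED and byte-order conversion omitted):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct error_cause {
	uint16_t code;
	uint16_t length;
};

struct error_missing_param {
	struct error_cause cause;
	uint32_t num_missing_params;
	uint16_t type[];	/* was only a comment before */
};

int
main(void)
{
	uint32_t n = 2;
	struct error_missing_param *ep;

	/* Size the cause as header plus n trailing parameter types. */
	ep = malloc(sizeof(*ep) + n * sizeof(ep->type[0]));
	ep->num_missing_params = n;
	ep->type[0] = 7;	/* stand-in parameter types */
	ep->type[1] = 9;
	printf("cause length: %zu bytes\n",
	    sizeof(*ep) + n * sizeof(ep->type[0]));
	free(ep);
	return (0);
}
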
@@ -438,6 +455,7 @@ struct sctp_error_no_user_data { /* EY nr_sack chunk id*/ #define SCTP_NR_SELECTIVE_ACK 0x10 /************0x40 series ***********/ +#define SCTP_IDATA 0x40 /************0x80 series ***********/ /* RFC5061 */ #define SCTP_ASCONF_ACK 0x80 @@ -453,7 +471,7 @@ struct sctp_error_no_user_data { #define SCTP_FORWARD_CUM_TSN 0xc0 /* RFC5061 */ #define SCTP_ASCONF 0xc1 - +#define SCTP_IFORWARD_CUM_TSN 0xc2 /* ABORT and SHUTDOWN COMPLETE FLAG */ #define SCTP_HAD_NO_TCB 0x01 diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c index 551f0690..4256ab51 100644 --- a/freebsd/sys/netinet/sctp_asconf.c +++ b/freebsd/sys/netinet/sctp_asconf.c @@ -82,7 +82,7 @@ sctp_asconf_success_response(uint32_t id) struct sctp_asconf_paramhdr *aph; m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr), - 0, M_DONTWAIT, 1, MT_DATA); + 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_success_response: couldn't get mbuf!\n"); @@ -110,7 +110,7 @@ sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t * error_tlv, m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) + tlv_length + sizeof(struct sctp_error_cause)), - 0, M_DONTWAIT, 1, MT_DATA); + 0, M_NOWAIT, 1, MT_DATA); if (m_reply == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_error_response: couldn't get mbuf!\n"); @@ -150,7 +150,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap { struct sctp_nets *net; struct mbuf *m_reply = NULL; - struct sockaddr_storage sa_store; + union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; @@ -179,7 +179,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif - sa = (struct sockaddr *)&sa_store; + sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: @@ -188,7 +188,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; - sin = (struct sockaddr_in *)&sa_store; + sin = &store.sin; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); @@ -211,7 +211,7 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; - sin6 = (struct sockaddr_in6 *)&sa_store; + sin6 = &store.sin6; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); @@ -246,7 +246,8 @@ sctp_process_asconf_add_ip(struct sockaddr *src, struct sctp_asconf_paramhdr *ap m_reply = sctp_asconf_error_response(aph->correlation_id, SCTP_CAUSE_INVALID_PARAM, (uint8_t *) aph, aparam_length); - } else if (sctp_add_remote_addr(stcb, sa, &net, SCTP_DONOT_SETSCOPE, + } else if (sctp_add_remote_addr(stcb, sa, &net, stcb->asoc.port, + SCTP_DONOT_SETSCOPE, SCTP_ADDR_DYNAMIC_ADDED) != 0) { SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: error adding address\n"); @@ -304,7 +305,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; - struct sockaddr_storage sa_store; + union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; @@ -333,7 +334,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif - sa = (struct sockaddr 
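
The sa_store pattern replaced throughout this file gives way to union sctp_sockstore: one blob of storage with sa, sin and sin6 views selected by member access rather than by pointer casts, which reads better and is kinder to strict-aliasing rules. A cut-down analogue (FreeBSD-style sa_len/sin_len fields assumed):

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

union sockstore {
	struct sockaddr sa;
	struct sockaddr_in sin;
	struct sockaddr_in6 sin6;
};

int
main(void)
{
	union sockstore store;

	memset(&store, 0, sizeof(store));
	store.sin.sin_family = AF_INET;
	store.sin.sin_len = sizeof(struct sockaddr_in);
	store.sin.sin_port = htons(9);

	/* The generic view is just another member, not a cast. */
	printf("family %d, len %d\n", store.sa.sa_family, store.sa.sa_len);
	return (0);
}
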
*)&sa_store; + sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: @@ -342,7 +343,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; - sin = (struct sockaddr_in *)&sa_store; + sin = &store.sin; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); @@ -362,7 +363,7 @@ sctp_process_asconf_delete_ip(struct sockaddr *src, return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; - sin6 = (struct sockaddr_in6 *)&sa_store; + sin6 = &store.sin6; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); @@ -439,7 +440,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src, struct sctp_tcb *stcb, int response_required) { struct mbuf *m_reply = NULL; - struct sockaddr_storage sa_store; + union sctp_sockstore store; struct sctp_paramhdr *ph; uint16_t param_type, aparam_length; @@ -467,7 +468,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src, #if defined(INET) || defined(INET6) param_length = ntohs(ph->param_length); #endif - sa = (struct sockaddr *)&sa_store; + sa = &store.sa; switch (param_type) { #ifdef INET case SCTP_IPV4_ADDRESS: @@ -476,7 +477,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src, return (NULL); } v4addr = (struct sctp_ipv4addr_param *)ph; - sin = (struct sockaddr_in *)&sa_store; + sin = &store.sin; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); @@ -494,7 +495,7 @@ sctp_process_asconf_set_primary(struct sockaddr *src, return (NULL); } v6addr = (struct sctp_ipv6addr_param *)ph; - sin6 = (struct sockaddr_in6 *)&sa_store; + sin6 = &store.sin6; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(struct sockaddr_in6); @@ -557,7 +558,9 @@ sctp_process_asconf_set_primary(struct sockaddr *src, (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) { - sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7); + sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, + stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, @@ -598,7 +601,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, uint32_t serial_num; struct mbuf *n, *m_ack, *m_result, *m_tail; struct sctp_asconf_ack_chunk *ack_cp; - struct sctp_asconf_paramhdr *aph, *ack_aph; + struct sctp_asconf_paramhdr *aph; struct sctp_ipv6addr_param *p_addr; unsigned int asconf_limit, cnt; int error = 0; /* did an error occur? */ @@ -653,7 +656,7 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, } } m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0, - M_DONTWAIT, 1, MT_DATA); + M_NOWAIT, 1, MT_DATA); if (m_ack == NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get mbuf!\n"); @@ -681,13 +684,6 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, } /* param_length is already validated in process_control... 
*/ offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */ - - /* get pointer to first asconf param in ASCONF-ACK */ - ack_aph = (struct sctp_asconf_paramhdr *)(mtod(m_ack, caddr_t)+sizeof(struct sctp_asconf_ack_chunk)); - if (ack_aph == NULL) { - SCTPDBG(SCTP_DEBUG_ASCONF1, "Gak in asconf2\n"); - return; - } /* get pointer to first asconf param in ASCONF */ aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *) & aparam_buf); if (aph == NULL) { @@ -726,13 +722,11 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, } switch (param_type) { case SCTP_ADD_IP_ADDRESS: - asoc->peer_supports_asconf = 1; m_result = sctp_process_asconf_add_ip(src, aph, stcb, (cnt < SCTP_BASE_SYSCTL(sctp_hb_maxburst)), error); cnt++; break; case SCTP_DEL_IP_ADDRESS: - asoc->peer_supports_asconf = 1; m_result = sctp_process_asconf_delete_ip(src, aph, stcb, error); break; @@ -740,7 +734,6 @@ sctp_handle_asconf(struct mbuf *m, unsigned int offset, /* not valid in an ASCONF chunk */ break; case SCTP_SET_PRIM_ADDR: - asoc->peer_supports_asconf = 1; m_result = sctp_process_asconf_set_primary(src, aph, stcb, error); break; @@ -932,8 +925,6 @@ sctp_addr_match(struct sctp_paramhdr *ph, struct sockaddr *sa) void sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net) { - /* mark peer as ASCONF incapable */ - stcb->asoc.peer_supports_asconf = 0; /* * clear out any existing asconfs going out */ @@ -1005,7 +996,7 @@ sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet) SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary, - SCTP_FROM_SCTP_TIMER + SCTP_LOC_8); + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3); stcb->asoc.num_send_timers_up--; if (stcb->asoc.num_send_timers_up < 0) { stcb->asoc.num_send_timers_up = 0; @@ -1044,7 +1035,7 @@ sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net) SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO); sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, - SCTP_FROM_SCTP_TIMER + SCTP_LOC_5); + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_4); stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); net->error_count = 0; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { @@ -1121,7 +1112,8 @@ sctp_path_check_and_react(struct sctp_tcb *stcb, struct sctp_ifa *newifa) * not be changed. 
*/ SCTP_RTALLOC((sctp_route_t *) & net->ro, - stcb->sctp_ep->def_vrf_id); + stcb->sctp_ep->def_vrf_id, + stcb->sctp_ep->fibnum); if (net->ro.ro_rt == NULL) continue; @@ -1275,7 +1267,7 @@ sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&ifa->address.sa; + sin6 = &ifa->address.sin6; aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + @@ -1290,7 +1282,7 @@ sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&ifa->address.sa; + sin = &ifa->address.sin; aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + @@ -1340,24 +1332,31 @@ sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, { uint32_t status; int pending_delete_queued = 0; + int last; /* see if peer supports ASCONF */ - if (stcb->asoc.peer_supports_asconf == 0) { + if (stcb->asoc.asconf_supported == 0) { return (-1); } /* * if this is deleting the last address from the assoc, mark it as * pending. */ - if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending && - (sctp_local_addr_count(stcb) < 2)) { - /* set the pending delete info only */ - stcb->asoc.asconf_del_pending = 1; - stcb->asoc.asconf_addr_del_pending = ifa; - atomic_add_int(&ifa->refcount, 1); - SCTPDBG(SCTP_DEBUG_ASCONF2, - "asconf_queue_add: mark delete last address pending\n"); - return (-1); + if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending) { + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + last = (sctp_local_addr_count(stcb) == 0); + } else { + last = (sctp_local_addr_count(stcb) == 1); + } + if (last) { + /* set the pending delete info only */ + stcb->asoc.asconf_del_pending = 1; + stcb->asoc.asconf_addr_del_pending = ifa; + atomic_add_int(&ifa->refcount, 1); + SCTPDBG(SCTP_DEBUG_ASCONF2, + "asconf_queue_add: mark delete last address pending\n"); + return (-1); + } } /* queue an asconf parameter */ status = sctp_asconf_queue_mgmt(stcb, ifa, type); @@ -1426,13 +1425,12 @@ sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa) { struct sctp_ifa *ifa; struct sctp_asconf_addr *aa, *aa_next; - uint32_t vrf_id; if (stcb == NULL) { return (-1); } /* see if peer supports ASCONF */ - if (stcb->asoc.peer_supports_asconf == 0) { + if (stcb->asoc.asconf_supported == 0) { return (-1); } /* make sure the request isn't already in the queue */ @@ -1458,12 +1456,7 @@ sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa) } /* for each aa */ /* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */ - if (stcb) { - vrf_id = stcb->asoc.vrf_id; - } else { - vrf_id = SCTP_DEFAULT_VRFID; - } - ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); + ifa = sctp_find_ifa_by_addr(sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); /* adding new request to the queue */ SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), @@ -1552,7 +1545,7 @@ sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id) * notifications based on the error response */ static void -sctp_asconf_process_error(struct sctp_tcb *stcb, +sctp_asconf_process_error(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_asconf_paramhdr *aph) { struct sctp_error_cause *eh; @@ -1590,10 +1583,7 @@ 
sctp_asconf_process_error(struct sctp_tcb *stcb, switch (param_type) { case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: - stcb->asoc.peer_supports_asconf = 0; - break; case SCTP_SET_PRIM_ADDR: - stcb->asoc.peer_supports_asconf = 0; break; default: break; @@ -1629,8 +1619,6 @@ sctp_asconf_process_param_ack(struct sctp_tcb *stcb, SCTPDBG(SCTP_DEBUG_ASCONF1, "process_param_ack: set primary IP address\n"); /* nothing to do... peer may start using this addr */ - if (flag == 0) - stcb->asoc.peer_supports_asconf = 0; break; default: /* should NEVER happen */ @@ -1648,11 +1636,11 @@ sctp_asconf_process_param_ack(struct sctp_tcb *stcb, * cleanup from a bad asconf ack parameter */ static void -sctp_asconf_ack_clear(struct sctp_tcb *stcb) +sctp_asconf_ack_clear(struct sctp_tcb *stcb SCTP_UNUSED) { /* assume peer doesn't really know how to do asconfs */ - stcb->asoc.peer_supports_asconf = 0; /* XXX we could free the pending queue here */ + } void @@ -1695,8 +1683,14 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset, * abort the asoc, since someone probably just hijacked us... */ if (serial_num == (asoc->asconf_seq_out + 1)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n"); - sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, SCTP_SO_NOT_LOCKED); + snprintf(msg, sizeof(msg), "Never sent serial number %8.8x", + serial_num); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return; } @@ -1709,7 +1703,7 @@ sctp_handle_asconf_ack(struct mbuf *m, int offset, if (serial_num == asoc->asconf_seq_out - 1) { /* stop our timer */ sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, - SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3); + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_5); } /* process the ASCONF-ACK contents */ ack_length = ntohs(cp->ch.chunk_length) - @@ -1937,7 +1931,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ return; @@ -1970,7 +1964,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, SCTP_IPV6_V6ONLY(inp6)) return; - sin = (struct sockaddr_in *)&ifa->address.sa; + sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ return; @@ -1990,7 +1984,7 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { /* does the peer do asconf? */ - if (stcb->asoc.peer_supports_asconf) { + if (stcb->asoc.asconf_supported) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); @@ -2000,7 +1994,8 @@ sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * sent when the state goes open. 
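/*
 * Illustration: many hunks in this patch fix the "from" word handed to
 * sctp_timer_stop(). Per the defines updated later in sctp_constants.h,
 * the high bits identify the source file and the low bits the call
 * site, so stops issued from sctp_asconf.c now report
 * SCTP_FROM_SCTP_ASCONF rather than SCTP_FROM_SCTP_TIMER. Sketch of
 * the packing, using the values visible in this patch:
 */
#include <stdint.h>

#define EX_FROM_ASCONF	0x80000000	/* SCTP_FROM_SCTP_ASCONF */
#define EX_LOC_5	0x00000005	/* SCTP_LOC_5 */

static uint32_t
example_from_word(void)
{
	return (EX_FROM_ASCONF + EX_LOC_5);	/* yields 0x80000005 */
}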
*/ if (status == 0 && - SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, stcb->asoc.primary_destination); @@ -2127,7 +2122,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, else continue; } - sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* we skip unspecifed addresses */ continue; @@ -2161,7 +2156,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, SCTP_IPV6_V6ONLY(inp6)) continue; - sin = (struct sockaddr_in *)&ifa->address.sa; + sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* we skip unspecifed addresses */ continue; @@ -2240,7 +2235,7 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, } /* queue an asconf for this address add/delete */ if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) && - stcb->asoc.peer_supports_asconf) { + stcb->asoc.asconf_supported == 1) { /* queue an asconf for this addr */ status = sctp_asconf_queue_add(stcb, ifa, type); /* @@ -2248,7 +2243,8 @@ sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * count of queued params. If in the non-open * state, these get sent when the assoc goes open. */ - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { if (status >= 0) { num_queued++; } @@ -2308,7 +2304,8 @@ sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) "set_primary_ip_address_sa: queued on tcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, @@ -2344,7 +2341,8 @@ sctp_set_primary_ip_address(struct sctp_ifa *ifa) SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address: queued on stcb=%p, ", (void *)stcb); SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &ifa->address.sa); - if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, @@ -2478,7 +2476,7 @@ sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) if (stcb->asoc.scope.ipv4_addr_legal) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* skip unspecifed addresses */ continue; @@ -2512,7 +2510,7 @@ sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { continue; } - sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip unspecifed @@ -2606,14 +2604,14 @@ sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked) * it's simpler to fill in the asconf chunk header lookup address on * the fly */ - m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_DONTWAIT, 1, MT_DATA); + m_asconf_chk = 
sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_NOWAIT, 1, MT_DATA); if (m_asconf_chk == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, "compose_asconf: couldn't get chunk mbuf!\n"); return (NULL); } - m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_asconf == NULL) { /* no mbuf's */ SCTPDBG(SCTP_DEBUG_ASCONF1, @@ -2784,19 +2782,16 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, struct sctp_paramhdr tmp_param, *ph; uint16_t plen, ptype; struct sctp_ifa *sctp_ifa; + union sctp_sockstore store; #ifdef INET6 struct sctp_ipv6addr_param addr6_store; - struct sockaddr_in6 sin6; #endif #ifdef INET struct sctp_ipv4addr_param addr4_store; - struct sockaddr_in sin; #endif - struct sockaddr *sa; - uint32_t vrf_id; SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n"); if (stcb == NULL) /* Un-needed check for SA */ @@ -2808,21 +2803,6 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, if ((offset + sizeof(struct sctp_paramhdr)) > length) { return; } - /* init the addresses */ -#ifdef INET6 - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(sin6); - sin6.sin6_port = stcb->rport; -#endif - -#ifdef INET - bzero(&sin, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_len = sizeof(sin); - sin.sin_port = stcb->rport; -#endif - /* go through the addresses in the init-ack */ ph = (struct sctp_paramhdr *) sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), @@ -2845,9 +2825,11 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, a6p == NULL) { return; } - memcpy(&sin6.sin6_addr, a6p->addr, - sizeof(struct in6_addr)); - sa = (struct sockaddr *)&sin6; + memset(&store, 0, sizeof(union sctp_sockstore)); + store.sin6.sin6_family = AF_INET6; + store.sin6.sin6_len = sizeof(struct sockaddr_in6); + store.sin6.sin6_port = stcb->rport; + memcpy(&store.sin6.sin6_addr, a6p->addr, sizeof(struct in6_addr)); break; } #endif @@ -2864,8 +2846,11 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, a4p == NULL) { return; } - sin.sin_addr.s_addr = a4p->addr; - sa = (struct sockaddr *)&sin; + memset(&store, 0, sizeof(union sctp_sockstore)); + store.sin.sin_family = AF_INET; + store.sin.sin_len = sizeof(struct sockaddr_in); + store.sin.sin_port = stcb->rport; + store.sin.sin_addr.s_addr = a4p->addr; break; } #endif @@ -2874,12 +2859,7 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, } /* see if this address really (still) exists */ - if (stcb) { - vrf_id = stcb->asoc.vrf_id; - } else { - vrf_id = SCTP_DEFAULT_VRFID; - } - sctp_ifa = sctp_find_ifa_by_addr(sa, vrf_id, + sctp_ifa = sctp_find_ifa_by_addr(&store.sa, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa == NULL) { /* address doesn't exist anymore */ @@ -2888,9 +2868,9 @@ sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, /* are ASCONFs allowed ? */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && - stcb->asoc.peer_supports_asconf) { + stcb->asoc.asconf_supported) { /* queue an ASCONF DEL_IP_ADDRESS */ - status = sctp_asconf_queue_sa_delete(stcb, sa); + status = sctp_asconf_queue_sa_delete(stcb, &store.sa); /* * if queued ok, and in correct state, send * out the ASCONF. 
@@ -3137,7 +3117,7 @@ sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset, switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: - sin = (struct sockaddr_in *)&sctp_ifa->address.sin; + sin = &sctp_ifa->address.sin; if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; @@ -3151,7 +3131,7 @@ sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset, #endif #ifdef INET6 case AF_INET6: - sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sin6; + sin6 = &sctp_ifa->address.sin6; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; @@ -3271,6 +3251,7 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, } else { struct sctp_asconf_iterator *asc; struct sctp_laddr *wi; + int ret; SCTP_MALLOC(asc, struct sctp_asconf_iterator *, sizeof(struct sctp_asconf_iterator), @@ -3292,7 +3273,7 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, wi->action = type; atomic_add_int(&ifa->refcount, 1); LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); - (void)sctp_initiate_iterator(sctp_asconf_iterator_ep, + ret = sctp_initiate_iterator(sctp_asconf_iterator_ep, sctp_asconf_iterator_stcb, sctp_asconf_iterator_ep_end, SCTP_PCB_ANY_FLAGS, @@ -3300,6 +3281,12 @@ sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, SCTP_ASOC_ANY_STATE, (void *)asc, 0, sctp_asconf_iterator_end, inp, 0); + if (ret) { + SCTP_PRINTF("Failed to initiate iterator for addr_mgmt_ep_sa\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EFAULT); + sctp_asconf_iterator_end(asc, 0); + return (EFAULT); + } } return (0); } else { @@ -3389,6 +3376,11 @@ sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif + default: + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_send_nat_state_update: unknown address family\n"); + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + return; } SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), SCTP_M_ASC_ADDR); @@ -3422,6 +3414,11 @@ sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); break; #endif + default: + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_send_nat_state_update: unknown address family\n"); + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + return; } /* Now we must hunt the addresses and add all global addresses */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c index fc649032..19e30718 100644 --- a/freebsd/sys/netinet/sctp_auth.c +++ b/freebsd/sys/netinet/sctp_auth.c @@ -135,11 +135,6 @@ sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list) if (list == NULL) return (-1); - /* is chunk restricted? 
*/ - if ((chunk == SCTP_ASCONF) || - (chunk == SCTP_ASCONF_ACK)) { - return (-1); - } if (list->chunks[chunk] == 1) { list->chunks[chunk] = 0; list->num_chunks--; @@ -159,16 +154,6 @@ sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list) return (list->num_chunks); } -/* - * set the default list of chunks requiring AUTH - */ -void -sctp_auth_set_default_chunks(sctp_auth_chklist_t * list) -{ - (void)sctp_auth_add_chunk(SCTP_ASCONF, list); - (void)sctp_auth_add_chunk(SCTP_ASCONF_ACK, list); -} - /* * return the current number and list of required chunks caller must * guarantee ptr has space for up to 256 bytes @@ -559,7 +544,7 @@ sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, } } /* shouldn't reach here */ - return (0); + return (EINVAL); } void @@ -575,7 +560,7 @@ sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id) atomic_add_int(&skey->refcount, 1); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount acquire to %d\n", - __FUNCTION__, (void *)stcb, key_id, skey->refcount); + __func__, (void *)stcb, key_id, skey->refcount); } } @@ -593,20 +578,20 @@ sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id, int so_locked /* decrement the ref count */ if (skey) { - sctp_free_sharedkey(skey); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u refcount release to %d\n", - __FUNCTION__, (void *)stcb, key_id, skey->refcount); + __func__, (void *)stcb, key_id, skey->refcount); /* see if a notification should be generated */ - if ((skey->refcount <= 1) && (skey->deactivated)) { + if ((skey->refcount <= 2) && (skey->deactivated)) { /* notify ULP that key is no longer used */ sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, key_id, 0, so_locked); SCTPDBG(SCTP_DEBUG_AUTH2, "%s: stcb %p key %u no longer used, %d\n", - __FUNCTION__, (void *)stcb, key_id, skey->refcount); + __func__, (void *)stcb, key_id, skey->refcount); } + sctp_free_sharedkey(skey); } } @@ -639,8 +624,11 @@ sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest) LIST_FOREACH(skey, src, next) { new_skey = sctp_copy_sharedkey(skey); if (new_skey != NULL) { - (void)sctp_insert_sharedkey(dest, new_skey); - count++; + if (sctp_insert_sharedkey(dest, new_skey)) { + sctp_free_sharedkey(new_skey); + } else { + count++; + } } } return (count); @@ -648,7 +636,7 @@ sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest) sctp_hmaclist_t * -sctp_alloc_hmaclist(uint8_t num_hmacs) +sctp_alloc_hmaclist(uint16_t num_hmacs) { sctp_hmaclist_t *new_list; int alloc_size; @@ -1455,8 +1443,8 @@ sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); } else if (ptype == SCTP_HMAC_LIST) { - int num_hmacs; - int i; + uint16_t num_hmacs; + uint16_t i; if (plen > sizeof(hmacs_store)) break; @@ -1668,8 +1656,8 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, /* is the indicated HMAC supported? 
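/*
 * Illustration: the sctp_auth_key_release() hunk above moves
 * sctp_free_sharedkey() to the end of the function, since the key is
 * still dereferenced for the debug log and the "key no longer used"
 * notification; the reference that may free it has to be dropped last.
 * The threshold also moves from <= 1 to <= 2 because the check now
 * runs before, not after, that reference is released. Generic shape of
 * the fix, with hypothetical types and helpers:
 */
struct example_key {
	int refcount;
	int deactivated;
};

static void example_notify_unused(struct example_key *);	/* hypothetical */
static void example_drop_ref(struct example_key *);		/* hypothetical */

static void
example_key_release(struct example_key *key)
{
	if (key == NULL)
		return;
	/* use the object first: one extra reference is still held here */
	if ((key->refcount <= 2) && key->deactivated)
		example_notify_unused(key);
	/* drop the reference last; key may be freed by this call */
	example_drop_ref(key);
}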
*/ if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) { - struct mbuf *m_err; - struct sctp_auth_invalid_hmac *err; + struct mbuf *op_err; + struct sctp_error_auth_invalid_hmac *cause; SCTP_STAT_INCR(sctps_recvivalhmacid); SCTPDBG(SCTP_DEBUG_AUTH1, @@ -1679,20 +1667,19 @@ sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, * report this in an Error Chunk: Unsupported HMAC * Identifier */ - m_err = sctp_get_mbuf_for_msg(sizeof(*err), 0, M_DONTWAIT, - 1, MT_HEADER); - if (m_err != NULL) { + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_auth_invalid_hmac), + 0, M_NOWAIT, 1, MT_HEADER); + if (op_err != NULL) { /* pre-reserve some space */ - SCTP_BUF_RESV_UF(m_err, sizeof(struct sctp_chunkhdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); /* fill in the error */ - err = mtod(m_err, struct sctp_auth_invalid_hmac *); - bzero(err, sizeof(*err)); - err->ph.param_type = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); - err->ph.param_length = htons(sizeof(*err)); - err->hmac_id = ntohs(hmac_id); - SCTP_BUF_LEN(m_err) = sizeof(*err); + cause = mtod(op_err, struct sctp_error_auth_invalid_hmac *); + cause->cause.code = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); + cause->cause.length = htons(sizeof(struct sctp_error_auth_invalid_hmac)); + cause->hmac_id = ntohs(hmac_id); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_auth_invalid_hmac); /* queue it */ - sctp_queue_op_err(stcb, m_err); + sctp_queue_op_err(stcb, op_err); } return (-1); } @@ -1785,7 +1772,7 @@ sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, return; m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event), - 0, M_DONTWAIT, 1, MT_HEADER); + 0, M_NOWAIT, 1, MT_HEADER); if (m_notify == NULL) /* no space left */ return; @@ -1951,8 +1938,7 @@ sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit) "SCTP: peer sent chunk list w/o AUTH\n"); return (-1); } - if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && peer_supports_asconf && - !peer_supports_auth) { + if (peer_supports_asconf && !peer_supports_auth) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: peer supports ASCONF but not AUTH\n"); return (-1); diff --git a/freebsd/sys/netinet/sctp_auth.h b/freebsd/sys/netinet/sctp_auth.h index 535c0fc0..b98764e2 100644 --- a/freebsd/sys/netinet/sctp_auth.h +++ b/freebsd/sys/netinet/sctp_auth.h @@ -112,7 +112,6 @@ extern sctp_auth_chklist_t *sctp_copy_chunklist(sctp_auth_chklist_t * chklist); extern int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list); extern int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list); extern size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list); -extern void sctp_auth_set_default_chunks(sctp_auth_chklist_t * list); extern int sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr); @@ -155,7 +154,7 @@ sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t keyid, /* hmac list handling */ -extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint8_t num_hmacs); +extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint16_t num_hmacs); extern void sctp_free_hmaclist(sctp_hmaclist_t * list); extern int sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id); extern sctp_hmaclist_t *sctp_copy_hmaclist(sctp_hmaclist_t * list); diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c index d558bd82..bfd7f816 100644 --- a/freebsd/sys/netinet/sctp_bsd_addr.c +++ b/freebsd/sys/netinet/sctp_bsd_addr.c @@ -295,9 +295,12 @@ sctp_addr_change(struct ifaddr *ifa, int cmd) { uint32_t 
ifa_flags = 0; + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + return; + } /* * BSD only has one VRF, if this changes we will need to hook in the - * right things here to get the id to pass to the address managment + * right things here to get the id to pass to the address management * routine. */ if (SCTP_BASE_VAR(first_time) == 0) { @@ -383,17 +386,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, return (m); } if (allonebuf) { - int siz; - - if (SCTP_BUF_IS_EXTENDED(m)) { - siz = SCTP_BUF_EXTEND_SIZE(m); - } else { - if (want_header) - siz = MHLEN; - else - siz = MLEN; - } - if (siz < space_needed) { + if (SCTP_BUF_SIZE(m) < space_needed) { m_freem(m); return (NULL); } @@ -404,9 +397,7 @@ sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - if (SCTP_BUF_IS_EXTENDED(m)) { - sctp_log_mb(m, SCTP_MBUF_IALLOC); - } + sctp_log_mb(m, SCTP_MBUF_IALLOC); } #endif return (m); diff --git a/freebsd/sys/netinet/sctp_cc_functions.c b/freebsd/sys/netinet/sctp_cc_functions.c index 9758e011..68dc460a 100644 --- a/freebsd/sys/netinet/sctp_cc_functions.c +++ b/freebsd/sys/netinet/sctp_cc_functions.c @@ -54,6 +54,19 @@ __FBSDID("$FreeBSD$"); #define SHIFT_MPTCP_MULTI_Z 16 #define SHIFT_MPTCP_MULTI 8 +static void +sctp_enforce_cwnd_limit(struct sctp_association *assoc, struct sctp_nets *net) +{ + if ((assoc->max_cwnd > 0) && + (net->cwnd > assoc->max_cwnd) && + (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) { + net->cwnd = assoc->max_cwnd; + if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { + net->cwnd = net->mtu - sizeof(struct sctphdr); + } + } +} + static void sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) { @@ -82,8 +95,9 @@ sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) net->cwnd = net->mtu - sizeof(struct sctphdr); } } + sctp_enforce_cwnd_limit(assoc, net); net->ssthresh = assoc->peers_rwnd; - SDT_PROBE(sctp, cwnd, net, init, + SDT_PROBE5(sctp, cwnd, net, init, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, 0, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & @@ -180,7 +194,8 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, } } net->cwnd = net->ssthresh; - SDT_PROBE(sctp, cwnd, net, fr, + sctp_enforce_cwnd_limit(asoc, net); + SDT_PROBE5(sctp, cwnd, net, fr, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { @@ -213,7 +228,8 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, - stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_1); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } @@ -228,7 +244,7 @@ sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, } /* Defines for instantaneous bw decisions */ -#define SCTP_INST_LOOSING 1 /* Loosing to other flows */ +#define SCTP_INST_LOOSING 1 /* Losing to other flows */ #define SCTP_INST_NEUTRAL 2 /* Neutral, no indication */ #define SCTP_INST_GAINING 3 /* Gaining, step down possible */ @@ -247,7 +263,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, */ /* Probe point 5 */ probepoint |= ((5 << 16) | 1); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) 
| net->rtt), @@ -268,7 +284,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -292,7 +308,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, */ /* Probe point 6 */ probepoint |= ((6 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -304,7 +320,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -335,7 +351,7 @@ cc_bw_same(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, */ /* Probe point 7 */ probepoint |= ((7 << 16) | net->cc_mod.rtcc.ret_from_eq); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -384,7 +400,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 /* We caused it maybe.. back off? */ /* PROBE POINT 1 */ probepoint |= ((1 << 16) | 1); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -402,7 +418,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 } /* Probe point 2 */ probepoint |= ((2 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -415,7 +431,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -428,6 +444,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != SCTP_INST_GAINING)) { net->cwnd += net->mtu; + sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 2; @@ -438,7 +455,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 /* bw & rtt decreased */ /* Probe point 3 */ probepoint |= ((3 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -450,7 +467,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -459,6 +476,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != 
SCTP_INST_GAINING)) { net->cwnd += net->mtu; + sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 3; @@ -469,7 +487,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 /* The bw decreased but rtt stayed the same */ /* Probe point 4 */ probepoint |= ((4 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -481,7 +499,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -490,6 +508,7 @@ cc_bw_decrease(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 if ((net->cc_mod.rtcc.vol_reduce) && (inst_ind != SCTP_INST_GAINING)) { net->cwnd += net->mtu; + sctp_enforce_cwnd_limit(&stcb->asoc, net); net->cc_mod.rtcc.vol_reduce--; } net->cc_mod.rtcc.last_step_state = 4; @@ -518,7 +537,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 */ /* PROBE POINT 0 */ probepoint = (((uint64_t) net->cwnd) << 32); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -530,7 +549,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 oth |= net->cc_mod.rtcc.step_cnt; oth <<= 16; oth |= net->cc_mod.rtcc.last_step_state; - SDT_PROBE(sctp, cwnd, net, rttstep, + SDT_PROBE5(sctp, cwnd, net, rttstep, vtag, ((net->cc_mod.rtcc.lbw << 32) | nbw), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -546,7 +565,7 @@ cc_bw_increase(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw, uint6 return (0); } -/* RTCC Algoritm to limit growth of cwnd, return +/* RTCC Algorithm to limit growth of cwnd, return * true if you want to NOT allow cwnd growth */ static int @@ -630,7 +649,7 @@ cc_bw_limit(struct sctp_tcb *stcb, struct sctp_nets *net, uint64_t nbw) /* Can't determine do not change */ probepoint |= ((0xd << 16) | inst_ind); } - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((nbw << 32) | inst_bw), ((net->cc_mod.rtcc.lbw_rtt << 32) | rtt), @@ -790,7 +809,7 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb, (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, nbw, ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -884,11 +903,12 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb, break; } net->cwnd += incr; + sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS); } - SDT_PROBE(sctp, cwnd, net, ack, + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -950,7 +970,8 @@ sctp_cwnd_update_after_sack_common(struct sctp_tcb *stcb, break; } net->cwnd += incr; - SDT_PROBE(sctp, cwnd, net, ack, + sctp_enforce_cwnd_limit(asoc, net); + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -982,7 +1003,7 @@ sctp_cwnd_update_exit_pf_common(struct sctp_tcb *stcb, struct 
sctp_nets *net) old_cwnd = net->cwnd; net->cwnd = net->mtu; - SDT_PROBE(sctp, cwnd, net, ack, + SDT_PROBE5(sctp, cwnd, net, ack, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, old_cwnd, net->cwnd); SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n", @@ -1053,7 +1074,7 @@ sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net) } net->cwnd = net->mtu; net->partial_bytes_acked = 0; - SDT_PROBE(sctp, cwnd, net, to, + SDT_PROBE5(sctp, cwnd, net, to, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -1091,7 +1112,7 @@ sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets * } else { /* * Further tuning down required over the drastic - * orginal cut + * original cut */ net->ssthresh -= (net->mtu * num_pkt_lost); net->cwnd -= (net->mtu * num_pkt_lost); @@ -1113,7 +1134,7 @@ sctp_cwnd_update_after_ecn_echo_common(struct sctp_tcb *stcb, struct sctp_nets * net->RTO <<= 1; } net->cwnd = net->ssthresh; - SDT_PROBE(sctp, cwnd, net, ecn, + SDT_PROBE5(sctp, cwnd, net, ecn, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -1132,12 +1153,9 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, uint32_t * bottle_bw, uint32_t * on_queue) { uint32_t bw_avail; - int rtt; unsigned int incr; int old_cwnd = net->cwnd; - /* need real RTT in msd for this calc */ - rtt = net->rtt / 1000; /* get bottle neck bw */ *bottle_bw = ntohl(cp->bottle_bw); /* and whats on queue */ @@ -1146,10 +1164,11 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, * adjust the on-queue if our flight is more it could be that the * router has not yet gotten data "in-flight" to it */ - if (*on_queue < net->flight_size) + if (*on_queue < net->flight_size) { *on_queue = net->flight_size; - /* calculate the available space */ - bw_avail = (*bottle_bw * rtt) / 1000; + } + /* rtt is measured in micro seconds, bottle_bw in bytes per second */ + bw_avail = (uint32_t) (((uint64_t) (*bottle_bw) * net->rtt) / (uint64_t) 1000000); if (bw_avail > *bottle_bw) { /* * Cap the growth to no more than the bottle neck. This can @@ -1169,7 +1188,6 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, int seg_inflight, seg_onqueue, my_portion; net->partial_bytes_acked = 0; - /* how much are we over queue size? 
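/*
 * Illustration: sctp_enforce_cwnd_limit(), added at the top of this
 * file and called after every cwnd adjustment above, caps cwnd at the
 * association's configured max_cwnd while never clamping below one MTU
 * worth of SCTP payload. Standalone form of the clamp:
 */
#include <stdint.h>

static uint32_t
example_clamp_cwnd(uint32_t cwnd, uint32_t max_cwnd, uint32_t mtu_payload)
{
	/* mtu_payload stands in for net->mtu - sizeof(struct sctphdr) */
	if ((max_cwnd > 0) && (cwnd > max_cwnd) && (cwnd > mtu_payload)) {
		cwnd = max_cwnd;
		if (cwnd < mtu_payload)
			cwnd = mtu_payload;
	}
	return (cwnd);
}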
*/ incr = *on_queue - bw_avail; if (stcb->asoc.seen_a_sack_this_pkt) { @@ -1232,9 +1250,10 @@ sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, /* We always have 1 MTU */ net->cwnd = net->mtu; } + sctp_enforce_cwnd_limit(&stcb->asoc, net); if (net->cwnd - old_cwnd != 0) { /* log only changes */ - SDT_PROBE(sctp, cwnd, net, pd, + SDT_PROBE5(sctp, cwnd, net, pd, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -1256,7 +1275,8 @@ sctp_cwnd_update_after_output(struct sctp_tcb *stcb, net->ssthresh = net->cwnd; if (burst_limit) { net->cwnd = (net->flight_size + (burst_limit * net->mtu)); - SDT_PROBE(sctp, cwnd, net, bl, + sctp_enforce_cwnd_limit(&stcb->asoc, net); + SDT_PROBE5(sctp, cwnd, net, bl, stcb->asoc.my_vtag, ((stcb->sctp_ep->sctp_lport << 16) | (stcb->rport)), net, @@ -1272,7 +1292,7 @@ sctp_cwnd_update_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all, int will_exit) { - /* Passing a zero argument in last disables the rtcc algoritm */ + /* Passing a zero argument in last disables the rtcc algorithm */ sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 0); } @@ -1280,13 +1300,13 @@ static void sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost) { - /* Passing a zero argument in last disables the rtcc algoritm */ + /* Passing a zero argument in last disables the rtcc algorithm */ sctp_cwnd_update_after_ecn_echo_common(stcb, net, in_window, num_pkt_lost, 0); } /* Here starts the RTCCVAR type CC invented by RRS which * is a slight mod to RFC2581. We reuse a common routine or - * two since these algoritms are so close and need to + * two since these algorithms are so close and need to * remain the same. 
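/*
 * Illustration: the packet-drop handler above replaces a millisecond
 * RTT approximation with an exact bandwidth-delay product: net->rtt is
 * kept in microseconds, bottle_bw in bytes per second, and the product
 * is widened to 64 bits so it cannot overflow before the divide.
 * Standalone form:
 */
#include <stdint.h>

static uint32_t
example_bw_avail(uint32_t bottle_bw, uint32_t rtt_us)
{
	/* bytes/second * microseconds / 1000000 = bytes in flight */
	return ((uint32_t)(((uint64_t)bottle_bw * rtt_us) / 1000000));
}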
*/ static void @@ -1332,7 +1352,7 @@ sctp_cwnd_new_rtcc_transmission_begins(struct sctp_tcb *stcb, probepoint = (((uint64_t) net->cwnd) << 32); /* Probe point 8 */ probepoint |= ((8 << 16) | 0); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, ((net->cc_mod.rtcc.lbw << 32) | 0), ((net->cc_mod.rtcc.lbw_rtt << 32) | net->rtt), @@ -1395,7 +1415,7 @@ sctp_set_rtcc_initial_cc_param(struct sctp_tcb *stcb, vtag = (net->rtt << 32) | (((uint32_t) (stcb->sctp_ep->sctp_lport)) << 16) | (stcb->rport); - SDT_PROBE(sctp, cwnd, net, rttvar, + SDT_PROBE5(sctp, cwnd, net, rttvar, vtag, 0, 0, @@ -1492,7 +1512,7 @@ sctp_cwnd_update_rtcc_after_sack(struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all, int will_exit) { - /* Passing a one argument at the last enables the rtcc algoritm */ + /* Passing a one argument at the last enables the rtcc algorithm */ sctp_cwnd_update_after_sack_common(stcb, asoc, accum_moved, reneged_all, will_exit, 1); } @@ -1508,13 +1528,13 @@ sctp_rtt_rtcc_calculated(struct sctp_tcb *stcb SCTP_UNUSED, struct sctp_hs_raise_drop { int32_t cwnd; - int32_t increase; - int32_t drop_percent; + int8_t increase; + int8_t drop_percent; }; #define SCTP_HS_TABLE_SIZE 73 -struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = { +static const struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = { {38, 1, 50}, /* 0 */ {118, 2, 44}, /* 1 */ {221, 3, 41}, /* 2 */ @@ -1594,6 +1614,7 @@ static void sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net) { int cur_val, i, indx, incr; + int old_cwnd = net->cwnd; cur_val = net->cwnd >> 10; indx = SCTP_HS_TABLE_SIZE - 1; @@ -1602,14 +1623,8 @@ sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net) /* normal mode */ if (net->net_ack > net->mtu) { net->cwnd += net->mtu; - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { - sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS); - } } else { net->cwnd += net->net_ack; - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { - sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS); - } } } else { for (i = net->last_hs_used; i < SCTP_HS_TABLE_SIZE; i++) { @@ -1619,11 +1634,12 @@ sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net) } } net->last_hs_used = indx; - incr = ((sctp_cwnd_adjust[indx].increase) << 10); + incr = (((int32_t) sctp_cwnd_adjust[indx].increase) << 10); net->cwnd += incr; - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { - sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS); - } + } + sctp_enforce_cwnd_limit(&stcb->asoc, net); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SS); } } @@ -1644,7 +1660,7 @@ sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net) } else { /* drop by the proper amount */ net->ssthresh = net->cwnd - (int)((net->cwnd / 100) * - sctp_cwnd_adjust[net->last_hs_used].drop_percent); + (int32_t) sctp_cwnd_adjust[net->last_hs_used].drop_percent); net->cwnd = net->ssthresh; /* now where are we */ indx = net->last_hs_used; @@ -1662,6 +1678,7 @@ sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net) net->last_hs_used = indx; } } + sctp_enforce_cwnd_limit(&stcb->asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); } @@ -1718,7 +1735,8 @@ 
sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb, } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, - stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_2); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } @@ -1793,9 +1811,7 @@ sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb, if (net->cwnd <= net->ssthresh) { /* We are in slow start */ if (net->flight_size + net->net_ack >= net->cwnd) { - sctp_hs_cwnd_increase(stcb, net); - } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, @@ -1809,6 +1825,7 @@ sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb, (net->partial_bytes_acked >= net->cwnd)) { net->partial_bytes_acked -= net->cwnd; net->cwnd += net->mtu; + sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_CA); @@ -2047,6 +2064,7 @@ htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net) SCTP_CWND_LOG_FROM_SS); } } + sctp_enforce_cwnd_limit(&stcb->asoc, net); } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, net->net_ack, @@ -2068,6 +2086,7 @@ htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net) */ net->cwnd += net->mtu; net->partial_bytes_acked = 0; + sctp_enforce_cwnd_limit(&stcb->asoc, net); htcp_alpha_update(&net->cc_mod.htcp_ca); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, net->mtu, @@ -2114,6 +2133,7 @@ sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) */ net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND)); net->ssthresh = stcb->asoc.peers_rwnd; + sctp_enforce_cwnd_limit(&stcb->asoc, net); htcp_init(net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { @@ -2217,6 +2237,7 @@ sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb, htcp_reset(&net->cc_mod.htcp_ca); net->ssthresh = htcp_recalc_ssthresh(net); net->cwnd = net->ssthresh; + sctp_enforce_cwnd_limit(asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); @@ -2247,7 +2268,8 @@ sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb, } sctp_timer_stop(SCTP_TIMER_TYPE_SEND, - stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_CC_FUNCTIONS + SCTP_LOC_3); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net); } @@ -2296,13 +2318,14 @@ sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, net->RTO <<= 1; } net->cwnd = net->ssthresh; + sctp_enforce_cwnd_limit(&stcb->asoc, net); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); } } } -struct sctp_cc_functions sctp_cc_functions[] = { +const struct sctp_cc_functions sctp_cc_functions[] = { { .sctp_set_initial_cc_param = sctp_set_initial_cc_param, .sctp_cwnd_update_after_sack = sctp_cwnd_update_after_sack, diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h index 0ede04ca..ecde4fee 100644 --- a/freebsd/sys/netinet/sctp_constants.h +++ b/freebsd/sys/netinet/sctp_constants.h @@ -66,6 +66,10 @@ __FBSDID("$FreeBSD$"); */ #define SCTP_LARGEST_INIT_ACCEPTED (65535 - 2048) +/* Largest length of a chunk */ +#define 
SCTP_MAX_CHUNK_LENGTH 0xffff +/* Largest length of an error cause */ +#define SCTP_MAX_CAUSE_LENGTH 0xffff /* Number of addresses where we just skip the counting */ #define SCTP_COUNT_LIMIT 40 @@ -267,20 +271,11 @@ __FBSDID("$FreeBSD$"); /* how many addresses per assoc remote and local */ #define SCTP_SCALE_FOR_ADDR 2 -/* default AUTO_ASCONF mode enable(1)/disable(0) value (sysctl) */ -#define SCTP_DEFAULT_AUTO_ASCONF 1 - /* default MULTIPLE_ASCONF mode enable(1)/disable(0) value (sysctl) */ #define SCTP_DEFAULT_MULTIPLE_ASCONFS 0 -/* default MOBILITY_BASE mode enable(1)/disable(0) value (sysctl) */ -#define SCTP_DEFAULT_MOBILITY_BASE 0 - -/* default MOBILITY_FASTHANDOFF mode enable(1)/disable(0) value (sysctl) */ -#define SCTP_DEFAULT_MOBILITY_FASTHANDOFF 0 - /* - * Theshold for rwnd updates, we have to read (sb_hiwat >> + * Threshold for rwnd updates, we have to read (sb_hiwat >> * SCTP_RWND_HIWAT_SHIFT) before we will look to see if we need to send a * window update sack. When we look, we compare the last rwnd we sent vs the * current rwnd. It too must be greater than this value. Using 3 divdes the @@ -350,6 +345,7 @@ __FBSDID("$FreeBSD$"); #define SCTP_RTT_FROM_NON_DATA 0 #define SCTP_RTT_FROM_DATA 1 +#define PR_SCTP_UNORDERED_FLAG 0x0001 /* IP hdr (20/40) + 12+2+2 (enet) + sctp common 12 */ #define SCTP_FIRST_MBUF_RESV 68 @@ -391,8 +387,8 @@ __FBSDID("$FreeBSD$"); /* align to 32-bit sizes */ #define SCTP_SIZE32(x) ((((x) + 3) >> 2) << 2) -#define IS_SCTP_CONTROL(a) ((a)->chunk_type != SCTP_DATA) -#define IS_SCTP_DATA(a) ((a)->chunk_type == SCTP_DATA) +#define IS_SCTP_CONTROL(a) (((a)->chunk_type != SCTP_DATA) && ((a)->chunk_type != SCTP_IDATA)) +#define IS_SCTP_DATA(a) (((a)->chunk_type == SCTP_DATA) || ((a)->chunk_type == SCTP_IDATA)) /* SCTP parameter types */ @@ -467,7 +463,7 @@ __FBSDID("$FreeBSD$"); /* - * SCTP states for internal state machine XXX (should match "user" values) + * SCTP states for internal state machine */ #define SCTP_STATE_EMPTY 0x0000 #define SCTP_STATE_INUSE 0x0001 @@ -518,7 +514,7 @@ __FBSDID("$FreeBSD$"); /* Maximum the mapping array will grow to (TSN mapping array) */ #define SCTP_MAPPING_ARRAY 512 -/* size of the inital malloc on the mapping array */ +/* size of the initial malloc on the mapping array */ #define SCTP_INITIAL_MAPPING_ARRAY 16 /* how much we grow the mapping array each call */ #define SCTP_MAPPING_ARRAY_INCR 32 @@ -621,10 +617,6 @@ __FBSDID("$FreeBSD$"); /* 30 seconds + RTO (in ms) */ #define SCTP_HB_DEFAULT_MSEC 30000 -/* Max time I will wait for Shutdown to complete */ -#define SCTP_DEF_MAX_SHUTDOWN_SEC 180 - - /* * This is how long a secret lives, NOT how long a cookie lives how many * ticks the current secret will live. 
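/*
 * Illustration: SCTP_SIZE32() above rounds a length up to the next
 * multiple of four, the padding granularity required of SCTP chunks
 * and parameters. Function form of the same expression:
 */
#include <stdint.h>

static uint32_t
example_size32(uint32_t x)
{
	/* e.g. 1 -> 4, 5 -> 8, 8 -> 8 */
	return ((((x) + 3) >> 2) << 2);
}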
@@ -647,7 +639,7 @@ __FBSDID("$FreeBSD$"); #define SCTP_DEF_PMTU_RAISE_SEC 600 /* 10 min between raise attempts */ -/* How many streams I request initally by default */ +/* How many streams I request initially by default */ #define SCTP_OSTREAM_INITIAL 10 #define SCTP_ISTREAM_INITIAL 2048 @@ -774,18 +766,19 @@ __FBSDID("$FreeBSD$"); */ /* File defines */ -#define SCTP_FROM_SCTP_INPUT 0x10000000 -#define SCTP_FROM_SCTP_PCB 0x20000000 -#define SCTP_FROM_SCTP_INDATA 0x30000000 -#define SCTP_FROM_SCTP_TIMER 0x40000000 -#define SCTP_FROM_SCTP_USRREQ 0x50000000 -#define SCTP_FROM_SCTPUTIL 0x60000000 -#define SCTP_FROM_SCTP6_USRREQ 0x70000000 -#define SCTP_FROM_SCTP_ASCONF 0x80000000 -#define SCTP_FROM_SCTP_OUTPUT 0x90000000 -#define SCTP_FROM_SCTP_PEELOFF 0xa0000000 -#define SCTP_FROM_SCTP_PANDA 0xb0000000 -#define SCTP_FROM_SCTP_SYSCTL 0xc0000000 +#define SCTP_FROM_SCTP_INPUT 0x10000000 +#define SCTP_FROM_SCTP_PCB 0x20000000 +#define SCTP_FROM_SCTP_INDATA 0x30000000 +#define SCTP_FROM_SCTP_TIMER 0x40000000 +#define SCTP_FROM_SCTP_USRREQ 0x50000000 +#define SCTP_FROM_SCTPUTIL 0x60000000 +#define SCTP_FROM_SCTP6_USRREQ 0x70000000 +#define SCTP_FROM_SCTP_ASCONF 0x80000000 +#define SCTP_FROM_SCTP_OUTPUT 0x90000000 +#define SCTP_FROM_SCTP_PEELOFF 0xa0000000 +#define SCTP_FROM_SCTP_PANDA 0xb0000000 +#define SCTP_FROM_SCTP_SYSCTL 0xc0000000 +#define SCTP_FROM_SCTP_CC_FUNCTIONS 0xd0000000 /* Location ID's */ #define SCTP_LOC_1 0x00000001 @@ -821,6 +814,8 @@ __FBSDID("$FreeBSD$"); #define SCTP_LOC_31 0x0000001f #define SCTP_LOC_32 0x00000020 #define SCTP_LOC_33 0x00000021 +#define SCTP_LOC_34 0x00000022 +#define SCTP_LOC_35 0x00000023 /* Free assoc codes */ @@ -892,12 +887,19 @@ __FBSDID("$FreeBSD$"); /* modular comparison */ /* See RFC 1982 for details. */ -#define SCTP_SSN_GT(a, b) (((a < b) && ((uint16_t)(b - a) > (1U<<15))) || \ - ((a > b) && ((uint16_t)(a - b) < (1U<<15)))) -#define SCTP_SSN_GE(a, b) (SCTP_SSN_GT(a, b) || (a == b)) -#define SCTP_TSN_GT(a, b) (((a < b) && ((uint32_t)(b - a) > (1U<<31))) || \ - ((a > b) && ((uint32_t)(a - b) < (1U<<31)))) -#define SCTP_TSN_GE(a, b) (SCTP_TSN_GT(a, b) || (a == b)) +#define SCTP_UINT16_GT(a, b) (((a < b) && ((uint16_t)(b - a) > (1U<<15))) || \ + ((a > b) && ((uint16_t)(a - b) < (1U<<15)))) +#define SCTP_UINT16_GE(a, b) (SCTP_UINT16_GT(a, b) || (a == b)) +#define SCTP_UINT32_GT(a, b) (((a < b) && ((uint32_t)(b - a) > (1U<<31))) || \ + ((a > b) && ((uint32_t)(a - b) < (1U<<31)))) +#define SCTP_UINT32_GE(a, b) (SCTP_UINT32_GT(a, b) || (a == b)) + +#define SCTP_SSN_GT(a, b) SCTP_UINT16_GT(a, b) +#define SCTP_SSN_GE(a, b) SCTP_UINT16_GE(a, b) +#define SCTP_TSN_GT(a, b) SCTP_UINT32_GT(a, b) +#define SCTP_TSN_GE(a, b) SCTP_UINT32_GE(a, b) +#define SCTP_MSGID_GT(o, a, b) ((o == 1) ? SCTP_UINT16_GT((uint16_t)a, (uint16_t)b) : SCTP_UINT32_GT(a, b)) +#define SCTP_MSGID_GE(o, a, b) ((o == 1) ? SCTP_UINT16_GE((uint16_t)a, (uint16_t)b) : SCTP_UINT32_GE(a, b)) /* Mapping array manipulation routines */ #define SCTP_IS_TSN_PRESENT(arry, gap) ((arry[(gap >> 3)] >> (gap & 0x07)) & 0x01) @@ -920,7 +922,7 @@ __FBSDID("$FreeBSD$"); * element. Each entry will take 2 4 byte ints (and of course the overhead * of the next pointer as well). Using 15 as an example will yield * ((8 * * 15) + 8) or 128 bytes of overhead for each timewait block that gets - * initialized. Increasing it to 31 would yeild 256 bytes per block. + * initialized. Increasing it to 31 would yield 256 bytes per block. 
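/*
 * Illustration: the comparison macros above implement RFC 1982 serial
 * number arithmetic, generalized so the 16-bit SSN, the 32-bit TSN and
 * the new ordered/unordered message-id comparisons all share the two
 * width-specific definitions. Function form of the 32-bit case:
 */
#include <stdint.h>

static int
example_serial_gt32(uint32_t a, uint32_t b)
{
	/* "a > b" when a is ahead of b by less than half the sequence space */
	return (((a < b) && ((uint32_t)(b - a) > (1U << 31))) ||
	    ((a > b) && ((uint32_t)(a - b) < (1U << 31))));
}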
*/ #define SCTP_NUMBER_IN_VTAG_BLOCK 15 /* @@ -986,10 +988,7 @@ __FBSDID("$FreeBSD$"); (((uint8_t *)&(a)->s_addr)[1] == 168))) #define IN4_ISLOOPBACK_ADDRESS(a) \ - ((((uint8_t *)&(a)->s_addr)[0] == 127) && \ - (((uint8_t *)&(a)->s_addr)[1] == 0) && \ - (((uint8_t *)&(a)->s_addr)[2] == 0) && \ - (((uint8_t *)&(a)->s_addr)[3] == 1)) + (((uint8_t *)&(a)->s_addr)[0] == 127) #define IN4_ISLINKLOCAL_ADDRESS(a) \ ((((uint8_t *)&(a)->s_addr)[0] == 169) && \ diff --git a/freebsd/sys/netinet/sctp_dtrace_declare.h b/freebsd/sys/netinet/sctp_dtrace_declare.h index f6fe48bd..c5c8f9ce 100644 --- a/freebsd/sys/netinet/sctp_dtrace_declare.h +++ b/freebsd/sys/netinet/sctp_dtrace_declare.h @@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_DTRACE_DECLARE_H_ #define _NETINET_SCTP_DTRACE_DECLARE_H_ -#include #include #include diff --git a/freebsd/sys/netinet/sctp_dtrace_define.h b/freebsd/sys/netinet/sctp_dtrace_define.h index 0bfe18c0..19f44da4 100644 --- a/freebsd/sys/netinet/sctp_dtrace_define.h +++ b/freebsd/sys/netinet/sctp_dtrace_define.h @@ -35,7 +35,6 @@ __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_DTRACE_DEFINE_H_ #define _NETINET_SCTP_DTRACE_DEFINE_H_ -#include #include #include @@ -46,131 +45,131 @@ SDT_PROVIDER_DEFINE(sctp); /********************************************************/ /* Initial */ SDT_PROBE_DEFINE5(sctp, cwnd, net, init, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* ACK-INCREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, ack, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. 
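/*
 * Illustration: the SDT_PROBE_DEFINE5() declarations in this header
 * pair with the call sites renamed throughout the patch from
 * SDT_PROBE() with a literal argument count to SDT_PROBE5(). A sketch
 * of a matching fire for the cwnd init probe, assuming kernel context
 * and the SDT_PROVIDER_DEFINE(sctp) shown above:
 */
#include <sys/param.h>
#include <sys/sdt.h>

static void
example_fire_init_probe(uint32_t vtag, uint32_t ports, void *net_ptr,
    int old_cwnd, int new_cwnd)
{
	/* five arguments, matching the five declared types */
	SDT_PROBE5(sctp, cwnd, net, init, vtag, ports, net_ptr, old_cwnd,
	    new_cwnd);
}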
*/ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* ACK-INCREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, rttvar, - "uint64_t", /* The Vtag << 32 | localport << 16 | remoteport */ - "uint64_t", /* obw | nbw */ - "uint64_t", /* bwrtt | newrtt */ - "uint64_t", /* flight */ - "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ + "uint64_t", /* The Vtag << 32 | localport << 16 | + * remoteport */ + "uint64_t", /* obw | nbw */ + "uint64_t", /* bwrtt | newrtt */ + "uint64_t", /* flight */ + "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ SDT_PROBE_DEFINE5(sctp, cwnd, net, rttstep, - "uint64_t", /* The Vtag << 32 | localport << 16 | remoteport */ - "uint64_t", /* obw | nbw */ - "uint64_t", /* bwrtt | newrtt */ - "uint64_t", /* flight */ - "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ + "uint64_t", /* The Vtag << 32 | localport << 16 | + * remoteport */ + "uint64_t", /* obw | nbw */ + "uint64_t", /* bwrtt | newrtt */ + "uint64_t", /* flight */ + "uint64_t"); /* (cwnd << 32) | point << 16 | retval(0/1) */ /* FastRetransmit-DECREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, fr, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* TimeOut-DECREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, to, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* BurstLimit-DECREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, bl, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* ECN-DECREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, ecn, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. 
- */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /* PacketDrop-DECREASE */ SDT_PROBE_DEFINE5(sctp, cwnd, net, pd, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The old value of the cwnd */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The old value of the cwnd */ + "int"); /* The new value of the cwnd */ /********************************************************/ /* Rwnd probe - tracks changes in the receiver window for an assoc */ /********************************************************/ SDT_PROBE_DEFINE4(sctp, rwnd, assoc, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "int", /* The up/down amount */ + "int"); /* The new value of the cwnd */ /********************************************************/ /* flight probe - tracks changes in the flight size on a net or assoc */ /********************************************************/ SDT_PROBE_DEFINE5(sctp, flightsize, net, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "uintptr_t", /* The pointer to the struct sctp_nets * changing */ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. */ + "uintptr_t", /* The pointer to the struct sctp_nets * + * changing */ + "int", /* The up/down amount */ + "int"); /* The new value of the cwnd */ /********************************************************/ /* The total flight version */ /********************************************************/ SDT_PROBE_DEFINE4(sctp, flightsize, assoc, val, - "uint32_t", /* The Vtag for this end */ - "uint32_t", /* - * The port number of the local side << 16 | port number - * of remote in network byte order. - */ - "int", /* The up/down amount */ - "int"); /* The new value of the cwnd */ + "uint32_t", /* The Vtag for this end */ + "uint32_t", /* The port number of the local side << 16 | + * port number of remote in network byte + * order. 
*/ + "int", /* The up/down amount */ + "int"); /* The new value of the cwnd */ #endif diff --git a/freebsd/sys/netinet/sctp_header.h b/freebsd/sys/netinet/sctp_header.h index 8f898a4b..3f4948dd 100644 --- a/freebsd/sys/netinet/sctp_header.h +++ b/freebsd/sys/netinet/sctp_header.h @@ -82,12 +82,6 @@ struct sctp_supported_addr_param { uint16_t addr_type[2]; /* array of supported address types */ } SCTP_PACKED; -/* ECN parameter */ -struct sctp_ecn_supported_param { - struct sctp_paramhdr ph;/* type=SCTP_ECN_CAPABLE */ -} SCTP_PACKED; - - /* heartbeat info parameter */ struct sctp_heartbeat_info_param { struct sctp_paramhdr ph; @@ -158,6 +152,23 @@ struct sctp_data_chunk { struct sctp_data dp; } SCTP_PACKED; +struct sctp_idata { + uint32_t tsn; + uint16_t stream_id; + uint16_t reserved; /* Where does the SSN go? */ + uint32_t msg_id; + union { + uint32_t protocol_id; + uint32_t fsn; /* Fragment Sequence Number */ + } ppid_fsn; + /* user data follows */ +} SCTP_PACKED; + +struct sctp_idata_chunk { + struct sctp_chunkhdr ch; + struct sctp_idata dp; +} SCTP_PACKED; + /* * Structures for the control chunks */ @@ -208,34 +219,6 @@ struct sctp_state_cookie { /* this is our definition... */ */ } SCTP_PACKED; - -/* Used for NAT state error cause */ -struct sctp_missing_nat_state { - uint16_t cause; - uint16_t length; - uint8_t data[]; -} SCTP_PACKED; - - -struct sctp_inv_mandatory_param { - uint16_t cause; - uint16_t length; - uint32_t num_param; - uint16_t param; - /* - * We include this to 0 it since only a missing cookie will cause - * this error. - */ - uint16_t resv; -} SCTP_PACKED; - -struct sctp_unresolv_addr { - uint16_t cause; - uint16_t length; - uint16_t addr_type; - uint16_t reserved; /* Only one invalid addr type */ -} SCTP_PACKED; - /* state cookie parameter */ struct sctp_state_cookie_param { struct sctp_paramhdr ph; @@ -376,28 +359,11 @@ struct sctp_shutdown_complete_chunk { struct sctp_chunkhdr ch; } SCTP_PACKED; -/* Oper error holding a stale cookie */ -struct sctp_stale_cookie_msg { - struct sctp_paramhdr ph;/* really an error cause */ - uint32_t time_usec; -} SCTP_PACKED; - struct sctp_adaptation_layer_indication { struct sctp_paramhdr ph; uint32_t indication; } SCTP_PACKED; -struct sctp_cookie_while_shutting_down { - struct sctphdr sh; - struct sctp_chunkhdr ch; - struct sctp_paramhdr ph;/* really an error cause */ -} SCTP_PACKED; - -struct sctp_shutdown_complete_msg { - struct sctphdr sh; - struct sctp_shutdown_complete_chunk shut_cmp; -} SCTP_PACKED; - /* * draft-ietf-tsvwg-addip-sctp */ @@ -429,6 +395,12 @@ struct sctp_strseq { uint16_t sequence; } SCTP_PACKED; +struct sctp_strseq_mid { + uint16_t stream; + uint16_t flags; + uint32_t msg_id; +}; + struct sctp_forward_tsn_msg { struct sctphdr sh; struct sctp_forward_tsn_chunk msg; @@ -456,6 +428,11 @@ struct sctp_pktdrop_chunk { /**********STREAM RESET STUFF ******************/ +struct sctp_stream_reset_request { + struct sctp_paramhdr ph; + uint32_t request_seq; +} SCTP_PACKED; + struct sctp_stream_reset_out_request { struct sctp_paramhdr ph; uint32_t request_seq; /* monotonically increasing seq no */ @@ -470,7 +447,6 @@ struct sctp_stream_reset_in_request { uint16_t list_of_streams[]; /* if not all list of streams */ } SCTP_PACKED; - struct sctp_stream_reset_tsn_request { struct sctp_paramhdr ph; uint32_t request_seq; @@ -556,12 +532,6 @@ struct sctp_auth_chunk { uint8_t hmac[]; } SCTP_PACKED; -struct sctp_auth_invalid_hmac { - struct sctp_paramhdr ph; - uint16_t hmac_id; - uint16_t padding; -} SCTP_PACKED; - /* * we 
pre-reserve enough room for a ECNE or CWR AND a SACK with no missing * pieces. If ENCE is missing we could have a couple of blocks. This way we diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c index 07d8fd2b..12c2c80f 100644 --- a/freebsd/sys/netinet/sctp_indata.c +++ b/freebsd/sys/netinet/sctp_indata.c @@ -36,18 +36,22 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include -#include #include +#include #include #include -#include -#include #include +#include #include - - +#include +#include +#include +#include +#include +#include /* * NOTES: On the outbound side of things I need to check the sack timer to * see if I should generate a sack into the chunk queue (if I have data to @@ -57,6 +61,13 @@ __FBSDID("$FreeBSD$"); * This will cause sctp_service_queues() to get called on the top entry in * the list. */ +static void +sctp_add_chk_to_control(struct sctp_queued_to_read *control, + struct sctp_stream_in *strm, + struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_tmit_chunk *chk, int lock_held); + void sctp_set_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) @@ -76,9 +87,9 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) * sctp_soreceive then we will fix this so that ONLY this * associations data is taken into account. */ - if (stcb->sctp_socket == NULL) + if (stcb->sctp_socket == NULL) { return (calc); - + } if (stcb->asoc.sb_cc == 0 && asoc->size_on_reasm_queue == 0 && asoc->size_on_all_streams == 0) { @@ -88,7 +99,6 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) } /* get actual space */ calc = (uint32_t) sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv); - /* * take out what has NOT been put on socket queue and we yet hold * for putting up. @@ -97,7 +107,6 @@ sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) asoc->cnt_on_reasm_queue * MSIZE)); calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_all_streams + asoc->cnt_on_all_streams * MSIZE)); - if (calc == 0) { /* out of space */ return (calc); @@ -124,7 +133,7 @@ sctp_build_readq_entry(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t tsn, uint32_t ppid, uint32_t context, uint16_t stream_no, - uint16_t stream_seq, uint8_t flags, + uint32_t stream_seq, uint8_t flags, struct mbuf *dm) { struct sctp_queued_to_read *read_queue_e = NULL; @@ -133,73 +142,26 @@ sctp_build_readq_entry(struct sctp_tcb *stcb, if (read_queue_e == NULL) { goto failed_build; } + memset(read_queue_e, 0, sizeof(struct sctp_queued_to_read)); read_queue_e->sinfo_stream = stream_no; read_queue_e->sinfo_ssn = stream_seq; read_queue_e->sinfo_flags = (flags << 8); read_queue_e->sinfo_ppid = ppid; read_queue_e->sinfo_context = context; - read_queue_e->sinfo_timetolive = 0; read_queue_e->sinfo_tsn = tsn; read_queue_e->sinfo_cumtsn = tsn; read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); + read_queue_e->top_fsn = read_queue_e->fsn_included = 0xffffffff; + TAILQ_INIT(&read_queue_e->reasm); read_queue_e->whoFrom = net; - read_queue_e->length = 0; atomic_add_int(&net->ref_count, 1); read_queue_e->data = dm; - read_queue_e->spec_flags = 0; - read_queue_e->tail_mbuf = NULL; - read_queue_e->aux_data = NULL; - read_queue_e->stcb = stcb; - read_queue_e->port_from = stcb->rport; - read_queue_e->do_not_ref_stcb = 0; - read_queue_e->end_added = 0; - read_queue_e->some_taken = 0; - read_queue_e->pdapi_aborted = 0; -failed_build: - return (read_queue_e); -} - - -/* - * Build out our readq entry based on the incoming packet. 
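The sctp_idata layout added to sctp_header.h above replaces the 16-bit SSN of a classic DATA chunk with a 32-bit message identifier plus a PPID/FSN union, growing the fixed part from 12 to 16 bytes. A self-contained check of that arithmetic; the mirror structs below are illustrative stand-ins for the packed kernel definitions:

#include <assert.h>
#include <stdint.h>

struct data_fixed {		/* mirrors struct sctp_data */
	uint32_t tsn;
	uint16_t stream_id;
	uint16_t stream_seq;
	uint32_t ppid;
};

struct idata_fixed {		/* mirrors struct sctp_idata */
	uint32_t tsn;
	uint16_t stream_id;
	uint16_t reserved;
	uint32_t msg_id;
	uint32_t ppid_or_fsn;	/* union of protocol_id and fsn */
};

int
main(void)
{
	assert(sizeof(struct data_fixed) == 12);
	assert(sizeof(struct idata_fixed) == 16);
	return (0);
}
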
- */ -static struct sctp_queued_to_read * -sctp_build_readq_entry_chk(struct sctp_tcb *stcb, - struct sctp_tmit_chunk *chk) -{ - struct sctp_queued_to_read *read_queue_e = NULL; - - sctp_alloc_a_readq(stcb, read_queue_e); - if (read_queue_e == NULL) { - goto failed_build; - } - read_queue_e->sinfo_stream = chk->rec.data.stream_number; - read_queue_e->sinfo_ssn = chk->rec.data.stream_seq; - read_queue_e->sinfo_flags = (chk->rec.data.rcv_flags << 8); - read_queue_e->sinfo_ppid = chk->rec.data.payloadtype; - read_queue_e->sinfo_context = stcb->asoc.context; - read_queue_e->sinfo_timetolive = 0; - read_queue_e->sinfo_tsn = chk->rec.data.TSN_seq; - read_queue_e->sinfo_cumtsn = chk->rec.data.TSN_seq; - read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); - read_queue_e->whoFrom = chk->whoTo; - read_queue_e->aux_data = NULL; - read_queue_e->length = 0; - atomic_add_int(&chk->whoTo->ref_count, 1); - read_queue_e->data = chk->data; - read_queue_e->tail_mbuf = NULL; read_queue_e->stcb = stcb; read_queue_e->port_from = stcb->rport; - read_queue_e->spec_flags = 0; - read_queue_e->do_not_ref_stcb = 0; - read_queue_e->end_added = 0; - read_queue_e->some_taken = 0; - read_queue_e->pdapi_aborted = 0; failed_build: return (read_queue_e); } - struct mbuf * sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) { @@ -225,9 +187,9 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) } seinfo = (struct sctp_extrcvinfo *)sinfo; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO) && - (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) { + (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_AVAIL)) { provide_nxt = 1; - len += CMSG_SPACE(sizeof(struct sctp_rcvinfo)); + len += CMSG_SPACE(sizeof(struct sctp_nxtinfo)); } else { provide_nxt = 0; } @@ -243,7 +205,7 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) use_extended = 0; } - ret = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA); + ret = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (ret == NULL) { /* No space */ return (ret); @@ -278,20 +240,20 @@ sctp_build_ctl_nchunk(struct sctp_inpcb *inp, struct sctp_sndrcvinfo *sinfo) cmh->cmsg_len = CMSG_LEN(sizeof(struct sctp_nxtinfo)); cmh->cmsg_type = SCTP_NXTINFO; nxtinfo = (struct sctp_nxtinfo *)CMSG_DATA(cmh); - nxtinfo->nxt_sid = seinfo->sreinfo_next_stream; + nxtinfo->nxt_sid = seinfo->serinfo_next_stream; nxtinfo->nxt_flags = 0; - if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) { + if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_IS_UNORDERED) { nxtinfo->nxt_flags |= SCTP_UNORDERED; } - if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) { + if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_IS_NOTIFICATION) { nxtinfo->nxt_flags |= SCTP_NOTIFICATION; } - if (seinfo->sreinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) { + if (seinfo->serinfo_next_flags & SCTP_NEXT_MSG_ISCOMPLETE) { nxtinfo->nxt_flags |= SCTP_COMPLETE; } - nxtinfo->nxt_ppid = seinfo->sreinfo_next_ppid; - nxtinfo->nxt_length = seinfo->sreinfo_next_length; - nxtinfo->nxt_assoc_id = seinfo->sreinfo_next_aid; + nxtinfo->nxt_ppid = seinfo->serinfo_next_ppid; + nxtinfo->nxt_length = seinfo->serinfo_next_length; + nxtinfo->nxt_assoc_id = seinfo->serinfo_next_aid; cmh = (struct cmsghdr *)((caddr_t)cmh + CMSG_SPACE(sizeof(struct sctp_nxtinfo))); SCTP_BUF_LEN(ret) += CMSG_SPACE(sizeof(struct sctp_nxtinfo)); } @@ -319,6 +281,7 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) { uint32_t gap, i, cumackp1; int fnd = 0; + int in_r 
= 0, in_nr = 0; if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { return; @@ -332,15 +295,20 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) return; } SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn); - if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { + in_r = SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap); + in_nr = SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, gap); + if ((in_r == 0) && (in_nr == 0)) { +#ifdef INVARIANTS + panic("Things are really messed up now"); +#else SCTP_PRINTF("gap:%x tsn:%x\n", gap, tsn); sctp_print_mapping_array(asoc); -#ifdef INVARIANTS - panic("Things are really messed up now!!"); #endif } - SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); - SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + if (in_nr == 0) + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (in_r) + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } @@ -360,197 +328,162 @@ sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) } } - -/* - * We are delivering currently from the reassembly queue. We must continue to - * deliver until we either: 1) run out of space. 2) run out of sequential - * TSN's 3) hit the SCTP_DATA_LAST_FRAG flag. - */ -static void -sctp_service_reassembly(struct sctp_tcb *stcb, struct sctp_association *asoc) +static int +sctp_place_control_in_stream(struct sctp_stream_in *strm, + struct sctp_association *asoc, + struct sctp_queued_to_read *control) { - struct sctp_tmit_chunk *chk, *nchk; - uint16_t nxt_todel; - uint16_t stream_no; - int end = 0; - int cntDel; - struct sctp_queued_to_read *control, *ctl, *nctl; - - if (stcb == NULL) - return; - - cntDel = stream_no = 0; - if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || - (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) || - (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { - /* socket above is long gone or going.. 
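The reworked sctp_mark_non_revokable() above first records whether the TSN's bit is present in each mapping array (in_r, in_nr) and only then moves it: set in the non-renegable array if absent there, cleared from the ordinary array if present. The bitmap operations behind those macros reduce to plain bit math; a sketch with hypothetical array names, where gap is the TSN's offset from the base of the mapping window:

#include <stdint.h>

static void
move_to_non_renegable(uint8_t *map, uint8_t *nr_map,
    uint32_t tsn, uint32_t base_tsn)
{
	/* Unsigned subtraction handles 32-bit TSN wrap. */
	uint32_t gap = tsn - base_tsn;
	uint8_t bit = 1 << (gap & 7);

	if ((nr_map[gap >> 3] & bit) == 0)
		nr_map[gap >> 3] |= bit;	/* now non-renegable */
	if ((map[gap >> 3] & bit) != 0)
		map[gap >> 3] &= ~bit;		/* no longer revokable */
}
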
*/ -abandon: - asoc->fragmented_delivery_inprogress = 0; - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - asoc->size_on_reasm_queue -= chk->send_size; - sctp_ucount_decr(asoc->cnt_on_reasm_queue); - /* - * Lose the data pointer, since its in the socket - * buffer - */ - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; + struct sctp_queued_to_read *at; + struct sctp_readhead *q; + uint8_t bits, unordered; + + bits = (control->sinfo_flags >> 8); + unordered = bits & SCTP_DATA_UNORDERED; + if (unordered) { + q = &strm->uno_inqueue; + if (asoc->idata_supported == 0) { + if (!TAILQ_EMPTY(q)) { + /* + * Only one stream can be here in old style + * -- abort + */ + return (-1); } - /* Now free the address and data */ - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); - /* sa_ignore FREED_MEMORY */ + TAILQ_INSERT_TAIL(q, control, next_instrm); + control->on_strm_q = SCTP_ON_UNORDERED; + return (0); } - return; + } else { + q = &strm->inqueue; } - SCTP_TCB_LOCK_ASSERT(stcb); - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - if (chk->rec.data.TSN_seq != (asoc->tsn_last_delivered + 1)) { - /* Can't deliver more :< */ - return; - } - stream_no = chk->rec.data.stream_number; - nxt_todel = asoc->strmin[stream_no].last_sequence_delivered + 1; - if (nxt_todel != chk->rec.data.stream_seq && - (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { - /* - * Not the next sequence to deliver in its stream OR - * unordered - */ - return; - } - if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { - - control = sctp_build_readq_entry_chk(stcb, chk); - if (control == NULL) { - /* out of memory? */ - return; - } - /* save it off for our future deliveries */ - stcb->asoc.control_pdapi = control; - if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) - end = 1; - else - end = 0; - sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); - sctp_add_to_readq(stcb->sctp_ep, - stcb, control, &stcb->sctp_socket->so_rcv, end, - SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); - cntDel++; + if ((bits & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + control->end_added = control->last_frag_seen = control->first_frag_seen = 1; + } + if (TAILQ_EMPTY(q)) { + /* Empty queue */ + TAILQ_INSERT_HEAD(q, control, next_instrm); + if (unordered) { + control->on_strm_q = SCTP_ON_UNORDERED; } else { - if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) - end = 1; - else - end = 0; - sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); - if (sctp_append_to_readq(stcb->sctp_ep, stcb, - stcb->asoc.control_pdapi, - chk->data, end, chk->rec.data.TSN_seq, - &stcb->sctp_socket->so_rcv)) { + control->on_strm_q = SCTP_ON_ORDERED; + } + return (0); + } else { + TAILQ_FOREACH(at, q, next_instrm) { + if (SCTP_TSN_GT(at->msg_id, control->msg_id)) { /* - * something is very wrong, either - * control_pdapi is NULL, or the tail_mbuf - * is corrupt, or there is a EOM already on - * the mbuf chain. 
+ * one in queue is bigger than the new one, + * insert before this one */ - if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { - goto abandon; + TAILQ_INSERT_BEFORE(at, control, next_instrm); + if (unordered) { + control->on_strm_q = SCTP_ON_UNORDERED; } else { -#ifdef INVARIANTS - if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { - panic("This should not happen control_pdapi NULL?"); + control->on_strm_q = SCTP_ON_ORDERED; + } + break; + } else if (at->msg_id == control->msg_id) { + /* + * Gak, He sent me a duplicate msg id + * number?? return -1 to abort. + */ + return (-1); + } else { + if (TAILQ_NEXT(at, next_instrm) == NULL) { + /* + * We are at the end, insert it + * after this one + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, at, + SCTP_STR_LOG_FROM_INSERT_TL); } - /* if we did not panic, it was a EOM */ - panic("Bad chunking ??"); -#else - if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { - SCTP_PRINTF("This should not happen control_pdapi NULL?\n"); + TAILQ_INSERT_AFTER(q, + at, control, next_instrm); + if (unordered) { + control->on_strm_q = SCTP_ON_UNORDERED; + } else { + control->on_strm_q = SCTP_ON_ORDERED; } - SCTP_PRINTF("Bad chunking ??\n"); - SCTP_PRINTF("Dumping re-assembly queue this will probably hose the association\n"); - -#endif - goto abandon; + break; } } - cntDel++; - } - /* pull it we did it */ - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { - asoc->fragmented_delivery_inprogress = 0; - if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { - asoc->strmin[stream_no].last_sequence_delivered++; - } - if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { - SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); - } - } else if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { - /* - * turn the flag back on since we just delivered - * yet another one. 
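sctp_place_control_in_stream() above keeps each stream queue sorted, walking the tailq and inserting before the first entry with a larger msg_id, rejecting duplicates, and falling through to a tail insert. The same sys/queue.h pattern in isolation, with illustrative types and a plain integer compare standing in for the serial-number macros:

#include <sys/queue.h>
#include <stdint.h>

struct entry {
	uint32_t key;
	TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(entry_head, entry);

/* Returns 0 on success, -1 when an entry with the same key exists. */
static int
ordered_insert(struct entry_head *q, struct entry *e)
{
	struct entry *at;

	TAILQ_FOREACH(at, q, link) {
		if (at->key > e->key) {
			TAILQ_INSERT_BEFORE(at, e, link);
			return (0);
		}
		if (at->key == e->key)
			return (-1);
	}
	TAILQ_INSERT_TAIL(q, e, link);
	return (0);
}
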
- */ - asoc->fragmented_delivery_inprogress = 1; } - asoc->tsn_of_pdapi_last_delivered = chk->rec.data.TSN_seq; - asoc->last_flags_delivered = chk->rec.data.rcv_flags; - asoc->last_strm_seq_delivered = chk->rec.data.stream_seq; - asoc->last_strm_no_delivered = chk->rec.data.stream_number; + } + return (0); +} - asoc->tsn_last_delivered = chk->rec.data.TSN_seq; - asoc->size_on_reasm_queue -= chk->send_size; - sctp_ucount_decr(asoc->cnt_on_reasm_queue); - /* free up the chk */ - chk->data = NULL; - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); +static void +sctp_abort_in_reasm(struct sctp_tcb *stcb, + struct sctp_queued_to_read *control, + struct sctp_tmit_chunk *chk, + int *abort_flag, int opspot) +{ + char msg[SCTP_DIAG_INFO_LEN]; + struct mbuf *oper; + + if (stcb->asoc.idata_supported) { + snprintf(msg, sizeof(msg), + "Reass %x,CF:%x,TSN=%8.8x,SID=%4.4x,FSN=%8.8x,MID:%8.8x", + opspot, + control->fsn_included, + chk->rec.data.TSN_seq, + chk->rec.data.stream_number, + chk->rec.data.fsn_num, chk->rec.data.stream_seq); + } else { + snprintf(msg, sizeof(msg), + "Reass %x,CI:%x,TSN=%8.8x,SID=%4.4x,FSN=%4.4x,SSN:%4.4x", + opspot, + control->fsn_included, + chk->rec.data.TSN_seq, + chk->rec.data.stream_number, + chk->rec.data.fsn_num, + (uint16_t) chk->rec.data.stream_seq); + } + oper = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + sctp_m_freem(chk->data); + chk->data = NULL; + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1; + sctp_abort_an_association(stcb->sctp_ep, stcb, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; +} - if (asoc->fragmented_delivery_inprogress == 0) { - /* - * Now lets see if we can deliver the next one on - * the stream - */ - struct sctp_stream_in *strm; +static void +sctp_clean_up_control(struct sctp_tcb *stcb, struct sctp_queued_to_read *control) +{ + /* + * The control could not be placed and must be cleaned. + */ + struct sctp_tmit_chunk *chk, *nchk; - strm = &asoc->strmin[stream_no]; - nxt_todel = strm->last_sequence_delivered + 1; - TAILQ_FOREACH_SAFE(ctl, &strm->inqueue, next, nctl) { - /* Deliver more if we can. */ - if (nxt_todel == ctl->sinfo_ssn) { - TAILQ_REMOVE(&strm->inqueue, ctl, next); - asoc->size_on_all_streams -= ctl->length; - sctp_ucount_decr(asoc->cnt_on_all_streams); - strm->last_sequence_delivered++; - sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); - sctp_add_to_readq(stcb->sctp_ep, stcb, - ctl, - &stcb->sctp_socket->so_rcv, 1, - SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); - } else { - break; - } - nxt_todel = strm->last_sequence_delivered + 1; - } - break; - } + TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { + TAILQ_REMOVE(&control->reasm, chk, sctp_next); + if (chk->data) + sctp_m_freem(chk->data); + chk->data = NULL; + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } + sctp_free_a_readq(stcb, control); } /* * Queue the chunk either right into the socket buffer if it is the next one * to go OR put it in the correct place in the delivery queue. If we do - * append to the so_buf, keep doing so until we are out of order. One big - * question still remains, what to do when the socket buffer is FULL?? + * append to the so_buf, keep doing so until we are out of order as + * long as the control's entered are non-fragmented. 
*/ static void -sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_queued_to_read *control, int *abort_flag) +sctp_queue_data_to_stream(struct sctp_tcb *stcb, + struct sctp_stream_in *strm, + struct sctp_association *asoc, + struct sctp_queued_to_read *control, int *abort_flag, int *need_reasm) { /* * FIX-ME maybe? What happens when the ssn wraps? If we are getting * all the data in one stream this could happen quite rapidly. One * could use the TSN to keep track of things, but this scheme breaks - * down in the other type of stream useage that could occur. Send a + * down in the other type of stream usage that could occur. Send a * single msg to stream 0, send 4Billion messages to stream 1, now * send a message to stream 0. You have a situation where the TSN * has wrapped but not in the stream. Is this worth worrying about @@ -564,47 +497,57 @@ sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc, * SSN alone. Maybe a hybred approach is the answer * */ - struct sctp_stream_in *strm; struct sctp_queued_to_read *at; int queue_needed; - uint16_t nxt_todel; + uint32_t nxt_todel; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; - queue_needed = 1; - asoc->size_on_all_streams += control->length; - sctp_ucount_incr(asoc->cnt_on_all_streams); - strm = &asoc->strmin[control->sinfo_stream]; - nxt_todel = strm->last_sequence_delivered + 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INTO_STRD); } - SCTPDBG(SCTP_DEBUG_INDATA1, - "queue to stream called for ssn:%u lastdel:%u nxt:%u\n", - (uint32_t) control->sinfo_stream, - (uint32_t) strm->last_sequence_delivered, - (uint32_t) nxt_todel); - if (SCTP_SSN_GE(strm->last_sequence_delivered, control->sinfo_ssn)) { + if (SCTP_MSGID_GT((!asoc->idata_supported), strm->last_sequence_delivered, control->sinfo_ssn)) { /* The incoming sseq is behind where we last delivered? 
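The comparison above is serial-number arithmetic (SCTP_MSGID_GT, SCTP_SSN_GE and friends), so a wrapped 16-bit SSN is still ordered correctly. A sketch of the idea for the 16-bit case; the kernel macros differ in detail and also cover the 32-bit msg_id used once I-DATA is negotiated:

#include <stdint.h>

/* a is "greater" than b when the forward distance from b to a is
 * less than half the 16-bit sequence space. */
static int
ssn_gt(uint16_t a, uint16_t b)
{
	return (a != b && (uint16_t)(a - b) < 0x8000);
}

For example, ssn_gt(1, 65535) is true: 1 is two steps ahead of 65535 across the wrap.
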
*/ - SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ:%d delivered:%d from peer, Abort association\n", + SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ: %u delivered: %u from peer, Abort association\n", control->sinfo_ssn, strm->last_sequence_delivered); protocol_error: /* * throw it in the stream so it gets cleaned up in * association destruction */ - TAILQ_INSERT_HEAD(&strm->inqueue, control, next); + TAILQ_INSERT_HEAD(&strm->inqueue, control, next_instrm); snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", strm->last_sequence_delivered, control->sinfo_tsn, control->sinfo_stream, control->sinfo_ssn); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1; + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return; } + if ((SCTP_TSN_GE(asoc->cumulative_tsn, control->sinfo_tsn)) && (asoc->idata_supported == 0)) { + goto protocol_error; + } + queue_needed = 1; + asoc->size_on_all_streams += control->length; + sctp_ucount_incr(asoc->cnt_on_all_streams); + nxt_todel = strm->last_sequence_delivered + 1; if (nxt_todel == control->sinfo_ssn) { +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif /* can be delivered right away? */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL); @@ -614,19 +557,27 @@ protocol_error: asoc->size_on_all_streams -= control->length; sctp_ucount_decr(asoc->cnt_on_all_streams); strm->last_sequence_delivered++; - sctp_mark_non_revokable(asoc, control->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, - SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); - TAILQ_FOREACH_SAFE(control, &strm->inqueue, next, at) { + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_LOCKED); + TAILQ_FOREACH_SAFE(control, &strm->inqueue, next_instrm, at) { /* all delivered */ nxt_todel = strm->last_sequence_delivered + 1; - if (nxt_todel == control->sinfo_ssn) { - TAILQ_REMOVE(&strm->inqueue, control, next); + if ((nxt_todel == control->sinfo_ssn) && + (((control->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG)) { asoc->size_on_all_streams -= control->length; sctp_ucount_decr(asoc->cnt_on_all_streams); + if (control->on_strm_q == SCTP_ON_ORDERED) { + TAILQ_REMOVE(&strm->inqueue, control, next_instrm); +#ifdef INVARIANTS + } else { + panic("Huh control: %p is on_strm_q: %d", + control, control->on_strm_q); +#endif + } + control->on_strm_q = 0; strm->last_sequence_delivered++; /* * We ignore the return of deliver_data here @@ -643,650 +594,1051 @@ protocol_error: control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, - SCTP_SO_NOT_LOCKED); + SCTP_SO_LOCKED); continue; + } else if (nxt_todel == control->sinfo_ssn) { + *need_reasm = 1; } break; } +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif } if (queue_needed) { /* * Ok, we did not deliver this guy, find the correct place * to put it on the queue. 
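After delivering the expected SSN, the loop above keeps draining the stream queue while queued messages remain consecutive, complete, and non-fragmented. Its essence, reduced to a small function over a hypothetical singly linked representation:

#include <stdint.h>
#include <stddef.h>

struct rmsg {
	uint16_t ssn;
	int complete;		/* B and E bits both seen */
	struct rmsg *next;
};

static void
drain_in_order(struct rmsg **head, uint16_t *last_delivered,
    void (*deliver)(struct rmsg *))
{
	while (*head != NULL &&
	    (*head)->ssn == (uint16_t)(*last_delivered + 1) &&
	    (*head)->complete) {
		struct rmsg *c = *head;

		*head = c->next;
		(*last_delivered)++;
		deliver(c);
	}
}
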
*/ - if (SCTP_TSN_GE(asoc->cumulative_tsn, control->sinfo_tsn)) { - goto protocol_error; - } - if (TAILQ_EMPTY(&strm->inqueue)) { - /* Empty queue */ - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { - sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INSERT_HD); - } - TAILQ_INSERT_HEAD(&strm->inqueue, control, next); - } else { - TAILQ_FOREACH(at, &strm->inqueue, next) { - if (SCTP_SSN_GT(at->sinfo_ssn, control->sinfo_ssn)) { - /* - * one in queue is bigger than the - * new one, insert before this one - */ - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { - sctp_log_strm_del(control, at, - SCTP_STR_LOG_FROM_INSERT_MD); - } - TAILQ_INSERT_BEFORE(at, control, next); - break; - } else if (at->sinfo_ssn == control->sinfo_ssn) { - /* - * Gak, He sent me a duplicate str - * seq number - */ - /* - * foo bar, I guess I will just free - * this new guy, should we abort - * too? FIX ME MAYBE? Or it COULD be - * that the SSN's have wrapped. - * Maybe I should compare to TSN - * somehow... sigh for now just blow - * away the chunk! - */ - - if (control->data) - sctp_m_freem(control->data); - control->data = NULL; - asoc->size_on_all_streams -= control->length; - sctp_ucount_decr(asoc->cnt_on_all_streams); - if (control->whoFrom) { - sctp_free_remote_addr(control->whoFrom); - control->whoFrom = NULL; - } - sctp_free_a_readq(stcb, control); - return; - } else { - if (TAILQ_NEXT(at, next) == NULL) { - /* - * We are at the end, insert - * it after this one - */ - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { - sctp_log_strm_del(control, at, - SCTP_STR_LOG_FROM_INSERT_TL); - } - TAILQ_INSERT_AFTER(&strm->inqueue, - at, control, next); - break; - } - } - } + if (sctp_place_control_in_stream(strm, asoc, control)) { + snprintf(msg, sizeof(msg), + "Queue to str msg_id: %u duplicate", + control->msg_id); + sctp_clean_up_control(stcb, control); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3; + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; } } } -/* - * Returns two things: You get the total size of the deliverable parts of the - * first fragmented message on the reassembly queue. 
And you get a 1 back if - * all of the message is ready or a 0 back if the message is still incomplete - */ -static int -sctp_is_all_msg_on_reasm(struct sctp_association *asoc, uint32_t * t_size) -{ - struct sctp_tmit_chunk *chk; - uint32_t tsn; - *t_size = 0; - chk = TAILQ_FIRST(&asoc->reasmqueue); - if (chk == NULL) { - /* nothing on the queue */ - return (0); - } - if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { - /* Not a first on the queue */ - return (0); - } - tsn = chk->rec.data.TSN_seq; - TAILQ_FOREACH(chk, &asoc->reasmqueue, sctp_next) { - if (tsn != chk->rec.data.TSN_seq) { - return (0); +static void +sctp_setup_tail_pointer(struct sctp_queued_to_read *control) +{ + struct mbuf *m, *prev = NULL; + struct sctp_tcb *stcb; + + stcb = control->stcb; + control->held_length = 0; + control->length = 0; + m = control->data; + while (m) { + if (SCTP_BUF_LEN(m) == 0) { + /* Skip mbufs with NO length */ + if (prev == NULL) { + /* First one */ + control->data = sctp_m_free(m); + m = control->data; + } else { + SCTP_BUF_NEXT(prev) = sctp_m_free(m); + m = SCTP_BUF_NEXT(prev); + } + if (m == NULL) { + control->tail_mbuf = prev; + } + continue; } - *t_size += chk->send_size; - if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { - return (1); + prev = m; + atomic_add_int(&control->length, SCTP_BUF_LEN(m)); + if (control->on_read_q) { + /* + * On read queue so we must increment the SB stuff, + * we assume caller has done any locks of SB. + */ + sctp_sballoc(stcb, &stcb->sctp_socket->so_rcv, m); } - tsn++; + m = SCTP_BUF_NEXT(m); + } + if (prev) { + control->tail_mbuf = prev; } - return (0); } static void -sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc) +sctp_add_to_tail_pointer(struct sctp_queued_to_read *control, struct mbuf *m) { - struct sctp_tmit_chunk *chk; - uint16_t nxt_todel; - uint32_t tsize, pd_point; - -doit_again: - chk = TAILQ_FIRST(&asoc->reasmqueue); - if (chk == NULL) { - /* Huh? */ - asoc->size_on_reasm_queue = 0; - asoc->cnt_on_reasm_queue = 0; + struct mbuf *prev = NULL; + struct sctp_tcb *stcb; + + stcb = control->stcb; + if (stcb == NULL) { +#ifdef INVARIANTS + panic("Control broken"); +#else return; +#endif } - if (asoc->fragmented_delivery_inprogress == 0) { - nxt_todel = - asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; - if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && - (nxt_todel == chk->rec.data.stream_seq || - (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { - /* - * Yep the first one is here and its ok to deliver - * but should we? - */ - if (stcb->sctp_socket) { - pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT, - stcb->sctp_ep->partial_delivery_point); + if (control->tail_mbuf == NULL) { + /* TSNH */ + control->data = m; + sctp_setup_tail_pointer(control); + return; + } + control->tail_mbuf->m_next = m; + while (m) { + if (SCTP_BUF_LEN(m) == 0) { + /* Skip mbufs with NO length */ + if (prev == NULL) { + /* First one */ + control->tail_mbuf->m_next = sctp_m_free(m); + m = control->tail_mbuf->m_next; } else { - pd_point = stcb->sctp_ep->partial_delivery_point; + SCTP_BUF_NEXT(prev) = sctp_m_free(m); + m = SCTP_BUF_NEXT(prev); } - if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) { - /* - * Yes, we setup to start reception, by - * backing down the TSN just in case we - * can't deliver. 
If we - */ - asoc->fragmented_delivery_inprogress = 1; - asoc->tsn_last_delivered = - chk->rec.data.TSN_seq - 1; - asoc->str_of_pdapi = - chk->rec.data.stream_number; - asoc->ssn_of_pdapi = chk->rec.data.stream_seq; - asoc->pdapi_ppid = chk->rec.data.payloadtype; - asoc->fragment_flags = chk->rec.data.rcv_flags; - sctp_service_reassembly(stcb, asoc); + if (m == NULL) { + control->tail_mbuf = prev; } + continue; } - } else { - /* - * Service re-assembly will deliver stream data queued at - * the end of fragmented delivery.. but it wont know to go - * back and call itself again... we do that here with the - * got doit_again - */ - sctp_service_reassembly(stcb, asoc); - if (asoc->fragmented_delivery_inprogress == 0) { + prev = m; + if (control->on_read_q) { /* - * finished our Fragmented delivery, could be more - * waiting? + * On read queue so we must increment the SB stuff, + * we assume caller has done any locks of SB. */ - goto doit_again; + sctp_sballoc(stcb, &stcb->sctp_socket->so_rcv, m); } + atomic_add_int(&control->length, SCTP_BUF_LEN(m)); + m = SCTP_BUF_NEXT(m); + } + if (prev) { + control->tail_mbuf = prev; } } -/* - * Dump onto the re-assembly queue, in its proper place. After dumping on the - * queue, see if anthing can be delivered. If so pull it off (or as much as - * we can. If we run out of space then we must dump what we can and set the - * appropriate flag to say we queued what we could. - */ static void -sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc, - struct sctp_tmit_chunk *chk, int *abort_flag) +sctp_build_readq_entry_from_ctl(struct sctp_queued_to_read *nc, struct sctp_queued_to_read *control) { - struct mbuf *op_err; - char msg[SCTP_DIAG_INFO_LEN]; - uint32_t cum_ackp1, prev_tsn, post_tsn; - struct sctp_tmit_chunk *at, *prev, *next; - - prev = next = NULL; - cum_ackp1 = asoc->tsn_last_delivered + 1; - if (TAILQ_EMPTY(&asoc->reasmqueue)) { - /* This is the first one on the queue */ - TAILQ_INSERT_HEAD(&asoc->reasmqueue, chk, sctp_next); + memset(nc, 0, sizeof(struct sctp_queued_to_read)); + nc->sinfo_stream = control->sinfo_stream; + nc->sinfo_ssn = control->sinfo_ssn; + TAILQ_INIT(&nc->reasm); + nc->top_fsn = control->top_fsn; + nc->msg_id = control->msg_id; + nc->sinfo_flags = control->sinfo_flags; + nc->sinfo_ppid = control->sinfo_ppid; + nc->sinfo_context = control->sinfo_context; + nc->fsn_included = 0xffffffff; + nc->sinfo_tsn = control->sinfo_tsn; + nc->sinfo_cumtsn = control->sinfo_cumtsn; + nc->sinfo_assoc_id = control->sinfo_assoc_id; + nc->whoFrom = control->whoFrom; + atomic_add_int(&nc->whoFrom->ref_count, 1); + nc->stcb = control->stcb; + nc->port_from = control->port_from; +} + +static void +sctp_reset_a_control(struct sctp_queued_to_read *control, + struct sctp_inpcb *inp, uint32_t tsn) +{ + control->fsn_included = tsn; + if (control->on_read_q) { /* - * we do not check for delivery of anything when only one - * fragment is here + * We have to purge it from there, hopefully this will work + * :-) */ - asoc->size_on_reasm_queue = chk->send_size; - sctp_ucount_incr(asoc->cnt_on_reasm_queue); - if (chk->rec.data.TSN_seq == cum_ackp1) { - if (asoc->fragmented_delivery_inprogress == 0 && - (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) != - SCTP_DATA_FIRST_FRAG) { - /* - * An empty queue, no delivery inprogress, - * we hit the next one and it does NOT have - * a FIRST fragment mark. 
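The validation above revolves around the B (beginning) and E (ending) bits in the DATA chunk flags: B marks the first fragment, E the last, and both together an unfragmented message, which is what the SCTP_DATA_NOT_FRAG test used elsewhere in this patch expands to. A compact restatement, with the flag values as defined in netinet/sctp.h:

#include <stdint.h>

#define E_BIT 0x01	/* SCTP_DATA_LAST_FRAG  */
#define B_BIT 0x02	/* SCTP_DATA_FIRST_FRAG */

static int
is_first_frag(uint8_t flags)
{
	return ((flags & B_BIT) != 0);
}

static int
is_unfragmented(uint8_t flags)
{
	return ((flags & (B_BIT | E_BIT)) == (B_BIT | E_BIT));
}
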
- */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not first, no fragmented delivery in progress\n"); - snprintf(msg, sizeof(msg), - "Expected B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - } else if (asoc->fragmented_delivery_inprogress && - (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { - /* - * We are doing a partial delivery and the - * NEXT chunk MUST be either the LAST or - * MIDDLE fragment NOT a FIRST - */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS a first and fragmented delivery in progress\n"); - snprintf(msg, sizeof(msg), - "Didn't expect B-bit for TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - } else if (asoc->fragmented_delivery_inprogress) { - /* - * Here we are ok with a MIDDLE or LAST - * piece - */ - if (chk->rec.data.stream_number != - asoc->str_of_pdapi) { - /* Got to be the right STR No */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream number %d vs %d\n", - chk->rec.data.stream_number, - asoc->str_of_pdapi); - snprintf(msg, sizeof(msg), - "Expected SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - asoc->str_of_pdapi, - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - } else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) != - SCTP_DATA_UNORDERED && - chk->rec.data.stream_seq != asoc->ssn_of_pdapi) { - /* Got to be the right STR Seq */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream seq %d vs %d\n", - chk->rec.data.stream_seq, - asoc->ssn_of_pdapi); - snprintf(msg, sizeof(msg), - "Expected SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - asoc->ssn_of_pdapi, - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; + TAILQ_REMOVE(&inp->read_queue, control, next); + control->on_read_q = 0; + } +} + +static int +sctp_handle_old_unordered_data(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_stream_in *strm, + struct sctp_queued_to_read *control, + uint32_t pd_point, + int inp_read_lock_held) +{ + /* + * Special handling for the old un-ordered data chunk. All the + * chunks/TSN's go to msg_id 0. So we have to do the old style + * watching to see if we have it all. If you return one, no other + * control entries on the un-ordered queue will be looked at. In + * theory there should be no others entries in reality, unless the + * guy is sending both unordered NDATA and unordered DATA... 
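sctp_handle_old_unordered_data() collapses reassembly chunks onto the control for as long as their fragment sequence numbers stay consecutive, starting at fsn_included + 1. The core of that collapse, separated from the readq and wakeup bookkeeping; the types and merge callback are hypothetical:

#include <stdint.h>
#include <stddef.h>

struct frag {
	uint32_t fsn;
	struct frag *next;
};

/* Consume consecutive fragments after *included; returns the count. */
static int
collapse_fsn_run(struct frag **head, uint32_t *included,
    void (*merge)(struct frag *))
{
	int added = 0;

	while (*head != NULL && (*head)->fsn == *included + 1) {
		struct frag *f = *head;

		*head = f->next;
		(*included)++;
		merge(f);
		added++;
	}
	return (added);
}
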
+ */ + struct sctp_tmit_chunk *chk, *lchk, *tchk; + uint32_t fsn; + struct sctp_queued_to_read *nc; + int cnt_added; + + if (control->first_frag_seen == 0) { + /* Nothing we can do, we have not seen the first piece yet */ + return (1); + } + /* Collapse any we can */ + cnt_added = 0; +restart: + fsn = control->fsn_included + 1; + /* Now what can we add? */ + TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, lchk) { + if (chk->rec.data.fsn_num == fsn) { + /* Ok lets add it */ + sctp_alloc_a_readq(stcb, nc); + if (nc == NULL) { + break; + } + memset(nc, 0, sizeof(struct sctp_queued_to_read)); + TAILQ_REMOVE(&control->reasm, chk, sctp_next); + sctp_add_chk_to_control(control, strm, stcb, asoc, chk, SCTP_READ_LOCK_NOT_HELD); + fsn++; + cnt_added++; + chk = NULL; + if (control->end_added) { + /* We are done */ + if (!TAILQ_EMPTY(&control->reasm)) { + /* + * Ok we have to move anything left + * on the control queue to a new + * control. + */ + sctp_build_readq_entry_from_ctl(nc, control); + tchk = TAILQ_FIRST(&control->reasm); + if (tchk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + TAILQ_REMOVE(&control->reasm, tchk, sctp_next); + nc->first_frag_seen = 1; + nc->fsn_included = tchk->rec.data.fsn_num; + nc->data = tchk->data; + nc->sinfo_ppid = tchk->rec.data.payloadtype; + nc->sinfo_tsn = tchk->rec.data.TSN_seq; + sctp_mark_non_revokable(asoc, tchk->rec.data.TSN_seq); + tchk->data = NULL; + sctp_free_a_chunk(stcb, tchk, SCTP_SO_NOT_LOCKED); + sctp_setup_tail_pointer(nc); + tchk = TAILQ_FIRST(&control->reasm); + } + /* Spin the rest onto the queue */ + while (tchk) { + TAILQ_REMOVE(&control->reasm, tchk, sctp_next); + TAILQ_INSERT_TAIL(&nc->reasm, tchk, sctp_next); + tchk = TAILQ_FIRST(&control->reasm); + } + /* + * Now lets add it to the queue + * after removing control + */ + TAILQ_INSERT_TAIL(&strm->uno_inqueue, nc, next_instrm); + nc->on_strm_q = SCTP_ON_UNORDERED; + if (control->on_strm_q) { + TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm); + control->on_strm_q = 0; + } + } + if (control->pdapi_started) { + strm->pd_api_started = 0; + control->pdapi_started = 0; } + if (control->on_strm_q) { + TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm); + control->on_strm_q = 0; + SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); + } + if (control->on_read_q == 0) { + sctp_add_to_readq(stcb->sctp_ep, stcb, control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + } + sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + if ((nc->first_frag_seen) && !TAILQ_EMPTY(&nc->reasm)) { + /* + * Switch to the new guy and + * continue + */ + control = nc; + goto restart; + } else { + if (nc->on_strm_q == 0) { + sctp_free_a_readq(stcb, nc); + } + } + return (1); + } else { + sctp_free_a_readq(stcb, nc); } + } else { + /* Can't add more */ + break; } + } + if ((control->length > pd_point) && (strm->pd_api_started == 0)) { + strm->pd_api_started = 1; + control->pdapi_started = 1; + sctp_add_to_readq(stcb->sctp_ep, stcb, control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + return (0); + } else { + return (1); + } +} + +static void +sctp_inject_old_unordered_data(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_queued_to_read *control, + struct sctp_tmit_chunk *chk, + int *abort_flag) +{ + struct sctp_tmit_chunk *at; + int inserted; + + /* + * Here we need to place the chunk into the control structure 
sorted + * in the correct order. + */ + if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + /* Its the very first one. */ + SCTPDBG(SCTP_DEBUG_XXX, + "chunk is a first fsn: %u becomes fsn_included\n", + chk->rec.data.fsn_num); + if (control->first_frag_seen) { + /* + * In old un-ordered we can reassembly on one + * control multiple messages. As long as the next + * FIRST is greater then the old first (TSN i.e. FSN + * wise) + */ + struct mbuf *tdata; + uint32_t tmp; + + if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->fsn_included)) { + /* + * Easy way the start of a new guy beyond + * the lowest + */ + goto place_chunk; + } + if ((chk->rec.data.fsn_num == control->fsn_included) || + (control->pdapi_started)) { + /* + * Ok this should not happen, if it does we + * started the pd-api on the higher TSN + * (since the equals part is a TSN failure + * it must be that). + * + * We are completly hosed in that case since I + * have no way to recover. This really will + * only happen if we can get more TSN's + * higher before the pd-api-point. + */ + sctp_abort_in_reasm(stcb, control, chk, + abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_4); + + return; + } + /* + * Ok we have two firsts and the one we just got is + * smaller than the one we previously placed.. yuck! + * We must swap them out. + */ + /* swap the mbufs */ + tdata = control->data; + control->data = chk->data; + chk->data = tdata; + /* Save the lengths */ + chk->send_size = control->length; + /* Recompute length of control and tail pointer */ + sctp_setup_tail_pointer(control); + /* Fix the FSN included */ + tmp = control->fsn_included; + control->fsn_included = chk->rec.data.fsn_num; + chk->rec.data.fsn_num = tmp; + /* Fix the TSN included */ + tmp = control->sinfo_tsn; + control->sinfo_tsn = chk->rec.data.TSN_seq; + chk->rec.data.TSN_seq = tmp; + /* Fix the PPID included */ + tmp = control->sinfo_ppid; + control->sinfo_ppid = chk->rec.data.payloadtype; + chk->rec.data.payloadtype = tmp; + /* Fix tail pointer */ + goto place_chunk; + } + control->first_frag_seen = 1; + control->top_fsn = control->fsn_included = chk->rec.data.fsn_num; + control->sinfo_tsn = chk->rec.data.TSN_seq; + control->sinfo_ppid = chk->rec.data.payloadtype; + control->data = chk->data; + sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); + chk->data = NULL; + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + sctp_setup_tail_pointer(control); return; } - /* Find its place */ - TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { - if (SCTP_TSN_GT(at->rec.data.TSN_seq, chk->rec.data.TSN_seq)) { +place_chunk: + inserted = 0; + TAILQ_FOREACH(at, &control->reasm, sctp_next) { + if (SCTP_TSN_GT(at->rec.data.fsn_num, chk->rec.data.fsn_num)) { /* - * one in queue is bigger than the new one, insert - * before this one + * This one in queue is bigger than the new one, + * insert the new one before at. */ - /* A check */ asoc->size_on_reasm_queue += chk->send_size; sctp_ucount_incr(asoc->cnt_on_reasm_queue); - next = at; + inserted = 1; TAILQ_INSERT_BEFORE(at, chk, sctp_next); break; - } else if (at->rec.data.TSN_seq == chk->rec.data.TSN_seq) { - /* Gak, He sent me a duplicate str seq number */ + } else if (at->rec.data.fsn_num == chk->rec.data.fsn_num) { /* - * foo bar, I guess I will just free this new guy, - * should we abort too? FIX ME MAYBE? Or it COULD be - * that the SSN's have wrapped. Maybe I should - * compare to TSN somehow... sigh for now just blow - * away the chunk! + * They sent a duplicate fsn number. 
This really + * should not happen since the FSN is a TSN and it + * should have been dropped earlier. */ - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; - } - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + sctp_abort_in_reasm(stcb, control, chk, + abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_5); return; + } + } + if (inserted == 0) { + /* Its at the end */ + asoc->size_on_reasm_queue += chk->send_size; + sctp_ucount_incr(asoc->cnt_on_reasm_queue); + control->top_fsn = chk->rec.data.fsn_num; + TAILQ_INSERT_TAIL(&control->reasm, chk, sctp_next); + } +} + +static int +sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc, + struct sctp_stream_in *strm, int inp_read_lock_held) +{ + /* + * Given a stream, strm, see if any of the SSN's on it that are + * fragmented are ready to deliver. If so go ahead and place them on + * the read queue. In so placing if we have hit the end, then we + * need to remove them from the stream's queue. + */ + struct sctp_queued_to_read *control, *nctl = NULL; + uint32_t next_to_del; + uint32_t pd_point; + int ret = 0; + + if (stcb->sctp_socket) { + pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT, + stcb->sctp_ep->partial_delivery_point); + } else { + pd_point = stcb->sctp_ep->partial_delivery_point; + } + control = TAILQ_FIRST(&strm->uno_inqueue); + + if ((control) && + (asoc->idata_supported == 0)) { + /* Special handling needed for "old" data format */ + if (sctp_handle_old_unordered_data(stcb, asoc, strm, control, pd_point, inp_read_lock_held)) { + goto done_un; + } + } + if (strm->pd_api_started) { + /* Can't add more */ + return (0); + } + while (control) { + SCTPDBG(SCTP_DEBUG_XXX, "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u -uo\n", + control, control->end_added, control->sinfo_ssn, control->top_fsn, control->fsn_included); + nctl = TAILQ_NEXT(control, next_instrm); + if (control->end_added) { + /* We just put the last bit on */ + if (control->on_strm_q) { +#ifdef INVARIANTS + if (control->on_strm_q != SCTP_ON_UNORDERED) { + panic("Huh control: %p on_q: %d -- not unordered?", + control, control->on_strm_q); + } +#endif + SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); + TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm); + control->on_strm_q = 0; + } + if (control->on_read_q == 0) { + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + } } else { - prev = at; - if (TAILQ_NEXT(at, sctp_next) == NULL) { - /* - * We are at the end, insert it after this - * one - */ - /* check it first */ - asoc->size_on_reasm_queue += chk->send_size; - sctp_ucount_incr(asoc->cnt_on_reasm_queue); - TAILQ_INSERT_AFTER(&asoc->reasmqueue, at, chk, sctp_next); + /* Can we do a PD-API for this un-ordered guy? */ + if ((control->length >= pd_point) && (strm->pd_api_started == 0)) { + strm->pd_api_started = 1; + control->pdapi_started = 1; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + break; } } + control = nctl; } - /* Now the audits */ - if (prev) { - prev_tsn = chk->rec.data.TSN_seq - 1; - if (prev_tsn == prev->rec.data.TSN_seq) { - /* - * Ok the one I am dropping onto the end is the - * NEXT. A bit of valdiation here. 
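sctp_deliver_reasm_check() above derives its partial-delivery threshold from whichever is smaller: the socket receive limit scaled down by SCTP_PARTIAL_DELIVERY_SHIFT, or the endpoint's configured partial_delivery_point. A worked sketch; the numeric values in the trailing comment are made up for illustration, and the real shift constant lives in sctp_constants.h:

#include <stdint.h>

static uint32_t
pd_threshold(uint32_t sb_limit, uint32_t shift, uint32_t configured)
{
	uint32_t scaled = sb_limit >> shift;

	return (scaled < configured ? scaled : configured);
}

/* e.g. pd_threshold(262144, 4, 65536) == 16384: a fragmented message
 * is handed to the reader once that many bytes have been queued. */
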
- */ - if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_FIRST_FRAG || - (prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_MIDDLE_FRAG) { - /* - * Insert chk MUST be a MIDDLE or LAST - * fragment - */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_FIRST_FRAG) { - SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - It can be a midlle or last but not a first\n"); - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it's a FIRST!\n"); - snprintf(msg, sizeof(msg), - "Can't handle B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; - } - if (chk->rec.data.stream_number != - prev->rec.data.stream_number) { - /* - * Huh, need the correct STR here, - * they must be the same. - */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sid:%d not the same as at:%d\n", - chk->rec.data.stream_number, - prev->rec.data.stream_number); - snprintf(msg, sizeof(msg), - "Expect SID=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - prev->rec.data.stream_number, - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; +done_un: + control = TAILQ_FIRST(&strm->inqueue); + if (strm->pd_api_started) { + /* Can't add more */ + return (0); + } + if (control == NULL) { + return (ret); + } + if (strm->last_sequence_delivered == control->sinfo_ssn) { + /* + * Ok the guy at the top was being partially delivered + * completed, so we remove it. Note the pd_api flag was + * taken off when the chunk was merged on in + * sctp_queue_data_for_reasm below. + */ + nctl = TAILQ_NEXT(control, next_instrm); + SCTPDBG(SCTP_DEBUG_XXX, + "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u (lastdel: %u)- o\n", + control, control->end_added, control->sinfo_ssn, + control->top_fsn, control->fsn_included, + strm->last_sequence_delivered); + if (control->end_added) { + if (control->on_strm_q) { +#ifdef INVARIANTS + if (control->on_strm_q != SCTP_ON_ORDERED) { + panic("Huh control: %p on_q: %d -- not ordered?", + control, control->on_strm_q); } - if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != - (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { - /* - * Huh, need the same ordering here, - * they must be the same. - */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, U-bit not constant\n"); - snprintf(msg, sizeof(msg), - "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d", - (prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0, - chk->rec.data.TSN_seq, - (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 
1 : 0); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; +#endif + SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); + TAILQ_REMOVE(&strm->inqueue, control, next_instrm); + control->on_strm_q = 0; + } + if (strm->pd_api_started && control->pdapi_started) { + control->pdapi_started = 0; + strm->pd_api_started = 0; + } + if (control->on_read_q == 0) { + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + } + control = nctl; + } + } + if (strm->pd_api_started) { + /* + * Can't add more must have gotten an un-ordered above being + * partially delivered. + */ + return (0); + } +deliver_more: + next_to_del = strm->last_sequence_delivered + 1; + if (control) { + SCTPDBG(SCTP_DEBUG_XXX, + "Looking at control: %p e(%d) ssn: %u top_fsn: %u inc_fsn: %u (nxtdel: %u)- o\n", + control, control->end_added, control->sinfo_ssn, control->top_fsn, control->fsn_included, + next_to_del); + nctl = TAILQ_NEXT(control, next_instrm); + if ((control->sinfo_ssn == next_to_del) && + (control->first_frag_seen)) { + int done; + + /* Ok we can deliver it onto the stream. */ + if (control->end_added) { + /* We are done with it afterwards */ + if (control->on_strm_q) { +#ifdef INVARIANTS + if (control->on_strm_q != SCTP_ON_ORDERED) { + panic("Huh control: %p on_q: %d -- not ordered?", + control, control->on_strm_q); + } +#endif + SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); + TAILQ_REMOVE(&strm->inqueue, control, next_instrm); + control->on_strm_q = 0; } - if ((prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 && - chk->rec.data.stream_seq != - prev->rec.data.stream_seq) { + ret++; + } + if (((control->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + /* + * A singleton now slipping through - mark + * it non-revokable too + */ + sctp_mark_non_revokable(asoc, control->sinfo_tsn); + } else if (control->end_added == 0) { + /* + * Check if we can defer adding until its + * all there + */ + if ((control->length < pd_point) || (strm->pd_api_started)) { /* - * Huh, need the correct STR here, - * they must be the same. 
+ * Don't need it or cannot add more + * (one being delivered that way) */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sseq:%d not the same as at:%d\n", - chk->rec.data.stream_seq, - prev->rec.data.stream_seq); - snprintf(msg, sizeof(msg), - "Expect SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - prev->rec.data.stream_seq, - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; - } - } else if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_LAST_FRAG) { - /* Insert chk MUST be a FIRST */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != - SCTP_DATA_FIRST_FRAG) { - SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, evil plot, its not FIRST and it must be!\n"); - snprintf(msg, sizeof(msg), - "Expect B-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; + goto out; } } + done = (control->end_added) && (control->last_frag_seen); + if (control->on_read_q == 0) { + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, control->end_added, + inp_read_lock_held, SCTP_SO_NOT_LOCKED); + } + strm->last_sequence_delivered = next_to_del; + if (done) { + control = nctl; + goto deliver_more; + } else { + /* We are now doing PD API */ + strm->pd_api_started = 1; + control->pdapi_started = 1; + } + } + } +out: + return (ret); +} + + +void +sctp_add_chk_to_control(struct sctp_queued_to_read *control, + struct sctp_stream_in *strm, + struct sctp_tcb *stcb, struct sctp_association *asoc, + struct sctp_tmit_chunk *chk, int hold_rlock) +{ + /* + * Given a control and a chunk, merge the data from the chk onto the + * control and free up the chunk resources. + */ + int i_locked = 0; + + if (control->on_read_q && (hold_rlock == 0)) { + /* + * Its being pd-api'd so we must do some locks. 
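The deferral above is driven by the partial-delivery point: an incomplete message at the head of the stream is only pushed to the socket once it holds enough data to be worth waking the reader. As a rough standalone model of that decision (the constant, names and helper here are illustrative, not the kernel's):

    #include <stdint.h>

    #define PD_SHIFT 1	/* illustrative stand-in for SCTP_PARTIAL_DELIVERY_SHIFT */

    enum pd_action { PD_DEFER, PD_START_PDAPI, PD_DELIVER_WHOLE };

    /*
     * Complete messages go up whole; an incomplete one may start the
     * partial-delivery API once it reaches pd_point bytes and no other
     * partial delivery is already running on this stream.
     */
    static enum pd_action
    pd_decision(uint32_t queued_len, int end_seen, int pd_active,
        uint32_t so_rcv_limit, uint32_t ep_pd_point)
    {
    	uint32_t pd_point = so_rcv_limit >> PD_SHIFT;

    	if (ep_pd_point < pd_point)
    		pd_point = ep_pd_point;
    	if (end_seen)
    		return (PD_DELIVER_WHOLE);
    	if (pd_active || queued_len < pd_point)
    		return (PD_DEFER);
    	return (PD_START_PDAPI);
    }
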
+ */
+ SCTP_INP_READ_LOCK(stcb->sctp_ep);
+ i_locked = 1;
+ }
+ if (control->data == NULL) {
+ control->data = chk->data;
+ sctp_setup_tail_pointer(control);
+ } else {
+ sctp_add_to_tail_pointer(control, chk->data);
+ }
+ control->fsn_included = chk->rec.data.fsn_num;
+ asoc->size_on_reasm_queue -= chk->send_size;
+ sctp_ucount_decr(asoc->cnt_on_reasm_queue);
+ sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq);
+ chk->data = NULL;
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ control->first_frag_seen = 1;
+ }
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ /* It's complete */
+ if ((control->on_strm_q) && (control->on_read_q)) {
+ if (control->pdapi_started) {
+ control->pdapi_started = 0;
+ strm->pd_api_started = 0;
+ }
+ if (control->on_strm_q == SCTP_ON_UNORDERED) {
+ /* Unordered */
+ TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+ } else if (control->on_strm_q == SCTP_ON_ORDERED) {
+ /* Ordered */
+ TAILQ_REMOVE(&strm->inqueue, control, next_instrm);
+ control->on_strm_q = 0;
+#ifdef INVARIANTS
+ } else if (control->on_strm_q) {
+ panic("Unknown state on ctrl: %p on_strm_q: %d", control,
+ control->on_strm_q);
+#endif
+ }
+ }
+ control->end_added = 1;
+ control->last_frag_seen = 1;
+ }
+ if (i_locked) {
+ SCTP_INP_READ_UNLOCK(stcb->sctp_ep);
+ }
+ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED);
+}
+
+/*
+ * Dump onto the re-assembly queue, in its proper place. After dumping on the
+ * queue, see if anything can be delivered. If so pull it off (or as much as
+ * we can). If we run out of space then we must dump what we can and set the
+ * appropriate flag to say we queued what we could.
+ */
+static void
+sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc,
+ struct sctp_stream_in *strm,
+ struct sctp_queued_to_read *control,
+ struct sctp_tmit_chunk *chk,
+ int created_control,
+ int *abort_flag, uint32_t tsn)
+{
+ uint32_t next_fsn;
+ struct sctp_tmit_chunk *at, *nat;
+ int do_wakeup, unordered;
+
+ /*
+ * For old un-ordered data chunks.
+ */
+ if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) {
+ unordered = 1;
+ } else {
+ unordered = 0;
+ }
+ /* Must be added to the stream-in queue */
+ if (created_control) {
+ if (sctp_place_control_in_stream(strm, asoc, control)) {
+ /* Duplicate SSN? */
+ sctp_clean_up_control(stcb, control);
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_6);
+ return;
+ }
+ if ((tsn == (asoc->cumulative_tsn + 1) && (asoc->idata_supported == 0))) {
+ /*
+ * Ok, we created this control and now let's validate
+ * that it's legal, i.e. there is a B bit set; if not
+ * and we have up to the cum-ack then it's invalid.
+ */
+ if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) {
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_7);
+ return;
+ }
 }
 }
- if (next) {
- post_tsn = chk->rec.data.TSN_seq + 1;
- if (post_tsn == next->rec.data.TSN_seq) {
+ if ((asoc->idata_supported == 0) && (unordered == 1)) {
+ sctp_inject_old_unordered_data(stcb, asoc, control, chk, abort_flag);
+ return;
+ }
+ /*
+ * Ok, we must queue the chunk into the reassembly portion: o if it's
+ * the first it goes to the control mbuf. o if it's not first but the
+ * next in sequence it goes to the control, and each succeeding one
+ * in order also goes. o if it's not in order we place it on the list
+ * in its place.
+ */
+ if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) {
+ /* It's the very first one.
*/ + SCTPDBG(SCTP_DEBUG_XXX, + "chunk is a first fsn: %u becomes fsn_included\n", + chk->rec.data.fsn_num); + if (control->first_frag_seen) { /* - * Ok the one I am inserting ahead of is my NEXT - * one. A bit of valdiation here. + * Error on senders part, they either sent us two + * data chunks with FIRST, or they sent two + * un-ordered chunks that were fragmented at the + * same time in the same stream. */ - if (next->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { - /* Insert chk MUST be a last fragment */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) - != SCTP_DATA_LAST_FRAG) { - SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is FIRST, we must be LAST\n"); - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not a last!\n"); - snprintf(msg, sizeof(msg), - "Expect only E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; - } - } else if ((next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_MIDDLE_FRAG || - (next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_LAST_FRAG) { + sctp_abort_in_reasm(stcb, control, chk, + abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_8); + return; + } + control->first_frag_seen = 1; + control->fsn_included = chk->rec.data.fsn_num; + control->data = chk->data; + sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); + chk->data = NULL; + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + sctp_setup_tail_pointer(control); + } else { + /* Place the chunk in our list */ + int inserted = 0; + + if (control->last_frag_seen == 0) { + /* Still willing to raise highest FSN seen */ + if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->top_fsn)) { + SCTPDBG(SCTP_DEBUG_XXX, + "We have a new top_fsn: %u\n", + chk->rec.data.fsn_num); + control->top_fsn = chk->rec.data.fsn_num; + } + if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { + SCTPDBG(SCTP_DEBUG_XXX, + "The last fsn is now in place fsn: %u\n", + chk->rec.data.fsn_num); + control->last_frag_seen = 1; + } + if (asoc->idata_supported || control->first_frag_seen) { /* - * Insert chk CAN be MIDDLE or FIRST NOT - * LAST + * For IDATA we always check since we know + * that the first fragment is 0. For old + * DATA we have to receive the first before + * we know the first FSN (which is the TSN). */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == - SCTP_DATA_LAST_FRAG) { - SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is a MIDDLE/LAST\n"); - SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, new prev chunk is a LAST\n"); - snprintf(msg, sizeof(msg), - "Didn't expect E-bit, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; - } - if (chk->rec.data.stream_number != - next->rec.data.stream_number) { + if (SCTP_TSN_GE(control->fsn_included, chk->rec.data.fsn_num)) { /* - * Huh, need the correct STR here, - * they must be the same. 
+ * We have already delivered up to
+ * this so it's a dup
 */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, ssn:%d not the same as at:%d\n",
- chk->rec.data.stream_number,
- next->rec.data.stream_number);
- snprintf(msg, sizeof(msg),
- "Required SID %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x",
- next->rec.data.stream_number,
- chk->rec.data.TSN_seq,
- chk->rec.data.stream_number,
- chk->rec.data.stream_seq);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_9);
 return;
 }
- if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) !=
- (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) {
+ }
+ } else {
+ if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) {
+ /* Second last? huh? */
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "Duplicate last fsn: %u (top: %u) -- abort\n",
+ chk->rec.data.fsn_num, control->top_fsn);
+ sctp_abort_in_reasm(stcb, control,
+ chk, abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_10);
+ return;
+ }
+ if (asoc->idata_supported || control->first_frag_seen) {
+ /*
+ * For IDATA we always check since we know
+ * that the first fragment is 0. For old
+ * DATA we have to receive the first before
+ * we know the first FSN (which is the TSN).
+ */
+
+ if (SCTP_TSN_GE(control->fsn_included, chk->rec.data.fsn_num)) {
 /*
- * Huh, need the same ordering here,
- * they must be the same.
+ * We have already delivered up to
+ * this so it's a dup
 */
- SCTPDBG(SCTP_DEBUG_INDATA1, "Next check - Gak, Evil plot, U-bit not constant\n");
- snprintf(msg, sizeof(msg),
- "Expect U-bit=%d for TSN=%8.8x, got U-bit=%d",
- (next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0,
- chk->rec.data.TSN_seq,
- (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) ? 1 : 0);
- op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
- stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12;
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
- *abort_flag = 1;
+ SCTPDBG(SCTP_DEBUG_XXX,
+ "New fsn: %u is already seen in included_fsn: %u -- abort\n",
+ chk->rec.data.fsn_num, control->fsn_included);
+ sctp_abort_in_reasm(stcb, control, chk,
+ abort_flag,
+ SCTP_FROM_SCTP_INDATA + SCTP_LOC_11);
 return;
 }
- if ((next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 &&
- chk->rec.data.stream_seq !=
- next->rec.data.stream_seq) {
- /*
- * Huh, need the correct STR here,
- * they must be the same.
- */ - SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, sseq:%d not the same as at:%d\n", - chk->rec.data.stream_seq, - next->rec.data.stream_seq); - snprintf(msg, sizeof(msg), - "Required SSN %4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - next->rec.data.stream_seq, - chk->rec.data.TSN_seq, - chk->rec.data.stream_number, - chk->rec.data.stream_seq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - return; + } + /* + * validate not beyond top FSN if we have seen last + * one + */ + if (SCTP_TSN_GT(chk->rec.data.fsn_num, control->top_fsn)) { + SCTPDBG(SCTP_DEBUG_XXX, + "New fsn: %u is beyond or at top_fsn: %u -- abort\n", + chk->rec.data.fsn_num, + control->top_fsn); + sctp_abort_in_reasm(stcb, control, chk, + abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_12); + return; + } + } + /* + * If we reach here, we need to place the new chunk in the + * reassembly for this control. + */ + SCTPDBG(SCTP_DEBUG_XXX, + "chunk is a not first fsn: %u needs to be inserted\n", + chk->rec.data.fsn_num); + TAILQ_FOREACH(at, &control->reasm, sctp_next) { + if (SCTP_TSN_GT(at->rec.data.fsn_num, chk->rec.data.fsn_num)) { + /* + * This one in queue is bigger than the new + * one, insert the new one before at. + */ + SCTPDBG(SCTP_DEBUG_XXX, + "Insert it before fsn: %u\n", + at->rec.data.fsn_num); + asoc->size_on_reasm_queue += chk->send_size; + sctp_ucount_incr(asoc->cnt_on_reasm_queue); + TAILQ_INSERT_BEFORE(at, chk, sctp_next); + inserted = 1; + break; + } else if (at->rec.data.fsn_num == chk->rec.data.fsn_num) { + /* + * Gak, He sent me a duplicate str seq + * number + */ + /* + * foo bar, I guess I will just free this + * new guy, should we abort too? FIX ME + * MAYBE? Or it COULD be that the SSN's have + * wrapped. Maybe I should compare to TSN + * somehow... sigh for now just blow away + * the chunk! + */ + SCTPDBG(SCTP_DEBUG_XXX, + "Duplicate to fsn: %u -- abort\n", + at->rec.data.fsn_num); + sctp_abort_in_reasm(stcb, control, + chk, abort_flag, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_13); + return; + } + } + if (inserted == 0) { + /* Goes on the end */ + SCTPDBG(SCTP_DEBUG_XXX, "Inserting at tail of list fsn: %u\n", + chk->rec.data.fsn_num); + asoc->size_on_reasm_queue += chk->send_size; + sctp_ucount_incr(asoc->cnt_on_reasm_queue); + TAILQ_INSERT_TAIL(&control->reasm, chk, sctp_next); + } + } + /* + * Ok lets see if we can suck any up into the control structure that + * are in seq if it makes sense. + */ + do_wakeup = 0; + /* + * If the first fragment has not been seen there is no sense in + * looking. 
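The list walk above keeps each message's reassembly queue sorted by fragment sequence number: insert before the first larger FSN, treat an equal one as a duplicate, append at the tail otherwise. The same discipline over a bare singly linked list, as a self-contained sketch (types are illustrative):

    #include <stdint.h>
    #include <stddef.h>

    struct frag {
    	uint32_t fsn;
    	struct frag *next;
    };

    /* Serial-number "greater than" over the 32-bit FSN space. */
    static int
    seq_gt(uint32_t a, uint32_t b)
    {
    	return ((int32_t)(a - b) > 0);
    }

    /* Insert f keeping the list FSN-sorted; return 0 on a duplicate. */
    static int
    frag_insert(struct frag **head, struct frag *f)
    {
    	struct frag **pp = head;

    	while (*pp != NULL && !seq_gt((*pp)->fsn, f->fsn)) {
    		if ((*pp)->fsn == f->fsn)
    			return (0);	/* duplicate: caller drops or aborts */
    		pp = &(*pp)->next;
    	}
    	f->next = *pp;
    	*pp = f;
    	return (1);
    }
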
+ */ + if (control->first_frag_seen) { + next_fsn = control->fsn_included + 1; + TAILQ_FOREACH_SAFE(at, &control->reasm, sctp_next, nat) { + if (at->rec.data.fsn_num == next_fsn) { + /* We can add this one now to the control */ + SCTPDBG(SCTP_DEBUG_XXX, + "Adding more to control: %p at: %p fsn: %u next_fsn: %u included: %u\n", + control, at, + at->rec.data.fsn_num, + next_fsn, control->fsn_included); + TAILQ_REMOVE(&control->reasm, at, sctp_next); + sctp_add_chk_to_control(control, strm, stcb, asoc, at, SCTP_READ_LOCK_NOT_HELD); + if (control->on_read_q) { + do_wakeup = 1; + } + next_fsn++; + if (control->end_added && control->pdapi_started) { + if (strm->pd_api_started) { + strm->pd_api_started = 0; + control->pdapi_started = 0; + } + if (control->on_read_q == 0) { + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, control->end_added, + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + do_wakeup = 1; + } + break; } + } else { + break; } } } - /* Do we need to do some delivery? check */ - sctp_deliver_reasm_check(stcb, asoc); + if (do_wakeup) { + /* Need to wakeup the reader */ + sctp_wakeup_the_read_socket(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + } } -/* - * This is an unfortunate routine. It checks to make sure a evil guy is not - * stuffing us full of bad packet fragments. A broken peer could also do this - * but this is doubtful. It is to bad I must worry about evil crackers sigh - * :< more cycles. - */ -static int -sctp_does_tsn_belong_to_reasm(struct sctp_association *asoc, - uint32_t TSN_seq) +static struct sctp_queued_to_read * +sctp_find_reasm_entry(struct sctp_stream_in *strm, uint32_t msg_id, int ordered, int old) { - struct sctp_tmit_chunk *at; - uint32_t tsn_est; - - TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { - if (SCTP_TSN_GT(TSN_seq, at->rec.data.TSN_seq)) { - /* is it one bigger? */ - tsn_est = at->rec.data.TSN_seq + 1; - if (tsn_est == TSN_seq) { - /* yep. It better be a last then */ - if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != - SCTP_DATA_LAST_FRAG) { - /* - * Ok this guy belongs next to a guy - * that is NOT last, it should be a - * middle/last, not a complete - * chunk. - */ - return (1); - } else { - /* - * This guy is ok since its a LAST - * and the new chunk is a fully - * self- contained one. - */ - return (0); - } + struct sctp_queued_to_read *control; + + if (ordered) { + TAILQ_FOREACH(control, &strm->inqueue, next_instrm) { + if (control->msg_id == msg_id) { + break; } - } else if (TSN_seq == at->rec.data.TSN_seq) { - /* Software error since I have a dup? */ - return (1); - } else { - /* - * Ok, 'at' is larger than new chunk but does it - * need to be right before it. 
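Once the first fragment is in hand, the loop above repeatedly merges the entry whose FSN is exactly one past `fsn_included` and stops at the first gap, which is what makes the earlier sorted insert pay off. A compact standalone model (same illustrative `frag` list as the previous sketch; the real code also moves the data and adjusts queue accounting):

    #include <stdint.h>
    #include <stddef.h>

    struct frag {
    	uint32_t fsn;
    	struct frag *next;
    };

    /* Absorb fragments in sequence; returns the new highest merged FSN. */
    static uint32_t
    coalesce(struct frag **head, uint32_t fsn_included)
    {
    	struct frag *f;

    	while ((f = *head) != NULL && f->fsn == fsn_included + 1) {
    		*head = f->next;	/* unlink; real code appends f's mbufs */
    		fsn_included = f->fsn;
    	}
    	return (fsn_included);
    }
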
- */ - tsn_est = TSN_seq + 1; - if (tsn_est == at->rec.data.TSN_seq) { - /* Yep, It better be a first */ - if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != - SCTP_DATA_FIRST_FRAG) { - return (1); - } else { - return (0); - } + } + } else { + if (old) { + control = TAILQ_FIRST(&strm->uno_inqueue); + return (control); + } + TAILQ_FOREACH(control, &strm->uno_inqueue, next_instrm) { + if (control->msg_id == msg_id) { + break; } } } - return (0); + return (control); } static int sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, - struct mbuf **m, int offset, struct sctp_data_chunk *ch, int chk_length, + struct mbuf **m, int offset, int chk_length, struct sctp_nets *net, uint32_t * high_tsn, int *abort_flag, - int *break_flag, int last_chunk) + int *break_flag, int last_chunk, uint8_t chtype) { /* Process a data chunk */ /* struct sctp_tmit_chunk *chk; */ + struct sctp_data_chunk *ch; + struct sctp_idata_chunk *nch, chunk_buf; struct sctp_tmit_chunk *chk; - uint32_t tsn, gap; + uint32_t tsn, fsn, gap, msg_id; struct mbuf *dmbuf; int the_len; int need_reasm_check = 0; - uint16_t strmno, strmseq; + uint16_t strmno; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; - struct sctp_queued_to_read *control; - int ordered; + struct sctp_queued_to_read *control = NULL; uint32_t protocol_id; uint8_t chunk_flags; struct sctp_stream_reset_list *liste; + struct sctp_stream_in *strm; + int ordered; + size_t clen; + int created_control = 0; + uint8_t old_data; chk = NULL; - tsn = ntohl(ch->dp.tsn); + if (chtype == SCTP_IDATA) { + nch = (struct sctp_idata_chunk *)sctp_m_getptr(*m, offset, + sizeof(struct sctp_idata_chunk), (uint8_t *) & chunk_buf); + ch = (struct sctp_data_chunk *)nch; + clen = sizeof(struct sctp_idata_chunk); + tsn = ntohl(ch->dp.tsn); + msg_id = ntohl(nch->dp.msg_id); + protocol_id = nch->dp.ppid_fsn.protocol_id; + if (ch->ch.chunk_flags & SCTP_DATA_FIRST_FRAG) + fsn = 0; + else + fsn = ntohl(nch->dp.ppid_fsn.fsn); + old_data = 0; + } else { + ch = (struct sctp_data_chunk *)sctp_m_getptr(*m, offset, + sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); + tsn = ntohl(ch->dp.tsn); + protocol_id = ch->dp.protocol_id; + clen = sizeof(struct sctp_data_chunk); + fsn = tsn; + msg_id = (uint32_t) (ntohs(ch->dp.stream_sequence)); + nch = NULL; + old_data = 1; + } chunk_flags = ch->ch.chunk_flags; + if ((size_t)chk_length == clen) { + /* + * Need to send an abort since we had a empty data chunk. + */ + op_err = sctp_generate_no_user_data_cause(ch->dp.tsn); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14; + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + return (0); + } if ((chunk_flags & SCTP_DATA_SACK_IMMEDIATELY) == SCTP_DATA_SACK_IMMEDIATELY) { asoc->send_sack = 1; } - protocol_id = ch->dp.protocol_id; ordered = ((chunk_flags & SCTP_DATA_UNORDERED) == 0); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_TSN_ENTERS); @@ -1356,6 +1708,117 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, * for on a partial delivery API. */ + /* Is the stream valid? 
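This parsing branch is where DATA and I-DATA diverge: classic DATA has no separate fragment sequence number, so the TSN doubles as the FSN and the 16-bit SSN becomes the message id, while I-DATA carries a 32-bit MID plus an FSN that shares its field with the PPID on the first fragment. A hedged sketch of the derivation (field and constant names are simplified; the real layouts live in sctp_header.h):

    #include <stdint.h>

    struct data_hdr {		/* classic DATA, simplified */
    	uint32_t tsn;
    	uint16_t sid, ssn;
    	uint32_t ppid;
    };

    struct idata_hdr {		/* I-DATA, simplified */
    	uint32_t tsn;
    	uint16_t sid, reserved;
    	uint32_t mid;
    	uint32_t ppid_or_fsn;	/* PPID on the first fragment, FSN after */
    };

    #define B_BIT 0x02		/* illustrative FIRST_FRAG flag value */

    /* Fragment sequence number the reassembler keys on (host order). */
    static uint32_t
    frag_seq(int is_idata, uint8_t flags, const void *hdr)
    {
    	if (!is_idata)
    		return (((const struct data_hdr *)hdr)->tsn);
    	if (flags & B_BIT)
    		return (0);	/* first I-DATA fragment is always FSN 0 */
    	return (((const struct idata_hdr *)hdr)->ppid_or_fsn);
    }
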
*/
+ strmno = ntohs(ch->dp.stream_id);
+
+ if (strmno >= asoc->streamincnt) {
+ struct sctp_error_invalid_stream *cause;
+
+ op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_invalid_stream),
+ 0, M_NOWAIT, 1, MT_DATA);
+ if (op_err != NULL) {
+ /* add some space up front so prepend will work well */
+ SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr));
+ cause = mtod(op_err, struct sctp_error_invalid_stream *);
+ /*
+ * Error causes are just params and this one has
+ * two back-to-back phdrs, one with the error type
+ * and size, the other with the stream id and a
+ * reserved field
+ */
+ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_invalid_stream);
+ cause->cause.code = htons(SCTP_CAUSE_INVALID_STREAM);
+ cause->cause.length = htons(sizeof(struct sctp_error_invalid_stream));
+ cause->stream_id = ch->dp.stream_id;
+ cause->reserved = htons(0);
+ sctp_queue_op_err(stcb, op_err);
+ }
+ SCTP_STAT_INCR(sctps_badsid);
+ SCTP_TCB_LOCK_ASSERT(stcb);
+ SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap);
+ if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) {
+ asoc->highest_tsn_inside_nr_map = tsn;
+ }
+ if (tsn == (asoc->cumulative_tsn + 1)) {
+ /* Update cum-ack */
+ asoc->cumulative_tsn = tsn;
+ }
+ return (0);
+ }
+ strm = &asoc->strmin[strmno];
+ /*
+ * If it's a fragmented message, let's see if we can find the control
+ * on the reassembly queues.
+ */
+ if ((chtype == SCTP_IDATA) &&
+ ((chunk_flags & SCTP_DATA_FIRST_FRAG) == 0) &&
+ (fsn == 0)) {
+ /*
+ * The first *must* be fsn 0, and other (middle/end) pieces
+ * can *not* be fsn 0. XXX: This can happen in case of a
+ * wrap around. Ignore it for now.
+ */
+ snprintf(msg, sizeof(msg), "FSN zero for MID=%8.8x, but flags=%2.2x",
+ msg_id, chunk_flags);
+ goto err_out;
+ }
+ control = sctp_find_reasm_entry(strm, msg_id, ordered, old_data);
+ SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags:0x%x look for control on queues %p\n",
+ chunk_flags, control);
+ if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) {
+ /* See if we can find the re-assembly entity */
+ if (control != NULL) {
+ /* We found something, does it belong? */
+ if (ordered && (msg_id != control->sinfo_ssn)) {
+ snprintf(msg, sizeof(msg), "Reassembly problem (MID=%8.8x)", msg_id);
+ err_out:
+ op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg);
+ stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15;
+ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED);
+ *abort_flag = 1;
+ return (0);
+ }
+ if (ordered && ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED)) {
+ /*
+ * We can't have a switched order with an
+ * unordered chunk
+ */
+ snprintf(msg, sizeof(msg), "All fragments of a user message must be ordered or unordered (TSN=%8.8x)",
+ tsn);
+ goto err_out;
+ }
+ if (!ordered && (((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) == 0)) {
+ /*
+ * We can't have a switched unordered with an
+ * ordered chunk
+ */
+ snprintf(msg, sizeof(msg), "All fragments of a user message must be ordered or unordered (TSN=%8.8x)",
+ tsn);
+ goto err_out;
+ }
+ }
+ } else {
+ /*
+ * It's a complete segment. Let's validate we don't have a
+ * re-assembly going on with the same Stream/Seq (for
+ * ordered) or in the same Stream for unordered.
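The operational error built above is a standard cause TLV: a four-byte cause header (code 1, Invalid Stream Identifier) followed by the offending stream id and a reserved half-word, eight bytes in all. A wire-format sketch under that reading (the struct is illustrative, not the kernel's sctp_error_invalid_stream):

    #include <stdint.h>
    #include <arpa/inet.h>

    struct cause_invalid_stream {	/* 8 bytes on the wire */
    	uint16_t code;		/* 1 = Invalid Stream Identifier */
    	uint16_t length;	/* 8, covering the whole cause */
    	uint16_t stream_id;	/* offending SID */
    	uint16_t reserved;
    };

    static void
    fill_invalid_stream_cause(struct cause_invalid_stream *c, uint16_t sid_net)
    {
    	c->code = htons(1);
    	c->length = htons(sizeof(*c));
    	c->stream_id = sid_net;	/* already network order, copied verbatim */
    	c->reserved = htons(0);
    }
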
+ */ + if (control != NULL) { + if (ordered || (old_data == 0)) { + SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags: 0x%x dup detected on msg_id: %u\n", + chunk_flags, msg_id); + snprintf(msg, sizeof(msg), "Duplicate MID=%8.8x detected.", msg_id); + goto err_out; + } else { + if ((tsn == control->fsn_included + 1) && + (control->end_added == 0)) { + snprintf(msg, sizeof(msg), "Illegal message sequence, missing end for MID: %8.8x", control->fsn_included); + goto err_out; + } else { + control = NULL; + } + } + } + } /* now do the tests */ if (((asoc->cnt_on_all_streams + asoc->cnt_on_reasm_queue + @@ -1388,68 +1851,31 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, #endif } /* now is it in the mapping array of what we have accepted? */ - if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map) && - SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { - /* Nope not in the valid range dump it */ - sctp_set_rwnd(stcb, asoc); - if ((asoc->cnt_on_all_streams + - asoc->cnt_on_reasm_queue + - asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) { - SCTP_STAT_INCR(sctps_datadropchklmt); - } else { - SCTP_STAT_INCR(sctps_datadroprwnd); + if (nch == NULL) { + if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map) && + SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { + /* Nope not in the valid range dump it */ + dump_packet: + sctp_set_rwnd(stcb, asoc); + if ((asoc->cnt_on_all_streams + + asoc->cnt_on_reasm_queue + + asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) { + SCTP_STAT_INCR(sctps_datadropchklmt); + } else { + SCTP_STAT_INCR(sctps_datadroprwnd); + } + *break_flag = 1; + return (0); + } + } else { + if (control == NULL) { + goto dump_packet; + } + if (SCTP_TSN_GT(fsn, control->top_fsn)) { + goto dump_packet; } - *break_flag = 1; - return (0); - } - } - strmno = ntohs(ch->dp.stream_id); - if (strmno >= asoc->streamincnt) { - struct sctp_paramhdr *phdr; - struct mbuf *mb; - - mb = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) * 2), - 0, M_DONTWAIT, 1, MT_DATA); - if (mb != NULL) { - /* add some space up front so prepend will work well */ - SCTP_BUF_RESV_UF(mb, sizeof(struct sctp_chunkhdr)); - phdr = mtod(mb, struct sctp_paramhdr *); - /* - * Error causes are just param's and this one has - * two back to back phdr, one with the error type - * and size, the other with the streamid and a rsvd - */ - SCTP_BUF_LEN(mb) = (sizeof(struct sctp_paramhdr) * 2); - phdr->param_type = htons(SCTP_CAUSE_INVALID_STREAM); - phdr->param_length = - htons(sizeof(struct sctp_paramhdr) * 2); - phdr++; - /* We insert the stream in the type field */ - phdr->param_type = ch->dp.stream_id; - /* And set the length to 0 for the rsvd field */ - phdr->param_length = 0; - sctp_queue_op_err(stcb, mb); - } - SCTP_STAT_INCR(sctps_badsid); - SCTP_TCB_LOCK_ASSERT(stcb); - SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); - if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { - asoc->highest_tsn_inside_nr_map = tsn; - } - if (tsn == (asoc->cumulative_tsn + 1)) { - /* Update cum-ack */ - asoc->cumulative_tsn = tsn; } - return (0); } - /* - * Before we continue lets validate that we are not being fooled by - * an evil attacker. We can only have 4k chunks based on our TSN - * spread allowed by the mapping array 512 * 8 bits, so there is no - * way our stream sequence numbers could have wrapped. We of course - * only validate the FIRST fragment so the bit must be set. 
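Both mapping arrays consulted here are plain bitmaps indexed by a TSN's distance from `mapping_array_base_tsn`, so "have we accepted this TSN" is a shift-and-mask test. A simplified model of what the set/test macros reduce to (not the kernel macros themselves):

    #include <stdint.h>

    static inline void
    tsn_set_present(uint8_t *map, uint32_t base_tsn, uint32_t tsn)
    {
    	uint32_t gap = tsn - base_tsn;	/* unsigned wraparound is the point */

    	map[gap >> 3] |= (uint8_t)(1U << (gap & 7));
    }

    static inline int
    tsn_is_present(const uint8_t *map, uint32_t base_tsn, uint32_t tsn)
    {
    	uint32_t gap = tsn - base_tsn;

    	return ((map[gap >> 3] >> (gap & 7)) & 1);
    }
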
- */ - strmseq = ntohs(ch->dp.stream_sequence); #ifdef SCTP_ASOCLOG_OF_TSNS SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->tsn_in_at >= SCTP_TSN_LOG_SIZE) { @@ -1458,7 +1884,7 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, } asoc->in_tsnlog[asoc->tsn_in_at].tsn = tsn; asoc->in_tsnlog[asoc->tsn_in_at].strm = strmno; - asoc->in_tsnlog[asoc->tsn_in_at].seq = strmseq; + asoc->in_tsnlog[asoc->tsn_in_at].seq = msg_id; asoc->in_tsnlog[asoc->tsn_in_at].sz = chk_length; asoc->in_tsnlog[asoc->tsn_in_at].flgs = chunk_flags; asoc->in_tsnlog[asoc->tsn_in_at].stcb = (void *)stcb; @@ -1466,18 +1892,26 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, asoc->in_tsnlog[asoc->tsn_in_at].in_out = 1; asoc->tsn_in_at++; #endif + /* + * Before we continue lets validate that we are not being fooled by + * an evil attacker. We can only have Nk chunks based on our TSN + * spread allowed by the mapping array N * 8 bits, so there is no + * way our stream sequence numbers could have wrapped. We of course + * only validate the FIRST fragment so the bit must be set. + */ if ((chunk_flags & SCTP_DATA_FIRST_FRAG) && (TAILQ_EMPTY(&asoc->resetHead)) && (chunk_flags & SCTP_DATA_UNORDERED) == 0 && - SCTP_SSN_GE(asoc->strmin[strmno].last_sequence_delivered, strmseq)) { + SCTP_MSGID_GE(old_data, asoc->strmin[strmno].last_sequence_delivered, msg_id)) { /* The incoming sseq is behind where we last delivered? */ - SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ:%d delivered:%d from peer, Abort!\n", - strmseq, asoc->strmin[strmno].last_sequence_delivered); + SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ: %u delivered: %u from peer, Abort!\n", + msg_id, asoc->strmin[strmno].last_sequence_delivered); + snprintf(msg, sizeof(msg), "Delivered SSN=%4.4x, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", asoc->strmin[strmno].last_sequence_delivered, - tsn, strmno, strmseq); + tsn, strmno, msg_id); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14; + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_flag = 1; return (0); @@ -1486,21 +1920,24 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, * From here down we may find ch-> invalid * so its a good idea NOT to use it. 
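`SCTP_MSGID_GE` has to compare in the right modular space: 16-bit SSNs for classic DATA, 32-bit MIDs for I-DATA, which is why `old_data` is threaded through. The usual serial-number idiom in both widths (a sketch mirroring RFC 1982-style comparison, not the kernel macro):

    #include <stdint.h>

    /* a >= b in a 16-bit circular sequence space */
    static int
    ssn_ge(uint16_t a, uint16_t b)
    {
    	return ((int16_t)(a - b) >= 0);
    }

    /* a >= b in a 32-bit circular sequence space */
    static int
    mid_ge(uint32_t a, uint32_t b)
    {
    	return ((int32_t)(a - b) >= 0);
    }

    /* old_data selects the width, as the patch's SCTP_MSGID_GE use does */
    static int
    msgid_ge(int old_data, uint32_t a, uint32_t b)
    {
    	return (old_data ? ssn_ge((uint16_t)a, (uint16_t)b) : mid_ge(a, b));
    }
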
*************************************/ - - the_len = (chk_length - sizeof(struct sctp_data_chunk)); + if (nch) { + the_len = (chk_length - sizeof(struct sctp_idata_chunk)); + } else { + the_len = (chk_length - sizeof(struct sctp_data_chunk)); + } if (last_chunk == 0) { - dmbuf = SCTP_M_COPYM(*m, - (offset + sizeof(struct sctp_data_chunk)), - the_len, M_DONTWAIT); + if (nch) { + dmbuf = SCTP_M_COPYM(*m, + (offset + sizeof(struct sctp_idata_chunk)), + the_len, M_NOWAIT); + } else { + dmbuf = SCTP_M_COPYM(*m, + (offset + sizeof(struct sctp_data_chunk)), + the_len, M_NOWAIT); + } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = dmbuf; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(dmbuf, SCTP_MBUF_ICOPY); } #endif } else { @@ -1509,7 +1946,11 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, dmbuf = *m; /* lop off the top part */ - m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk))); + if (nch) { + m_adj(dmbuf, (offset + sizeof(struct sctp_idata_chunk))); + } else { + m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk))); + } if (SCTP_BUF_NEXT(dmbuf) == NULL) { l_len = SCTP_BUF_LEN(dmbuf); } else { @@ -1533,11 +1974,36 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, SCTP_STAT_INCR(sctps_nomem); return (0); } + /* + * Now no matter what we need a control, get one if we don't have + * one (we may have gotten it above when we found the message was + * fragmented + */ + if (control == NULL) { + sctp_alloc_a_readq(stcb, control); + sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, + protocol_id, + strmno, msg_id, + chunk_flags, + NULL, fsn, msg_id); + if (control == NULL) { + SCTP_STAT_INCR(sctps_nomem); + return (0); + } + if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + control->data = dmbuf; + control->tail_mbuf = NULL; + control->end_added = control->last_frag_seen = control->first_frag_seen = 1; + control->top_fsn = control->fsn_included = fsn; + } + created_control = 1; + } + SCTPDBG(SCTP_DEBUG_XXX, "chunk_flags: 0x%x ordered: %d msgid: %u control: %p\n", + chunk_flags, ordered, msg_id, control); if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG && - asoc->fragmented_delivery_inprogress == 0 && TAILQ_EMPTY(&asoc->resetHead) && ((ordered == 0) || - ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == strmseq && + ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == msg_id && TAILQ_EMPTY(&asoc->strmin[strmno].inqueue)))) { /* Candidate for express delivery */ /* @@ -1547,109 +2013,30 @@ sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, * And there is room for it in the socket buffer. Lets just * stuff it up the buffer.... 
*/ - - /* It would be nice to avoid this copy if we could :< */ - sctp_alloc_a_readq(stcb, control); - sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, - protocol_id, - strmno, strmseq, - chunk_flags, - dmbuf); - if (control == NULL) { - goto failed_express_del; - } SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { asoc->highest_tsn_inside_nr_map = tsn; } + SCTPDBG(SCTP_DEBUG_XXX, "Injecting control: %p to be read (msg_id: %u)\n", + control, msg_id); + sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { /* for ordered, bump what we delivered */ - asoc->strmin[strmno].last_sequence_delivered++; + strm->last_sequence_delivered++; } SCTP_STAT_INCR(sctps_recvexpress); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { - sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, + sctp_log_strm_del_alt(stcb, tsn, msg_id, strmno, SCTP_STR_LOG_FROM_EXPRS_DEL); } control = NULL; - goto finish_express_del; } -failed_express_del: - /* If we reach here this is a new chunk */ - chk = NULL; - control = NULL; - /* Express for fragmented delivery? */ - if ((asoc->fragmented_delivery_inprogress) && - (stcb->asoc.control_pdapi) && - (asoc->str_of_pdapi == strmno) && - (asoc->ssn_of_pdapi == strmseq) - ) { - control = stcb->asoc.control_pdapi; - if ((chunk_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { - /* Can't be another first? */ - goto failed_pdapi_express_del; - } - if (tsn == (control->sinfo_tsn + 1)) { - /* Yep, we can add it on */ - int end = 0; - - if (chunk_flags & SCTP_DATA_LAST_FRAG) { - end = 1; - } - if (sctp_append_to_readq(stcb->sctp_ep, stcb, control, dmbuf, end, - tsn, - &stcb->sctp_socket->so_rcv)) { - SCTP_PRINTF("Append fails end:%d\n", end); - goto failed_pdapi_express_del; - } - SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); - if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { - asoc->highest_tsn_inside_nr_map = tsn; - } - SCTP_STAT_INCR(sctps_recvexpressm); - asoc->tsn_last_delivered = tsn; - asoc->fragment_flags = chunk_flags; - asoc->tsn_of_pdapi_last_delivered = tsn; - asoc->last_flags_delivered = chunk_flags; - asoc->last_strm_seq_delivered = strmseq; - asoc->last_strm_no_delivered = strmno; - if (end) { - /* clean up the flags and such */ - asoc->fragmented_delivery_inprogress = 0; - if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { - asoc->strmin[strmno].last_sequence_delivered++; - } - stcb->asoc.control_pdapi = NULL; - if (TAILQ_EMPTY(&asoc->reasmqueue) == 0) { - /* - * There could be another message - * ready - */ - need_reasm_check = 1; - } - } - control = NULL; - goto finish_express_del; - } - } -failed_pdapi_express_del: - control = NULL; - if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { - SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); - if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { - asoc->highest_tsn_inside_nr_map = tsn; - } - } else { - SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap); - if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map)) { - asoc->highest_tsn_inside_map = tsn; - } - } + /* Now will we need a chunk too? 
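The express path takes a chunk straight to the receive buffer, skipping the reassembly machinery, when nothing could possibly have to be delivered before it. Stripped of locking and statistics, the guard amounts to this predicate (simplified; the pending stream-reset case is handled separately below):

    #include <stdint.h>

    /*
     * Unfragmented and either unordered, or exactly the next SSN on an
     * otherwise empty ordered in-queue: nothing can precede it.
     */
    static int
    express_ok(int not_fragmented, int ordered, uint16_t ssn,
        uint16_t last_delivered, int inqueue_empty)
    {
    	if (!not_fragmented)
    		return (0);
    	if (!ordered)
    		return (1);
    	return (inqueue_empty && (uint16_t)(last_delivered + 1) == ssn);
    }
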
*/ if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) { sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { @@ -1663,7 +2050,8 @@ failed_pdapi_express_del: } chk->rec.data.TSN_seq = tsn; chk->no_fr_allowed = 0; - chk->rec.data.stream_seq = strmseq; + chk->rec.data.fsn_num = fsn; + chk->rec.data.stream_seq = msg_id; chk->rec.data.stream_number = strmno; chk->rec.data.payloadtype = protocol_id; chk->rec.data.context = stcb->asoc.context; @@ -1672,193 +2060,110 @@ failed_pdapi_express_del: chk->asoc = asoc; chk->send_size = the_len; chk->whoTo = net; + SCTPDBG(SCTP_DEBUG_XXX, "Building ck: %p for control: %p to be read (msg_id: %u)\n", + chk, + control, msg_id); atomic_add_int(&net->ref_count, 1); chk->data = dmbuf; + } + /* Set the appropriate TSN mark */ + if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_nr_map)) { + asoc->highest_tsn_inside_nr_map = tsn; + } } else { - sctp_alloc_a_readq(stcb, control); - sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, - protocol_id, - strmno, strmseq, - chunk_flags, - dmbuf); - if (control == NULL) { - /* No memory so we drop the chunk */ - SCTP_STAT_INCR(sctps_nomem); - if (last_chunk == 0) { - /* we copied it, free the copy */ - sctp_m_freem(dmbuf); - } - return (0); + SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap); + if (SCTP_TSN_GT(tsn, asoc->highest_tsn_inside_map)) { + asoc->highest_tsn_inside_map = tsn; } - control->length = the_len; } - - /* Mark it as received */ - /* Now queue it where it belongs */ - if (control != NULL) { - /* First a sanity check */ - if (asoc->fragmented_delivery_inprogress) { + /* Now is it complete (i.e. not fragmented)? */ + if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + /* + * Special check for when streams are resetting. We could be + * more smart about this and check the actual stream to see + * if it is not being reset.. that way we would not create a + * HOLB when amongst streams being reset and those not being + * reset. + * + */ + if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && + SCTP_TSN_GT(tsn, liste->tsn)) { /* - * Ok, we have a fragmented delivery in progress if - * this chunk is next to deliver OR belongs in our - * view to the reassembly, the peer is evil or - * broken. + * yep its past where we need to reset... go ahead + * and queue it. */ - uint32_t estimate_tsn; - - estimate_tsn = asoc->tsn_last_delivered + 1; - if (TAILQ_EMPTY(&asoc->reasmqueue) && - (estimate_tsn == control->sinfo_tsn)) { - /* Evil/Broke peer */ - sctp_m_freem(control->data); - control->data = NULL; - if (control->whoFrom) { - sctp_free_remote_addr(control->whoFrom); - control->whoFrom = NULL; - } - sctp_free_a_readq(stcb, control); - snprintf(msg, sizeof(msg), "Reas. 
queue emtpy, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - tsn, strmno, strmseq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - if (last_chunk) { - *m = NULL; - } - return (0); + if (TAILQ_EMPTY(&asoc->pending_reply_queue)) { + /* first one on */ + TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); } else { - if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { - sctp_m_freem(control->data); - control->data = NULL; - if (control->whoFrom) { - sctp_free_remote_addr(control->whoFrom); - control->whoFrom = NULL; - } - sctp_free_a_readq(stcb, control); - snprintf(msg, sizeof(msg), "PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - tsn, strmno, strmseq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - if (last_chunk) { - *m = NULL; + struct sctp_queued_to_read *ctlOn, *nctlOn; + unsigned char inserted = 0; + + TAILQ_FOREACH_SAFE(ctlOn, &asoc->pending_reply_queue, next, nctlOn) { + if (SCTP_TSN_GT(control->sinfo_tsn, ctlOn->sinfo_tsn)) { + + continue; + } else { + /* found it */ + TAILQ_INSERT_BEFORE(ctlOn, control, next); + inserted = 1; + break; } - return (0); } - } - } else { - /* No PDAPI running */ - if (!TAILQ_EMPTY(&asoc->reasmqueue)) { - /* - * Reassembly queue is NOT empty validate - * that this tsn does not need to be in - * reasembly queue. If it does then our peer - * is broken or evil. - */ - if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { - sctp_m_freem(control->data); - control->data = NULL; - if (control->whoFrom) { - sctp_free_remote_addr(control->whoFrom); - control->whoFrom = NULL; - } - sctp_free_a_readq(stcb, control); - snprintf(msg, sizeof(msg), "No PD ongoing, got TSN=%8.8x, SID=%4.4x, SSN=%4.4x", - tsn, strmno, strmseq); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - *abort_flag = 1; - if (last_chunk) { - *m = NULL; - } - return (0); + if (inserted == 0) { + /* + * must be put at end, use prevP + * (all setup from loop) to setup + * nextP. + */ + TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); } } + goto finish_express_del; } - /* ok, if we reach here we have passed the sanity checks */ if (chunk_flags & SCTP_DATA_UNORDERED) { /* queue directly into socket buffer */ + SCTPDBG(SCTP_DEBUG_XXX, "Unordered data to be read control: %p msg_id: %u\n", + control, msg_id); sctp_mark_non_revokable(asoc, control->sinfo_tsn); sctp_add_to_readq(stcb->sctp_ep, stcb, control, - &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); - } else { - /* - * Special check for when streams are resetting. We - * could be more smart about this and check the - * actual stream to see if it is not being reset.. - * that way we would not create a HOLB when amongst - * streams being reset and those not being reset. - * - * We take complete messages that have a stream reset - * intervening (aka the TSN is after where our - * cum-ack needs to be) off and put them on a - * pending_reply_queue. 
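A complete message whose TSN lies beyond a pending stream reset cannot be delivered yet, so it is parked on `pending_reply_queue` (kept TSN-sorted by the insert-before walk above) and replayed once the reset takes effect. The gating test itself is tiny (illustrative):

    #include <stdint.h>

    static int
    tsn_gt(uint32_t a, uint32_t b)
    {
    	return ((int32_t)(a - b) > 0);
    }

    /* Park the message if a reset pending at reset_tsn still covers it. */
    static int
    must_defer_for_reset(int reset_pending, uint32_t reset_tsn, uint32_t tsn)
    {
    	return (reset_pending && tsn_gt(tsn, reset_tsn));
    }
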
The reassembly ones we do - * not have to worry about since they are all sorted - * and proceessed by TSN order. It is only the - * singletons I must worry about. - */ - if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && - SCTP_TSN_GT(tsn, liste->tsn)) { - /* - * yep its past where we need to reset... go - * ahead and queue it. - */ - if (TAILQ_EMPTY(&asoc->pending_reply_queue)) { - /* first one on */ - TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); - } else { - struct sctp_queued_to_read *ctlOn, - *nctlOn; - unsigned char inserted = 0; + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); - TAILQ_FOREACH_SAFE(ctlOn, &asoc->pending_reply_queue, next, nctlOn) { - if (SCTP_TSN_GT(control->sinfo_tsn, ctlOn->sinfo_tsn)) { - continue; - } else { - /* found it */ - TAILQ_INSERT_BEFORE(ctlOn, control, next); - inserted = 1; - break; - } - } - if (inserted == 0) { - /* - * must be put at end, use - * prevP (all setup from - * loop) to setup nextP. - */ - TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); - } - } - } else { - sctp_queue_data_to_stream(stcb, asoc, control, abort_flag); - if (*abort_flag) { - if (last_chunk) { - *m = NULL; - } - return (0); + } else { + SCTPDBG(SCTP_DEBUG_XXX, "Queue control: %p for reordering msg_id: %u\n", control, + msg_id); + sctp_queue_data_to_stream(stcb, strm, asoc, control, abort_flag, &need_reasm_check); + if (*abort_flag) { + if (last_chunk) { + *m = NULL; } + return (0); } } - } else { - /* Into the re-assembly queue */ - sctp_queue_data_for_reasm(stcb, asoc, chk, abort_flag); - if (*abort_flag) { - /* - * the assoc is now gone and chk was put onto the - * reasm queue, which has all been freed. - */ - if (last_chunk) { - *m = NULL; - } - return (0); + goto finish_express_del; + } + /* If we reach here its a reassembly */ + need_reasm_check = 1; + SCTPDBG(SCTP_DEBUG_XXX, + "Queue data to stream for reasm control: %p msg_id: %u\n", + control, msg_id); + sctp_queue_data_for_reasm(stcb, asoc, strm, control, chk, created_control, abort_flag, tsn); + if (*abort_flag) { + /* + * the assoc is now gone and chk was put onto the reasm + * queue, which has all been freed. 
+ */ + if (last_chunk) { + *m = NULL; } + return (0); } finish_express_del: + /* Here we tidy up things */ if (tsn == (asoc->cumulative_tsn + 1)) { /* Update cum-ack */ asoc->cumulative_tsn = tsn; @@ -1874,7 +2179,7 @@ finish_express_del: SCTP_STAT_INCR(sctps_recvdata); /* Set it present please */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { - sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_MARK_TSN); + sctp_log_strm_del_alt(stcb, tsn, msg_id, strmno, SCTP_STR_LOG_FROM_MARK_TSN); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, @@ -1893,6 +2198,7 @@ finish_express_del: sctp_reset_in_stream(stcb, liste->number_entries, liste->list_of_streams); TAILQ_REMOVE(&asoc->resetHead, liste, next_resp); + sctp_send_deferred_reset_response(stcb, liste, SCTP_STREAM_RESET_RESULT_PERFORMED); SCTP_FREE(liste, SCTP_M_STRESET); /* sa_ignore FREED_MEMORY */ liste = TAILQ_FIRST(&asoc->resetHead); @@ -1900,7 +2206,7 @@ finish_express_del: /* All can be removed */ TAILQ_FOREACH_SAFE(ctl, &asoc->pending_reply_queue, next, nctl) { TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); - sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); + sctp_queue_data_to_stream(stcb, strm, asoc, ctl, abort_flag, &need_reasm_check); if (*abort_flag) { return (0); } @@ -1916,7 +2222,7 @@ finish_express_del: * ctl->sinfo_tsn > liste->tsn */ TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); - sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); + sctp_queue_data_to_stream(stcb, strm, asoc, ctl, abort_flag, &need_reasm_check); if (*abort_flag) { return (0); } @@ -1926,17 +2232,17 @@ finish_express_del: * Now service re-assembly to pick up anything that has been * held on reassembly queue? */ - sctp_deliver_reasm_check(stcb, asoc); + (void)sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_NOT_HELD); need_reasm_check = 0; } if (need_reasm_check) { /* Another one waits ? */ - sctp_deliver_reasm_check(stcb, asoc); + (void)sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_NOT_HELD); } return (1); } -int8_t sctp_map_lookup_tab[256] = { +static const int8_t sctp_map_lookup_tab[256] = { 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, @@ -1980,7 +2286,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb) * 1) Did we move the cum-ack point? * * When you first glance at this you might think that all entries that - * make up the postion of the cum-ack would be in the nr-mapping + * make up the position of the cum-ack would be in the nr-mapping * array only.. i.e. things up to the cum-ack are always * deliverable. Thats true with one exception, when its a fragmented * message we may not deliver the data until some threshold (or all @@ -2078,7 +2384,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb) #ifdef INVARIANTS panic("impossible slide"); #else - SCTP_PRINTF("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n", + SCTP_PRINTF("impossible slide lgap: %x slide_end: %x slide_from: %x? 
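The 256-entry table that is now const appears to encode, for every byte of the mapping array, how long the run of consecutive one bits starting at bit 0 is; the slide logic uses it to advance over acked bytes without a bit-by-bit loop. Under that reading the table can be regenerated mechanically (a sketch, not the kernel's generator):

    #include <stdio.h>

    /* Count the consecutive one bits of v starting at bit 0. */
    static int
    trailing_ones(unsigned v)
    {
    	int n = 0;

    	while (v & 1) {
    		n++;
    		v >>= 1;
    	}
    	return (n);
    }

    int
    main(void)
    {
    	unsigned v;

    	/* Prints a table matching sctp_map_lookup_tab's pattern. */
    	for (v = 0; v < 256; v++)
    		printf("%d,%c", trailing_ones(v), (v % 16 == 15) ? '\n' : ' ');
    	return (0);
    }
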
at: %d\n", lgap, slide_end, slide_from, at); return; #endif @@ -2087,7 +2393,7 @@ sctp_slide_mapping_arrays(struct sctp_tcb *stcb) #ifdef INVARIANTS panic("would overrun buffer"); #else - SCTP_PRINTF("Gak, would have overrun map end:%d slide_end:%d\n", + SCTP_PRINTF("Gak, would have overrun map end: %d slide_end: %d\n", asoc->mapping_array_size, slide_end); slide_end = asoc->mapping_array_size; #endif @@ -2166,7 +2472,8 @@ sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap) */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, - stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INDATA + SCTP_LOC_18); + stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_17); } sctp_send_shutdown(stcb, ((stcb->asoc.alternate) ? stcb->asoc.alternate : stcb->asoc.primary_destination)); @@ -2231,76 +2538,12 @@ sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap) } } -void -sctp_service_queues(struct sctp_tcb *stcb, struct sctp_association *asoc) -{ - struct sctp_tmit_chunk *chk; - uint32_t tsize, pd_point; - uint16_t nxt_todel; - - if (asoc->fragmented_delivery_inprogress) { - sctp_service_reassembly(stcb, asoc); - } - /* Can we proceed further, i.e. the PD-API is complete */ - if (asoc->fragmented_delivery_inprogress) { - /* no */ - return; - } - /* - * Now is there some other chunk I can deliver from the reassembly - * queue. - */ -doit_again: - chk = TAILQ_FIRST(&asoc->reasmqueue); - if (chk == NULL) { - asoc->size_on_reasm_queue = 0; - asoc->cnt_on_reasm_queue = 0; - return; - } - nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; - if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && - ((nxt_todel == chk->rec.data.stream_seq) || - (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { - /* - * Yep the first one is here. We setup to start reception, - * by backing down the TSN just in case we can't deliver. - */ - - /* - * Before we start though either all of the message should - * be here or the socket buffer max or nothing on the - * delivery queue and something can be delivered. 
- */ - if (stcb->sctp_socket) { - pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket) >> SCTP_PARTIAL_DELIVERY_SHIFT, - stcb->sctp_ep->partial_delivery_point); - } else { - pd_point = stcb->sctp_ep->partial_delivery_point; - } - if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) { - asoc->fragmented_delivery_inprogress = 1; - asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1; - asoc->str_of_pdapi = chk->rec.data.stream_number; - asoc->ssn_of_pdapi = chk->rec.data.stream_seq; - asoc->pdapi_ppid = chk->rec.data.payloadtype; - asoc->fragment_flags = chk->rec.data.rcv_flags; - sctp_service_reassembly(stcb, asoc); - if (asoc->fragmented_delivery_inprogress == 0) { - goto doit_again; - } - } - } -} - int sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, - struct sockaddr *src, struct sockaddr *dst, - struct sctphdr *sh, struct sctp_inpcb *inp, - struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t * high_tsn, - uint8_t use_mflowid, uint32_t mflowid, - uint32_t vrf_id, uint16_t port) + struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net, uint32_t * high_tsn) { - struct sctp_data_chunk *ch, chunk_buf; + struct sctp_chunkhdr *ch, chunk_buf; struct sctp_association *asoc; int num_chunks = 0; /* number of control chunks processed */ int stop_proc = 0; @@ -2338,7 +2581,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, */ if (SCTP_BUF_LEN(m) < (long)MLEN && SCTP_BUF_NEXT(m) == NULL) { /* we only handle mbufs that are singletons.. not chains */ - m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_DONTWAIT, 1, MT_DATA); + m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_NOWAIT, 1, MT_DATA); if (m) { /* ok lets see if we can copy the data up */ caddr_t *from, *to; @@ -2350,7 +2593,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, /* copy the length and free up the old */ SCTP_BUF_LEN(m) = SCTP_BUF_LEN((*mm)); sctp_m_freem(*mm); - /* sucess, back copy */ + /* success, back copy */ *mm = m; } else { /* We are in trouble in the mbuf world .. 
yikes */ @@ -2358,8 +2601,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, } } /* get pointer to the first chunk header */ - ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset, - sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_chunkhdr), (uint8_t *) & chunk_buf); if (ch == NULL) { return (1); } @@ -2371,14 +2614,44 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, asoc->data_pkts_seen++; while (stop_proc == 0) { /* validate chunk length */ - chk_length = ntohs(ch->ch.chunk_length); + chk_length = ntohs(ch->chunk_length); if (length - *offset < chk_length) { /* all done, mutulated chunk */ stop_proc = 1; continue; } - if (ch->ch.chunk_type == SCTP_DATA) { - if ((size_t)chk_length < sizeof(struct sctp_data_chunk)) { + if ((asoc->idata_supported == 1) && + (ch->chunk_type == SCTP_DATA)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + + snprintf(msg, sizeof(msg), "%s", "I-DATA chunk received when DATA was negotiated"); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_18; + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); + return (2); + } + if ((asoc->idata_supported == 0) && + (ch->chunk_type == SCTP_IDATA)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + + snprintf(msg, sizeof(msg), "%s", "DATA chunk received when I-DATA was negotiated"); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19; + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); + return (2); + } + if ((ch->chunk_type == SCTP_DATA) || + (ch->chunk_type == SCTP_IDATA)) { + int clen; + + if (ch->chunk_type == SCTP_DATA) { + clen = sizeof(struct sctp_data_chunk); + } else { + clen = sizeof(struct sctp_idata_chunk); + } + if (chk_length < clen) { /* * Need to send an abort since we had a * invalid data chunk. @@ -2389,26 +2662,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, snprintf(msg, sizeof(msg), "DATA chunk of length %d", chk_length); op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19; - sctp_abort_association(inp, stcb, m, iphlen, - src, dst, sh, op_err, - use_mflowid, mflowid, - vrf_id, port); - return (2); - } - if ((size_t)chk_length == sizeof(struct sctp_data_chunk)) { - /* - * Need to send an abort since we had an - * empty data chunk. 
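The two symmetric aborts above enforce the chunk-type negotiation: once the association has settled on I-DATA (or not), receiving the other chunk type is a protocol violation. The rule fits in a few lines (the chunk type values here are illustrative placeholders):

    #include <stdint.h>

    #define CT_DATA	 0x00	/* illustrative */
    #define CT_IDATA 0x40	/* illustrative */

    /* Nonzero when the received chunk type contradicts negotiation. */
    static int
    chunk_type_mismatch(int idata_supported, uint8_t chunk_type)
    {
    	if (idata_supported && chunk_type == CT_DATA)
    		return (1);	/* DATA after I-DATA was negotiated */
    	if (!idata_supported && chunk_type == CT_IDATA)
    		return (1);	/* I-DATA without negotiation */
    	return (0);
    }
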
- */ - struct mbuf *op_err; - - op_err = sctp_generate_no_user_data_cause(ch->dp.tsn); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19; - sctp_abort_association(inp, stcb, m, iphlen, - src, dst, sh, op_err, - use_mflowid, mflowid, - vrf_id, port); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_20; + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (2); } #ifdef SCTP_AUDITING_ENABLED @@ -2419,9 +2674,9 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, } else { last_chunk = 0; } - if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, ch, + if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, chk_length, net, high_tsn, &abort_flag, &break_flag, - last_chunk)) { + last_chunk, ch->chunk_type)) { num_chunks++; } if (abort_flag) @@ -2437,7 +2692,7 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, } } else { /* not a data chunk in the data region */ - switch (ch->ch.chunk_type) { + switch (ch->chunk_type) { case SCTP_INITIATION: case SCTP_INITIATION_ACK: case SCTP_SELECTIVE_ACK: @@ -2459,64 +2714,50 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, case SCTP_STREAM_RESET: case SCTP_FORWARD_CUM_TSN: case SCTP_ASCONF: - /* - * Now, what do we do with KNOWN chunks that - * are NOT in the right place? - * - * For now, I do nothing but ignore them. We - * may later want to add sysctl stuff to - * switch out and do either an ABORT() or - * possibly process them. - */ - if (SCTP_BASE_SYSCTL(sctp_strict_data_order)) { + { + /* + * Now, what do we do with KNOWN + * chunks that are NOT in the right + * place? + * + * For now, I do nothing but ignore + * them. We may later want to add + * sysctl stuff to switch out and do + * either an ABORT() or possibly + * process them. + */ struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, ""); - sctp_abort_association(inp, stcb, - m, iphlen, - src, dst, - sh, op_err, - use_mflowid, mflowid, - vrf_id, port); + snprintf(msg, sizeof(msg), "DATA chunk followed by chunk of type %2.2x", + ch->chunk_type); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (2); } - break; default: /* unknown chunk type, use bit rules */ - if (ch->ch.chunk_type & 0x40) { + if (ch->chunk_type & 0x40) { /* Add a error report to the queue */ - struct mbuf *merr; - struct sctp_paramhdr *phd; - - merr = sctp_get_mbuf_for_msg(sizeof(*phd), 0, M_DONTWAIT, 1, MT_DATA); - if (merr) { - phd = mtod(merr, struct sctp_paramhdr *); - /* - * We cheat and use param - * type since we did not - * bother to define a error - * cause struct. They are - * the same basic format - * with different names. 
- */ - phd->param_type = - htons(SCTP_CAUSE_UNRECOG_CHUNK); - phd->param_length = - htons(chk_length + sizeof(*phd)); - SCTP_BUF_LEN(merr) = sizeof(*phd); - SCTP_BUF_NEXT(merr) = SCTP_M_COPYM(m, *offset, chk_length, M_DONTWAIT); - if (SCTP_BUF_NEXT(merr)) { - if (sctp_pad_lastmbuf(SCTP_BUF_NEXT(merr), SCTP_SIZE32(chk_length) - chk_length, NULL)) { - sctp_m_freem(merr); - } else { - sctp_queue_op_err(stcb, merr); - } + struct mbuf *op_err; + struct sctp_gen_error_cause *cause; + + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), + 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + cause = mtod(op_err, struct sctp_gen_error_cause *); + cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); + cause->length = htons((uint16_t) (chk_length + sizeof(struct sctp_gen_error_cause))); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); + SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, chk_length, M_NOWAIT); + if (SCTP_BUF_NEXT(op_err) != NULL) { + sctp_queue_op_err(stcb, op_err); } else { - sctp_m_freem(merr); + sctp_m_freem(op_err); } } } - if ((ch->ch.chunk_type & 0x80) == 0) { + if ((ch->chunk_type & 0x80) == 0) { /* discard the rest of this packet */ stop_proc = 1; } /* else skip this bad chunk and @@ -2530,8 +2771,8 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, stop_proc = 1; continue; } - ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset, - sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_chunkhdr), (uint8_t *) & chunk_buf); if (ch == NULL) { *offset = length; stop_proc = 1; @@ -2561,9 +2802,6 @@ sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length, (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd); } /* now service all of the reassm queue if needed */ - if (!(TAILQ_EMPTY(&asoc->reasmqueue))) - sctp_service_queues(stcb, asoc); - if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { /* Assure that we ack right away */ stcb->asoc.send_sack = 1; @@ -2604,12 +2842,14 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 * cumack trackers for first transmissions, * and retransmissions. 
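The pseudo-cumack bookkeeping that follows, like the SCTP_TSN_GT()/SCTP_TSN_GE() tests used throughout this file, compares 32-bit TSNs in serial-number arithmetic so that wraparound at 2^32 is harmless. A self-contained equivalent (the kernel macros are written differently but agree everywhere except the ambiguous 2^31 distance; this form assumes the usual two's-complement conversion):

#include <stdbool.h>
#include <stdint.h>

/* true iff a is logically after b, modulo 2^32 */
static bool
tsn_gt(uint32_t a, uint32_t b)
{
    return ((int32_t)(a - b) > 0);
}

static bool
tsn_ge(uint32_t a, uint32_t b)
{
    return (a == b || tsn_gt(a, b));
}

/* Example: tsn_gt(0x00000002, 0xfffffffe) is true across the wrap. */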
*/ - if ((tp1->whoTo->find_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) && + if ((tp1->sent < SCTP_DATAGRAM_RESEND) && + (tp1->whoTo->find_pseudo_cumack == 1) && (tp1->snd_count == 1)) { tp1->whoTo->pseudo_cumack = tp1->rec.data.TSN_seq; tp1->whoTo->find_pseudo_cumack = 0; } - if ((tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) && + if ((tp1->sent < SCTP_DATAGRAM_RESEND) && + (tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->snd_count > 1)) { tp1->whoTo->rtx_pseudo_cumack = tp1->rec.data.TSN_seq; tp1->whoTo->find_rtx_pseudo_cumack = 0; @@ -2697,7 +2937,7 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_GAP, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); @@ -2772,6 +3012,11 @@ sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1 panic("No chunks on the queues for sid %u.", tp1->rec.data.stream_number); #endif } + if ((stcb->asoc.strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) && + (stcb->asoc.strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) && + TAILQ_EMPTY(&stcb->asoc.strmout[tp1->rec.data.stream_number].outqueue)) { + stcb->asoc.trigger_reset = 1; + } tp1->sent = SCTP_DATAGRAM_NR_ACKED; if (tp1->data) { /* @@ -2901,7 +3146,7 @@ sctp_check_for_revoked(struct sctp_tcb *stcb, sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_increase(tp1); @@ -2961,7 +3206,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, num_dests_sacked++; } } - if (stcb->asoc.peer_supports_prsctp) { + if (stcb->asoc.prsctp_supported) { (void)SCTP_GETTIME_TIMEVAL(&now); } TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { @@ -2982,7 +3227,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, /* done */ break; } - if (stcb->asoc.peer_supports_prsctp) { + if (stcb->asoc.prsctp_supported) { if ((PR_SCTP_TTL_ENABLED(tp1->flags)) && tp1->sent < SCTP_DATAGRAM_ACKED) { /* Is it expired? */ if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) { @@ -3215,7 +3460,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND, (tp1->whoTo ? (tp1->whoTo->flight_size) : 0), tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } if (tp1->whoTo) { @@ -3236,7 +3481,7 @@ sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, /* remove from the total flight */ sctp_total_flight_decrease(stcb, tp1); - if ((stcb->asoc.peer_supports_prsctp) && + if ((stcb->asoc.prsctp_supported) && (PR_SCTP_RTX_ENABLED(tp1->flags))) { /* * Has it been retransmitted tv_sec times? 
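The recurring change from (uintptr_t) to (uint32_t)(uintptr_t) in the sctp_misc_ints() calls above is about squeezing a pointer into a 32-bit log slot on LP64 platforms: converting a pointer directly to uint32_t draws a pointer-to-int-cast warning, while going through uintptr_t first makes the truncation explicit and silent. The idiom in isolation:

#include <stdint.h>

/*
 * Produce a 32-bit correlation token from a pointer.  The value is
 * deliberately truncated; it only needs to tell log entries apart,
 * never to be converted back into a pointer.
 */
static uint32_t
ptr_to_log_token(const void *p)
{
    return ((uint32_t)(uintptr_t)p);
}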
- @@ -3381,7 +3626,7 @@ sctp_try_advance_peer_ack_point(struct sctp_tcb *stcb, struct timeval now; int now_filled = 0; - if (asoc->peer_supports_prsctp == 0) { + if (asoc->prsctp_supported == 0) { return (NULL); } TAILQ_FOREACH_SAFE(tp1, &asoc->sent_queue, sctp_next, tp2) { @@ -3467,18 +3712,24 @@ sctp_fs_audit(struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; int inflight = 0, resend = 0, inbetween = 0, acked = 0, above = 0; - int entry_flight, entry_cnt, ret; + int ret; + +#ifndef INVARIANTS + int entry_flight, entry_cnt; + +#endif + ret = 0; +#ifndef INVARIANTS entry_flight = asoc->total_flight; entry_cnt = asoc->total_flight_count; - ret = 0; - +#endif if (asoc->pr_sctp_cnt >= asoc->sent_queue_cnt) return (0); TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->sent < SCTP_DATAGRAM_RESEND) { - SCTP_PRINTF("Chk TSN:%u size:%d inflight cnt:%d\n", + SCTP_PRINTF("Chk TSN: %u size: %d inflight cnt: %d\n", chk->rec.data.TSN_seq, chk->send_size, chk->snd_count); @@ -3498,10 +3749,10 @@ sctp_fs_audit(struct sctp_association *asoc) #ifdef INVARIANTS panic("Flight size-express incorrect? \n"); #else - SCTP_PRINTF("asoc->total_flight:%d cnt:%d\n", + SCTP_PRINTF("asoc->total_flight: %d cnt: %d\n", entry_flight, entry_cnt); - SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n", + SCTP_PRINTF("Flight size-express incorrect F: %d I: %d R: %d Ab: %d ACK: %d\n", inflight, inbetween, resend, above, acked); ret = 1; #endif @@ -3519,9 +3770,9 @@ sctp_window_probe_recovery(struct sctp_tcb *stcb, if ((tp1->sent >= SCTP_DATAGRAM_ACKED) || (tp1->data == NULL)) { /* TSN's skipped we do NOT move back. */ sctp_misc_ints(SCTP_FLIGHT_LOG_DWN_WP_FWD, - tp1->whoTo->flight_size, + tp1->whoTo ? tp1->whoTo->flight_size : 0, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); return; } @@ -3540,7 +3791,7 @@ sctp_window_probe_recovery(struct sctp_tcb *stcb, sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_WP, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } } @@ -3557,6 +3808,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, int win_probe_recovered = 0; int j, done_once = 0; int rto_ok = 1; + uint32_t send_s; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) { sctp_misc_ints(SCTP_SACK_LOG_EXPRESS, cumack, @@ -3608,36 +3860,25 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, (*stcb->asoc.cc_functions.sctp_cwnd_prepare_net_for_sack) (stcb, net); } } - if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { - uint32_t send_s; - - if (!TAILQ_EMPTY(&asoc->sent_queue)) { - tp1 = TAILQ_LAST(&asoc->sent_queue, - sctpchunk_listhead); - send_s = tp1->rec.data.TSN_seq + 1; - } else { - send_s = asoc->sending_seq; - } - if (SCTP_TSN_GE(cumack, send_s)) { -#ifndef INVARIANTS - struct mbuf *op_err; - char msg[SCTP_DIAG_INFO_LEN]; - -#endif -#ifdef INVARIANTS - panic("Impossible sack 1"); -#else + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + tp1 = TAILQ_LAST(&asoc->sent_queue, + sctpchunk_listhead); + send_s = tp1->rec.data.TSN_seq + 1; + } else { + send_s = asoc->sending_seq; + } + if (SCTP_TSN_GE(cumack, send_s)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; - *abort_now = 1; - /* XXX */ - snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x", - cumack, send_s); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; 
- sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - return; -#endif - } + *abort_now = 1; + /* XXX */ + snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", + cumack, send_s); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_21; + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); + return; } asoc->this_sack_highest_gap = cumack; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { @@ -3666,7 +3907,7 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); @@ -3746,6 +3987,11 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, #endif } } + if ((asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) && + (asoc->strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) && + TAILQ_EMPTY(&asoc->strmout[tp1->rec.data.stream_number].outqueue)) { + asoc->trigger_reset = 1; + } TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); if (tp1->data) { /* sa_ignore NO_NULL_CHK */ @@ -3830,7 +4076,9 @@ sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, } if (net->dest_state & SCTP_ADDR_PF) { net->dest_state &= ~SCTP_ADDR_PF; - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_22); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net); /* Done with this net */ @@ -3916,7 +4164,7 @@ again: } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_22); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_23); } } } @@ -3957,28 +4205,8 @@ again: if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && - (asoc->locked_on_sending) - ) { - struct sctp_stream_queue_pending *sp; - - /* - * I may be in a state where we got all across.. but - * cannot write more due to a shutdown... we abort - * since the user did not indicate EOR in this case. - * The sp will be cleaned during free of the asoc. 
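The trigger_reset assignments added above (and again in sctp_process_segment_range() and sctp_handle_sack()) arm a deferred stream reset once an outgoing stream has fully drained: nothing queued, nothing on the wire, and a reset already pending for that stream. The predicate restated with simplified stand-in types:

#include <stdbool.h>

enum stream_state { STREAM_OPEN, STREAM_RESET_PENDING /* ... */ };

struct stream_out_view {
    unsigned chunks_on_queues;  /* chunks still owned by this stream */
    enum stream_state state;
    bool outqueue_empty;        /* TAILQ_EMPTY(&strm->outqueue) */
};

static bool
stream_ready_for_reset(const struct stream_out_view *s)
{
    return (s->chunks_on_queues == 0 &&
        s->state == STREAM_RESET_PENDING &&
        s->outqueue_empty);
}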
- */ - sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), - sctp_streamhead); - if ((sp) && (sp->length == 0)) { - /* Let cleanup code purge it */ - if (sp->msg_is_complete) { - asoc->stream_queue_cnt--; - } else { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - asoc->locked_on_sending = NULL; - asoc->stream_queue_cnt--; - } - } + ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && (asoc->stream_queue_cnt == 0)) { @@ -3992,6 +4220,7 @@ again: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); + return; } else { struct sctp_nets *netp; @@ -4043,7 +4272,7 @@ again: asoc->advanced_peer_ack_point = cumack; } /* PR-Sctp issues need to be addressed too */ - if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) { + if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) { struct sctp_tmit_chunk *lchk; uint32_t old_adv_peer_ack_point; @@ -4173,40 +4402,38 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, sctp_log_fr(*dupdata, 0, 0, SCTP_FR_DUPED); } } - if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { - /* reality check */ - if (!TAILQ_EMPTY(&asoc->sent_queue)) { - tp1 = TAILQ_LAST(&asoc->sent_queue, - sctpchunk_listhead); - send_s = tp1->rec.data.TSN_seq + 1; - } else { - tp1 = NULL; - send_s = asoc->sending_seq; - } - if (SCTP_TSN_GE(cum_ack, send_s)) { - struct mbuf *op_err; - char msg[SCTP_DIAG_INFO_LEN]; + /* reality check */ + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + tp1 = TAILQ_LAST(&asoc->sent_queue, + sctpchunk_listhead); + send_s = tp1->rec.data.TSN_seq + 1; + } else { + tp1 = NULL; + send_s = asoc->sending_seq; + } + if (SCTP_TSN_GE(cum_ack, send_s)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; - /* - * no way, we have not even sent this TSN out yet. - * Peer is hopelessly messed up with us. - */ - SCTP_PRINTF("NEW cum_ack:%x send_s:%x is smaller or equal\n", - cum_ack, send_s); - if (tp1) { - SCTP_PRINTF("Got send_s from tsn:%x + 1 of tp1:%p\n", - tp1->rec.data.TSN_seq, (void *)tp1); - } - hopeless_peer: - *abort_now = 1; - /* XXX */ - snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal then TSN %8.8x", - cum_ack, send_s); - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); - stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; - sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); - return; - } + /* + * no way, we have not even sent this TSN out yet. Peer is + * hopelessly messed up with us. 
+ */ + SCTP_PRINTF("NEW cum_ack:%x send_s:%x is smaller or equal\n", + cum_ack, send_s); + if (tp1) { + SCTP_PRINTF("Got send_s from tsn:%x + 1 of tp1: %p\n", + tp1->rec.data.TSN_seq, (void *)tp1); + } +hopeless_peer: + *abort_now = 1; + /* XXX */ + snprintf(msg, sizeof(msg), "Cum ack %8.8x greater or equal than TSN %8.8x", + cum_ack, send_s); + op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); + return; } /**********************/ /* 1) check the range */ @@ -4299,7 +4526,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_decrease(tp1); @@ -4416,20 +4643,18 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, num_seg, num_nr_seg, &rto_ok)) { wake_him++; } - if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { + /* + * validate the biggest_tsn_acked in the gap acks if strict + * adherence is wanted. + */ + if (SCTP_TSN_GE(biggest_tsn_acked, send_s)) { /* - * validate the biggest_tsn_acked in the gap acks if - * strict adherence is wanted. + * peer is either confused or we are under attack. + * We must abort. */ - if (SCTP_TSN_GE(biggest_tsn_acked, send_s)) { - /* - * peer is either confused or we are under - * attack. We must abort. - */ - SCTP_PRINTF("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n", - biggest_tsn_acked, send_s); - goto hopeless_peer; - } + SCTP_PRINTF("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n", + biggest_tsn_acked, send_s); + goto hopeless_peer; } } /*******************************************/ @@ -4469,6 +4694,11 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, #endif } } + if ((asoc->strmout[tp1->rec.data.stream_number].chunks_on_queues == 0) && + (asoc->strmout[tp1->rec.data.stream_number].state == SCTP_STREAM_RESET_PENDING) && + TAILQ_EMPTY(&asoc->strmout[tp1->rec.data.stream_number].outqueue)) { + asoc->trigger_reset = 1; + } TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); if (PR_SCTP_ENABLED(tp1->flags)) { if (asoc->pr_sctp_cnt != 0) @@ -4480,7 +4710,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, sctp_free_bufspace(stcb, asoc, tp1, 1); sctp_m_freem(tp1->data); tp1->data = NULL; - if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(tp1->flags)) { + if (asoc->prsctp_supported && PR_SCTP_BUF_ENABLED(tp1->flags)) { asoc->sent_queue_cnt_removeable--; } } @@ -4497,7 +4727,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, } if (TAILQ_EMPTY(&asoc->sent_queue) && (asoc->total_flight > 0)) { #ifdef INVARIANTS - panic("Warning flight size is postive and should be 0"); + panic("Warning flight size is positive and should be 0"); #else SCTP_PRINTF("Warning flight size incorrect should be 0 is %d\n", asoc->total_flight); @@ -4567,7 +4797,7 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) tp1->whoTo, + (uint32_t) (uintptr_t) tp1->whoTo, tp1->rec.data.TSN_seq); } sctp_flight_size_increase(tp1); @@ -4620,7 +4850,9 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, } if (net->dest_state & SCTP_ADDR_PF) { net->dest_state &= ~SCTP_ADDR_PF; - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, 
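With the sctp_strict_sacks sysctl gone, both sanity checks become unconditional: neither the cumulative ack nor the highest gap-acked TSN may reach past what the association has actually sent, and a violation aborts via the hopeless_peer path. In outline (serial compare as in the earlier sketch; send_s is one past the newest TSN on the sent queue, or the next sequence to send when that queue is empty):

#include <stdbool.h>
#include <stdint.h>

static bool
tsn_ge(uint32_t a, uint32_t b)
{
    return (a == b || (int32_t)(a - b) > 0);
}

static bool
sack_is_hopeless(uint32_t cum_ack, uint32_t biggest_tsn_acked,
    uint32_t send_s)
{
    /* acking a TSN at or beyond send_s is a protocol violation */
    return (tsn_ge(cum_ack, send_s) ||
        tsn_ge(biggest_tsn_acked, send_s));
}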
net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_29); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); asoc->cc_functions.sctp_cwnd_update_exit_pf(stcb, net); /* Done with this net */ @@ -4643,7 +4875,8 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, TAILQ_FOREACH(net, &asoc->nets, sctp_next) { /* stop all timers */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, - stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_30); + stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_30); net->flight_size = 0; net->partial_bytes_acked = 0; } @@ -4668,26 +4901,8 @@ sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, if ((asoc->stream_queue_cnt == 1) && ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && - (asoc->locked_on_sending) - ) { - struct sctp_stream_queue_pending *sp; - - /* - * I may be in a state where we got all across.. but - * cannot write more due to a shutdown... we abort - * since the user did not indicate EOR in this case. - */ - sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), - sctp_streamhead); - if ((sp) && (sp->length == 0)) { - asoc->locked_on_sending = NULL; - if (sp->msg_is_complete) { - asoc->stream_queue_cnt--; - } else { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - asoc->stream_queue_cnt--; - } - } + ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc))) { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && (asoc->stream_queue_cnt == 0)) { @@ -4851,7 +5066,7 @@ again: } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, - SCTP_FROM_SCTP_INDATA + SCTP_LOC_22); + SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); } } } @@ -4892,7 +5107,7 @@ again: asoc->advanced_peer_ack_point = cum_ack; } /* C2. try to further move advancedPeerAckPoint ahead */ - if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) { + if ((asoc->prsctp_supported) && (asoc->pr_sctp_cnt > 0)) { struct sctp_tmit_chunk *lchk; uint32_t old_adv_peer_ack_point; @@ -4952,134 +5167,219 @@ sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, { struct sctp_queued_to_read *ctl, *nctl; struct sctp_association *asoc; - uint16_t tt; + uint32_t tt; + int need_reasm_check = 0, old; asoc = &stcb->asoc; tt = strmin->last_sequence_delivered; + if (asoc->idata_supported) { + old = 0; + } else { + old = 1; + } /* * First deliver anything prior to and including the stream no that - * came in + * came in. 
*/ - TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) { - if (SCTP_SSN_GE(tt, ctl->sinfo_ssn)) { + TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next_instrm, nctl) { + if (SCTP_MSGID_GE(old, tt, ctl->sinfo_ssn)) { /* this is deliverable now */ - TAILQ_REMOVE(&strmin->inqueue, ctl, next); - /* subtract pending on streams */ - asoc->size_on_all_streams -= ctl->length; - sctp_ucount_decr(asoc->cnt_on_all_streams); - /* deliver it to at least the delivery-q */ - if (stcb->sctp_socket) { - sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); - sctp_add_to_readq(stcb->sctp_ep, stcb, - ctl, - &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); + if (((ctl->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + if (ctl->on_strm_q) { + if (ctl->on_strm_q == SCTP_ON_ORDERED) { + TAILQ_REMOVE(&strmin->inqueue, ctl, next_instrm); + } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) { + TAILQ_REMOVE(&strmin->uno_inqueue, ctl, next_instrm); +#ifdef INVARIANTS + } else { + panic("strmin: %p ctl: %p unknown %d", + strmin, ctl, ctl->on_strm_q); +#endif + } + ctl->on_strm_q = 0; + } + /* subtract pending on streams */ + asoc->size_on_all_streams -= ctl->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + /* deliver it to at least the delivery-q */ + if (stcb->sctp_socket) { + sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + ctl, + &stcb->sctp_socket->so_rcv, + 1, SCTP_READ_LOCK_HELD, + SCTP_SO_NOT_LOCKED); + } + } else { + /* Its a fragmented message */ + if (ctl->first_frag_seen) { + /* + * Make it so this is next to + * deliver, we restore later + */ + strmin->last_sequence_delivered = ctl->sinfo_ssn - 1; + need_reasm_check = 1; + break; + } } } else { /* no more delivery now. */ break; } } + if (need_reasm_check) { + int ret; + + ret = sctp_deliver_reasm_check(stcb, &stcb->asoc, strmin, SCTP_READ_LOCK_HELD); + if (SCTP_MSGID_GT(old, tt, strmin->last_sequence_delivered)) { + /* Restore the next to deliver unless we are ahead */ + strmin->last_sequence_delivered = tt; + } + if (ret == 0) { + /* Left the front Partial one on */ + return; + } + need_reasm_check = 0; + } /* * now we must deliver things in queue the normal way if any are * now ready. 
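The old/new split above exists because a classic DATA stream sequence number is 16 bits wide while an I-DATA message identifier is 32 bits; SCTP_MSGID_GE() therefore has to wrap its serial comparison at the matching width, selected by the old flag. A sketch of such a width-aware compare (hypothetical helper; the kernel macro is shaped differently):

#include <stdbool.h>
#include <stdint.h>

/* serial "a >= b", wrapping at 16 bits when old != 0, else 32 bits */
static bool
msgid_ge(int old, uint32_t a, uint32_t b)
{
    if (old)
        return ((int16_t)(uint16_t)(a - b) >= 0);
    return ((int32_t)(a - b) >= 0);
}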
*/ tt = strmin->last_sequence_delivered + 1; - TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next, nctl) { + TAILQ_FOREACH_SAFE(ctl, &strmin->inqueue, next_instrm, nctl) { if (tt == ctl->sinfo_ssn) { - /* this is deliverable now */ - TAILQ_REMOVE(&strmin->inqueue, ctl, next); - /* subtract pending on streams */ - asoc->size_on_all_streams -= ctl->length; - sctp_ucount_decr(asoc->cnt_on_all_streams); - /* deliver it to at least the delivery-q */ - strmin->last_sequence_delivered = ctl->sinfo_ssn; - if (stcb->sctp_socket) { - sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); - sctp_add_to_readq(stcb->sctp_ep, stcb, - ctl, - &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); + if (((ctl->sinfo_flags >> 8) & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { + /* this is deliverable now */ + if (ctl->on_strm_q) { + if (ctl->on_strm_q == SCTP_ON_ORDERED) { + TAILQ_REMOVE(&strmin->inqueue, ctl, next_instrm); + } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) { + TAILQ_REMOVE(&strmin->uno_inqueue, ctl, next_instrm); +#ifdef INVARIANTS + } else { + panic("strmin: %p ctl: %p unknown %d", + strmin, ctl, ctl->on_strm_q); +#endif + } + ctl->on_strm_q = 0; + } + /* subtract pending on streams */ + asoc->size_on_all_streams -= ctl->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + /* deliver it to at least the delivery-q */ + strmin->last_sequence_delivered = ctl->sinfo_ssn; + if (stcb->sctp_socket) { + sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + ctl, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); + } + tt = strmin->last_sequence_delivered + 1; + } else { + /* Its a fragmented message */ + if (ctl->first_frag_seen) { + /* + * Make it so this is next to + * deliver + */ + strmin->last_sequence_delivered = ctl->sinfo_ssn - 1; + need_reasm_check = 1; + break; + } } - tt = strmin->last_sequence_delivered + 1; } else { break; } } + if (need_reasm_check) { + (void)sctp_deliver_reasm_check(stcb, &stcb->asoc, strmin, SCTP_READ_LOCK_HELD); + } } + + static void sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb, struct sctp_association *asoc, - uint16_t stream, uint16_t seq) + uint16_t stream, uint32_t seq, int ordered, int old, uint32_t cumtsn) { + struct sctp_queued_to_read *control; + struct sctp_stream_in *strm; struct sctp_tmit_chunk *chk, *nchk; + int cnt_removed = 0; - /* For each one on here see if we need to toss it */ /* - * For now large messages held on the reasmqueue that are complete + * For now large messages held on the stream reasm that are complete * will be tossed too. We could in theory do more work to spin * through and stop after dumping one msg aka seeing the start of a * new msg at the head, and call the delivery function... to see if * it can be delivered... But for now we just dump everything on the * queue. */ - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - /* - * Do not toss it if on a different stream or marked for - * unordered delivery in which case the stream sequence - * number has no meaning. 
- */ - if ((chk->rec.data.stream_number != stream) || - ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == SCTP_DATA_UNORDERED)) { - continue; - } - if (chk->rec.data.stream_seq == seq) { - /* It needs to be tossed */ - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) { - asoc->tsn_last_delivered = chk->rec.data.TSN_seq; - asoc->str_of_pdapi = chk->rec.data.stream_number; - asoc->ssn_of_pdapi = chk->rec.data.stream_seq; - asoc->fragment_flags = chk->rec.data.rcv_flags; - } - asoc->size_on_reasm_queue -= chk->send_size; - sctp_ucount_decr(asoc->cnt_on_reasm_queue); - - /* Clear up any stream problem */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED && - SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) { - /* - * We must dump forward this streams - * sequence number if the chunk is not - * unordered that is being skipped. There is - * a chance that if the peer does not - * include the last fragment in its FWD-TSN - * we WILL have a problem here since you - * would have a partial chunk in queue that - * may not be deliverable. Also if a Partial - * delivery API as started the user may get - * a partial chunk. The next read returning - * a new chunk... really ugly but I see no - * way around it! Maybe a notify?? - */ - asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq; - } - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; + strm = &asoc->strmin[stream]; + control = sctp_find_reasm_entry(strm, (uint32_t) seq, ordered, old); + if (control == NULL) { + /* Not found */ + return; + } + TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { + /* Purge hanging chunks */ + if (old && (ordered == 0)) { + if (SCTP_TSN_GT(chk->rec.data.TSN_seq, cumtsn)) { + break; } - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); - } else if (SCTP_SSN_GT(chk->rec.data.stream_seq, seq)) { - /* - * If the stream_seq is > than the purging one, we - * are done - */ - break; } + cnt_removed++; + TAILQ_REMOVE(&control->reasm, chk, sctp_next); + asoc->size_on_reasm_queue -= chk->send_size; + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + } + if (!TAILQ_EMPTY(&control->reasm)) { + /* This has to be old data, unordered */ + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } + sctp_reset_a_control(control, stcb->sctp_ep, cumtsn); + chk = TAILQ_FIRST(&control->reasm); + if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + TAILQ_REMOVE(&control->reasm, chk, sctp_next); + sctp_add_chk_to_control(control, strm, stcb, asoc, + chk, SCTP_READ_LOCK_HELD); + } + sctp_deliver_reasm_check(stcb, asoc, strm, SCTP_READ_LOCK_HELD); + return; + } + if (control->on_strm_q == SCTP_ON_ORDERED) { + TAILQ_REMOVE(&strm->inqueue, control, next_instrm); + control->on_strm_q = 0; + } else if (control->on_strm_q == SCTP_ON_UNORDERED) { + TAILQ_REMOVE(&strm->uno_inqueue, control, next_instrm); + control->on_strm_q = 0; +#ifdef INVARIANTS + } else if (control->on_strm_q) { + panic("strm: %p ctl: %p unknown %d", + strm, control, control->on_strm_q); +#endif + } + control->on_strm_q = 0; + if (control->on_read_q == 0) { + sctp_free_remote_addr(control->whoFrom); + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } + sctp_free_a_readq(stcb, control); } } - void 
sctp_handle_forward_tsn(struct sctp_tcb *stcb, struct sctp_forward_tsn_chunk *fwd, @@ -5102,7 +5402,6 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, unsigned int i, fwd_sz, m_size; uint32_t str_seq; struct sctp_stream_in *strm; - struct sctp_tmit_chunk *chk, *nchk; struct sctp_queued_to_read *ctl, *sv; asoc = &stcb->asoc; @@ -5172,66 +5471,17 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, /*************************************************************/ /* 2. Clear up re-assembly queue */ /*************************************************************/ - /* - * First service it if pd-api is up, just in case we can progress it - * forward - */ - if (asoc->fragmented_delivery_inprogress) { - sctp_service_reassembly(stcb, asoc); - } - /* For each one on here see if we need to toss it */ - /* - * For now large messages held on the reasmqueue that are complete - * will be tossed too. We could in theory do more work to spin - * through and stop after dumping one msg aka seeing the start of a - * new msg at the head, and call the delivery function... to see if - * it can be delivered... But for now we just dump everything on the - * queue. - */ - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - if (SCTP_TSN_GE(new_cum_tsn, chk->rec.data.TSN_seq)) { - /* It needs to be tossed */ - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - if (SCTP_TSN_GT(chk->rec.data.TSN_seq, asoc->tsn_last_delivered)) { - asoc->tsn_last_delivered = chk->rec.data.TSN_seq; - asoc->str_of_pdapi = chk->rec.data.stream_number; - asoc->ssn_of_pdapi = chk->rec.data.stream_seq; - asoc->fragment_flags = chk->rec.data.rcv_flags; - } - asoc->size_on_reasm_queue -= chk->send_size; - sctp_ucount_decr(asoc->cnt_on_reasm_queue); - - /* Clear up any stream problem */ - if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != SCTP_DATA_UNORDERED && - SCTP_SSN_GT(chk->rec.data.stream_seq, asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered)) { - /* - * We must dump forward this streams - * sequence number if the chunk is not - * unordered that is being skipped. There is - * a chance that if the peer does not - * include the last fragment in its FWD-TSN - * we WILL have a problem here since you - * would have a partial chunk in queue that - * may not be deliverable. Also if a Partial - * delivery API as started the user may get - * a partial chunk. The next read returning - * a new chunk... really ugly but I see no - * way around it! Maybe a notify?? - */ - asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = chk->rec.data.stream_seq; - } - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; - } - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); - } else { - /* - * Ok we have gone beyond the end of the fwd-tsn's - * mark. - */ - break; + + /* This is now done as part of clearing up the stream/seq */ + if (asoc->idata_supported == 0) { + uint16_t sid; + + /* Flush all the un-ordered data based on cum-tsn */ + SCTP_INP_READ_LOCK(stcb->sctp_ep); + for (sid = 0; sid < asoc->streamincnt; sid++) { + sctp_flush_reassm_for_str_seq(stcb, asoc, sid, 0, 0, 1, new_cum_tsn); } + SCTP_INP_READ_UNLOCK(stcb->sctp_ep); } /*******************************************************/ /* 3. Update the PR-stream re-ordering queues and fix */ @@ -5241,27 +5491,53 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, if (m && fwd_sz) { /* New method. 
*/ unsigned int num_str; + uint32_t sequence; + uint16_t stream; + uint16_t ordered, flags; + int old; struct sctp_strseq *stseq, strseqbuf; + struct sctp_strseq_mid *stseq_m, strseqbuf_m; offset += sizeof(*fwd); SCTP_INP_READ_LOCK(stcb->sctp_ep); - num_str = fwd_sz / sizeof(struct sctp_strseq); + if (asoc->idata_supported) { + num_str = fwd_sz / sizeof(struct sctp_strseq_mid); + old = 0; + } else { + num_str = fwd_sz / sizeof(struct sctp_strseq); + old = 1; + } for (i = 0; i < num_str; i++) { - uint16_t st; - - stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset, - sizeof(struct sctp_strseq), - (uint8_t *) & strseqbuf); - offset += sizeof(struct sctp_strseq); - if (stseq == NULL) { - break; + if (asoc->idata_supported) { + stseq_m = (struct sctp_strseq_mid *)sctp_m_getptr(m, offset, + sizeof(struct sctp_strseq_mid), + (uint8_t *) & strseqbuf_m); + offset += sizeof(struct sctp_strseq_mid); + if (stseq_m == NULL) { + break; + } + stream = ntohs(stseq_m->stream); + sequence = ntohl(stseq_m->msg_id); + flags = ntohs(stseq_m->flags); + if (flags & PR_SCTP_UNORDERED_FLAG) { + ordered = 0; + } else { + ordered = 1; + } + } else { + stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset, + sizeof(struct sctp_strseq), + (uint8_t *) & strseqbuf); + offset += sizeof(struct sctp_strseq); + if (stseq == NULL) { + break; + } + stream = ntohs(stseq->stream); + sequence = (uint32_t) ntohs(stseq->sequence); + ordered = 1; } /* Convert */ - st = ntohs(stseq->stream); - stseq->stream = st; - st = ntohs(stseq->sequence); - stseq->sequence = st; /* now process */ @@ -5270,12 +5546,12 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, * queue where its not all delivered. If we find it * we transmute the read entry into a PDI_ABORTED. */ - if (stseq->stream >= asoc->streamincnt) { + if (stream >= asoc->streamincnt) { /* screwed up streams, stop! */ break; } - if ((asoc->str_of_pdapi == stseq->stream) && - (asoc->ssn_of_pdapi == stseq->sequence)) { + if ((asoc->str_of_pdapi == stream) && + (asoc->ssn_of_pdapi == sequence)) { /* * If this is the one we were partially * delivering now then we no longer are. 
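The two entry formats parsed above differ because I-FORWARD-TSN had to grow room for the larger identifier: a classic FORWARD-TSN entry is a 16-bit stream/SSN pair, while the I-DATA variant carries the stream, 16 bits of flags (among them the unordered bit) and a 32-bit message identifier. Simplified copies of the two layouts and the byte-order handling (stand-ins for the sctp_header.h structures):

#include <arpa/inet.h>
#include <stdint.h>

struct strseq {            /* classic FORWARD-TSN entry */
    uint16_t stream;
    uint16_t sequence;     /* 16-bit SSN */
};

struct strseq_mid {        /* I-FORWARD-TSN entry */
    uint16_t stream;
    uint16_t flags;        /* e.g. the "unordered" bit */
    uint32_t msg_id;       /* 32-bit message identifier */
};

/* Decode one new-style entry; fields arrive in network byte order. */
static void
parse_strseq_mid(const struct strseq_mid *e, uint16_t unordered_flag,
    uint16_t *stream, uint32_t *msg_id, int *ordered)
{
    *stream = ntohs(e->stream);
    *msg_id = ntohl(e->msg_id);
    *ordered = ((ntohs(e->flags) & unordered_flag) == 0);
}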
@@ -5284,14 +5560,38 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, */ asoc->fragmented_delivery_inprogress = 0; } - sctp_flush_reassm_for_str_seq(stcb, asoc, stseq->stream, stseq->sequence); + strm = &asoc->strmin[stream]; + if (asoc->idata_supported == 0) { + uint16_t strm_at; + + for (strm_at = strm->last_sequence_delivered; SCTP_MSGID_GE(1, sequence, strm_at); strm_at++) { + sctp_flush_reassm_for_str_seq(stcb, asoc, stream, strm_at, ordered, old, new_cum_tsn); + } + } else { + uint32_t strm_at; + + for (strm_at = strm->last_sequence_delivered; SCTP_MSGID_GE(0, sequence, strm_at); strm_at++) { + sctp_flush_reassm_for_str_seq(stcb, asoc, stream, strm_at, ordered, old, new_cum_tsn); + } + } TAILQ_FOREACH(ctl, &stcb->sctp_ep->read_queue, next) { - if ((ctl->sinfo_stream == stseq->stream) && - (ctl->sinfo_ssn == stseq->sequence)) { - str_seq = (stseq->stream << 16) | stseq->sequence; - ctl->end_added = 1; + if ((ctl->sinfo_stream == stream) && + (ctl->sinfo_ssn == sequence)) { + str_seq = (stream << 16) | (0x0000ffff & sequence); ctl->pdapi_aborted = 1; sv = stcb->asoc.control_pdapi; + ctl->end_added = 1; + if (ctl->on_strm_q == SCTP_ON_ORDERED) { + TAILQ_REMOVE(&strm->inqueue, ctl, next_instrm); + } else if (ctl->on_strm_q == SCTP_ON_UNORDERED) { + TAILQ_REMOVE(&strm->uno_inqueue, ctl, next_instrm); +#ifdef INVARIANTS + } else if (ctl->on_strm_q) { + panic("strm: %p ctl: %p unknown %d", + strm, ctl, ctl->on_strm_q); +#endif + } + ctl->on_strm_q = 0; stcb->asoc.control_pdapi = ctl; sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, stcb, @@ -5300,16 +5600,15 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, SCTP_SO_NOT_LOCKED); stcb->asoc.control_pdapi = sv; break; - } else if ((ctl->sinfo_stream == stseq->stream) && - SCTP_SSN_GT(ctl->sinfo_ssn, stseq->sequence)) { + } else if ((ctl->sinfo_stream == stream) && + SCTP_MSGID_GT(old, ctl->sinfo_ssn, sequence)) { /* We are past our victim SSN */ break; } } - strm = &asoc->strmin[stseq->stream]; - if (SCTP_SSN_GT(stseq->sequence, strm->last_sequence_delivered)) { + if (SCTP_MSGID_GT(old, sequence, strm->last_sequence_delivered)) { /* Update the sequence number */ - strm->last_sequence_delivered = stseq->sequence; + strm->last_sequence_delivered = sequence; } /* now kick the stream the new way */ /* sa_ignore NO_NULL_CHK */ @@ -5321,10 +5620,4 @@ sctp_handle_forward_tsn(struct sctp_tcb *stcb, * Now slide thing forward. 
*/ sctp_slide_mapping_arrays(stcb); - - if (!TAILQ_EMPTY(&asoc->reasmqueue)) { - /* now lets kick out and check for more fragmented delivery */ - /* sa_ignore NO_NULL_CHK */ - sctp_deliver_reasm_check(stcb, &stcb->asoc); - } } diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h index 79a86e2a..162ca905 100644 --- a/freebsd/sys/netinet/sctp_indata.h +++ b/freebsd/sys/netinet/sctp_indata.h @@ -43,35 +43,31 @@ sctp_build_readq_entry(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t tsn, uint32_t ppid, uint32_t context, uint16_t stream_no, - uint16_t stream_seq, uint8_t flags, + uint32_t stream_seq, uint8_t flags, struct mbuf *dm); -#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm) do { \ +#define sctp_build_readq_entry_mac(_ctl, in_it, context, net, tsn, ppid, stream_no, stream_seq, flags, dm, tfsn, msgid) do { \ if (_ctl) { \ atomic_add_int(&((net)->ref_count), 1); \ + memset(_ctl, 0, sizeof(struct sctp_queued_to_read)); \ (_ctl)->sinfo_stream = stream_no; \ (_ctl)->sinfo_ssn = stream_seq; \ + TAILQ_INIT(&_ctl->reasm); \ + (_ctl)->top_fsn = tfsn; \ + (_ctl)->msg_id = msgid; \ (_ctl)->sinfo_flags = (flags << 8); \ (_ctl)->sinfo_ppid = ppid; \ (_ctl)->sinfo_context = context; \ - (_ctl)->sinfo_timetolive = 0; \ + (_ctl)->fsn_included = 0xffffffff; \ + (_ctl)->top_fsn = 0xffffffff; \ (_ctl)->sinfo_tsn = tsn; \ (_ctl)->sinfo_cumtsn = tsn; \ (_ctl)->sinfo_assoc_id = sctp_get_associd((in_it)); \ - (_ctl)->length = 0; \ - (_ctl)->held_length = 0; \ (_ctl)->whoFrom = net; \ (_ctl)->data = dm; \ - (_ctl)->tail_mbuf = NULL; \ - (_ctl)->aux_data = NULL; \ (_ctl)->stcb = (in_it); \ (_ctl)->port_from = (in_it)->rport; \ - (_ctl)->spec_flags = 0; \ - (_ctl)->do_not_ref_stcb = 0; \ - (_ctl)->end_added = 0; \ - (_ctl)->pdapi_aborted = 0; \ - (_ctl)->some_taken = 0; \ } \ } while (0) @@ -112,12 +108,8 @@ void int sctp_process_data(struct mbuf **, int, int *, int, - struct sockaddr *src, struct sockaddr *dst, - struct sctphdr *, struct sctp_inpcb *, struct sctp_tcb *, - struct sctp_nets *, uint32_t *, - uint8_t, uint32_t, - uint32_t, uint16_t); + struct sctp_nets *, uint32_t *); void sctp_slide_mapping_arrays(struct sctp_tcb *stcb); diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c index 9e35c882..621784ea 100644 --- a/freebsd/sys/netinet/sctp_input.c +++ b/freebsd/sys/netinet/sctp_input.c @@ -49,7 +49,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if defined(INET) || defined(INET6) #include +#endif #include @@ -85,8 +87,8 @@ static void sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *cp, struct sctp_inpcb *inp, - struct sctp_tcb *stcb, int *abort_no_unlock, - uint8_t use_mflowid, uint32_t mflowid, + struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_init *init; @@ -101,7 +103,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -113,7 +115,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, /* protocol error... 
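The rewritten sctp_build_readq_entry_mac() above replaces a long list of explicit zero assignments with one memset() followed by only the fields that start out non-zero; as struct sctp_queued_to_read keeps growing (reasm queue, top_fsn, msg_id, fsn_included), that pattern cannot silently fall out of sync. The idiom on a stand-in structure:

#include <stdint.h>
#include <string.h>

struct readq_entry_view {      /* stand-in, not the kernel struct */
    uint16_t sinfo_stream;
    uint32_t sinfo_ssn;
    uint32_t fsn_included;
    uint32_t top_fsn;
    /* ...many more members, most of which start out zero... */
};

static void
init_readq_entry(struct readq_entry_view *c, uint16_t stream,
    uint32_t ssn)
{
    memset(c, 0, sizeof(*c));        /* zero everything once */
    c->sinfo_stream = stream;        /* then set the exceptions */
    c->sinfo_ssn = ssn;
    c->fsn_included = 0xffffffff;    /* "no fragment seen yet" */
    c->top_fsn = 0xffffffff;
}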
send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -123,7 +125,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, /* invalid parameter... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -133,7 +135,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -143,7 +145,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -155,7 +157,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with AUTH parameters"); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; @@ -186,7 +188,7 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "No listener"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); } goto outnow; @@ -198,9 +200,9 @@ sctp_handle_init(struct mbuf *m, int iphlen, int offset, sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } else { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n"); - sctp_send_initiate_ack(inp, stcb, m, iphlen, offset, src, dst, - sh, cp, - use_mflowid, mflowid, + sctp_send_initiate_ack(inp, stcb, net, m, iphlen, offset, + src, dst, sh, cp, + mflowtype, mflowid, vrf_id, port, ((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED)); } @@ -221,18 +223,18 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked #endif ) { - int unsent_data = 0; + int unsent_data; unsigned int i; struct sctp_stream_queue_pending *sp; struct sctp_association *asoc; /* - * This function returns the number of streams that have true unsent - * data on them. Note that as it looks through it will clean up any - * places that have old data that has been sent but left at top of - * stream queue. + * This function returns if any stream has true unsent data on it. + * Note that as it looks through it will clean up any places that + * have old data that has been sent but left at top of stream queue. 
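Initializing last_sequence_delivered to 0xffffffff (it was 0xffff when SSNs were 16 bits) leans on the same serial arithmetic: all-ones plus one wraps to zero, so the very first message identifier on a fresh stream counts as in order. A two-line demonstration:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint32_t last_delivered = 0xffffffffU;       /* fresh stream */
    uint32_t next_expected = last_delivered + 1; /* wraps to 0 */

    printf("first expected msg_id: %u\n", next_expected);
    return (0);
}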
*/ asoc = &stcb->asoc; + unsent_data = 0; SCTP_TCB_SEND_LOCK(stcb); if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { /* Check to see if some data queued */ @@ -260,6 +262,7 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked } atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next); + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, &asoc->strmout[i], sp, 1); if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; @@ -269,8 +272,13 @@ sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); + if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { + unsent_data++; + } } else { unsent_data++; + } + if (unsent_data > 0) { break; } } @@ -341,8 +349,9 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) for (i = newcnt; i < asoc->pre_open_streams; i++) { outs = &asoc->strmout[i]; TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { + atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); - asoc->stream_queue_cnt--; + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, sp, SCTP_SO_NOT_LOCKED); if (sp->data) { @@ -357,14 +366,19 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } + outs->state = SCTP_STREAM_CLOSED; } } /* cut back the count */ asoc->pre_open_streams = newcnt; } SCTP_TCB_SEND_UNLOCK(stcb); - asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams; - + asoc->streamoutcnt = asoc->pre_open_streams; + if (asoc->strmout) { + for (i = 0; i < asoc->streamoutcnt; i++) { + asoc->strmout[i].state = SCTP_STREAM_OPEN; + } + } /* EY - nr_sack: initialize highest tsn in nr_mapping_array */ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { @@ -381,17 +395,9 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) if (asoc->strmin != NULL) { /* Free the old ones */ - struct sctp_queued_to_read *ctl, *nctl; - for (i = 0; i < asoc->streamincnt; i++) { - TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[i].inqueue, next, nctl) { - TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next); - sctp_free_remote_addr(ctl->whoFrom); - ctl->whoFrom = NULL; - sctp_m_freem(ctl->data); - ctl->data = NULL; - sctp_free_a_readq(stcb, ctl); - } + sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); + sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } @@ -409,8 +415,10 @@ sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) } for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].stream_no = i; - asoc->strmin[i].last_sequence_delivered = 0xffff; + asoc->strmin[i].last_sequence_delivered = 0xffffffff; TAILQ_INIT(&asoc->strmin[i].inqueue); + TAILQ_INIT(&asoc->strmin[i].uno_inqueue); + asoc->strmin[i].pd_api_started = 0; asoc->strmin[i].delivery_started = 0; } /* @@ -434,7 +442,7 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_association *asoc; @@ -466,7 +474,7 @@ 
sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, (offset + sizeof(struct sctp_init_chunk)), initack_limit, - src, dst, NULL))) { + src, dst, NULL, stcb->asoc.port))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, @@ -474,13 +482,13 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* if the peer doesn't support asconf, flush the asconf queue */ - if (asoc->peer_supports_asconf == 0) { + if (asoc->asconf_supported == 0) { struct sctp_asconf_addr *param, *nparam; TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) { @@ -513,12 +521,11 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, * primary. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, - asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); + asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy, SCTP_RTT_FROM_NON_DATA); - retval = sctp_send_cookie_echo(m, offset, stcb, net); if (retval < 0) { /* @@ -527,29 +534,25 @@ sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, * abandon the peer, its broke. */ if (retval == -3) { + uint16_t len; + + len = (uint16_t) (sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); /* We abort with an error of missing mandatory param */ - op_err = sctp_generate_cause(SCTP_CAUSE_MISSING_PARAM, ""); - if (op_err) { - /* - * Expand beyond to include the mandatory - * param cookie - */ - struct sctp_inv_mandatory_param *mp; + op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + struct sctp_error_missing_param *cause; - SCTP_BUF_LEN(op_err) = - sizeof(struct sctp_inv_mandatory_param); - mp = mtod(op_err, - struct sctp_inv_mandatory_param *); + SCTP_BUF_LEN(op_err) = len; + cause = mtod(op_err, struct sctp_error_missing_param *); /* Subtract the reserved param */ - mp->length = - htons(sizeof(struct sctp_inv_mandatory_param) - 2); - mp->num_param = htonl(1); - mp->param = htons(SCTP_STATE_COOKIE); - mp->resv = 0; + cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); + cause->cause.length = htons(len); + cause->num_missing_params = htonl(1); + cause->type[0] = htons(SCTP_STATE_COOKIE); } sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; } @@ -562,21 +565,12 @@ static void sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { - struct sockaddr_storage store; + union sctp_sockstore store; struct sctp_nets *r_net, *f_net; struct timeval tv; int req_prim = 0; uint16_t old_error_counter; -#ifdef INET - struct sockaddr_in *sin; - -#endif -#ifdef INET6 - struct sockaddr_in6 *sin6; - -#endif - if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) { /* Invalid length */ return; @@ -586,12 +580,11 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, #ifdef INET case AF_INET: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) { - sin = (struct sockaddr_in *)&store; - sin->sin_family = cp->heartbeat.hb_info.addr_family; - 
sin->sin_len = cp->heartbeat.hb_info.addr_len; - sin->sin_port = stcb->rport; - memcpy(&sin->sin_addr, cp->heartbeat.hb_info.address, - sizeof(sin->sin_addr)); + store.sin.sin_family = cp->heartbeat.hb_info.addr_family; + store.sin.sin_len = cp->heartbeat.hb_info.addr_len; + store.sin.sin_port = stcb->rport; + memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address, + sizeof(store.sin.sin_addr)); } else { return; } @@ -600,12 +593,10 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, #ifdef INET6 case AF_INET6: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) { - sin6 = (struct sockaddr_in6 *)&store; - sin6->sin6_family = cp->heartbeat.hb_info.addr_family; - sin6->sin6_len = cp->heartbeat.hb_info.addr_len; - sin6->sin6_port = stcb->rport; - memcpy(&sin6->sin6_addr, cp->heartbeat.hb_info.address, - sizeof(sin6->sin6_addr)); + store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family; + store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len; + store.sin6.sin6_port = stcb->rport; + memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr)); } else { return; } @@ -614,7 +605,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, default: return; } - r_net = sctp_findnet(stcb, (struct sockaddr *)&store); + r_net = sctp_findnet(stcb, &store.sa); if (r_net == NULL) { SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n"); return; @@ -634,7 +625,7 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, if (f_net != r_net) { /* * first one on the list is NOT the primary - * sctp_cmpaddr() is much more efficent if + * sctp_cmpaddr() is much more efficient if * the primary is the first on the list, * make it so. */ @@ -645,7 +636,8 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, } sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, + r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } old_error_counter = r_net->error_count; @@ -666,7 +658,8 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net); } if (old_error_counter > 0) { - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, + stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } if (r_net == stcb->asoc.primary_destination) { @@ -685,7 +678,9 @@ sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED)) { - sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7); + sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, + stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, @@ -756,7 +751,7 @@ sctp_handle_nat_missing_state(struct sctp_tcb *stcb, * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ - if (stcb->asoc.peer_supports_auth == 0) { + if (stcb->asoc.auth_supported == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, 
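Switching from struct sockaddr_storage plus per-family pointer casts to union sctp_sockstore lets the heartbeat code write store.sin or store.sin6 directly and hand &store.sa to lookup routines with no casting at all, which reads better and is kinder to strict-aliasing rules. A reduced version of the union (the kernel's carries more members):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

union sockstore_view {
    struct sockaddr     sa;
    struct sockaddr_in  sin;
    struct sockaddr_in6 sin6;
};

/* Fill as IPv4, then return the generic view for findnet-style use. */
static const struct sockaddr *
store_v4(union sockstore_view *store, struct in_addr addr, in_port_t port)
{
    memset(store, 0, sizeof(*store));
    store->sin.sin_family = AF_INET;
    store->sin.sin_len = sizeof(struct sockaddr_in); /* BSD-only field */
    store->sin.sin_port = port;
    store->sin.sin_addr = addr;
    return (&store->sa);
}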
"sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n"); return (0); } @@ -786,10 +781,10 @@ sctp_handle_abort(struct sctp_abort_chunk *abort, * Need to check the cause codes for our two magic nat * aborts which don't kill the assoc necessarily. */ - struct sctp_missing_nat_state *natc; + struct sctp_gen_error_cause *cause; - natc = (struct sctp_missing_nat_state *)(abort + 1); - error = ntohs(natc->cause); + cause = (struct sctp_gen_error_cause *)(abort + 1); + error = ntohs(cause->code); if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", abort->ch.chunk_flags); @@ -807,7 +802,8 @@ sctp_handle_abort(struct sctp_abort_chunk *abort, error = 0; } /* stop any receive timers */ - sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_7); /* notify user of the abort and clean up... */ sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED); /* free the tcb */ @@ -829,7 +825,7 @@ sctp_handle_abort(struct sctp_abort_chunk *abort, #endif stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); + SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -871,6 +867,7 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, { struct sctp_association *asoc; int some_on_streamwheel; + int old_state; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; @@ -889,17 +886,37 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { /* Shutdown NOT the expected size */ return; - } else { - sctp_update_acked(stcb, cp, abort_flag); - if (*abort_flag) { - return; - } + } + old_state = SCTP_GET_STATE(asoc); + sctp_update_acked(stcb, cp, abort_flag); + if (*abort_flag) { + return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); + if (asoc->control_pdapi->on_strm_q) { + struct sctp_stream_in *strm; + + strm = &asoc->strmin[asoc->control_pdapi->sinfo_stream]; + if (asoc->control_pdapi->on_strm_q == SCTP_ON_UNORDERED) { + /* Unordered */ + TAILQ_REMOVE(&strm->uno_inqueue, asoc->control_pdapi, next_instrm); + asoc->control_pdapi->on_strm_q = 0; + } else if (asoc->control_pdapi->on_strm_q == SCTP_ON_ORDERED) { + /* Ordered */ + TAILQ_REMOVE(&strm->inqueue, asoc->control_pdapi, next_instrm); + asoc->control_pdapi->on_strm_q = 0; +#ifdef INVARIANTS + } else { + panic("Unknown state on ctrl:%p on_strm_q:%d", + asoc->control_pdapi, + asoc->control_pdapi->on_strm_q); +#endif + } + } asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; @@ -917,7 +934,9 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, return; } #endif - sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); + if (stcb->sctp_socket) { + sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); + } #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -944,7 +963,8 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, * stop the shutdown timer, since we WILL move to * SHUTDOWN-ACK-SENT. 
*/ - sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, + net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); } /* Now is there unsent data on a stream somewhere? */ some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED); @@ -962,12 +982,16 @@ sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); - sctp_stop_timers_for_shutdown(stcb); - sctp_send_shutdown_ack(stcb, net); - sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, - stcb, net); + if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown_ack(stcb, net); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, + stcb->sctp_ep, stcb, net); + } else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) { + sctp_send_shutdown_ack(stcb, net); + } } } @@ -1032,12 +1056,13 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || - !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { + sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-ACK"); } #endif /* stop the timer */ - sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); /* send SHUTDOWN-COMPLETE */ sctp_send_shutdown_complete(stcb, net, 0); /* notify upper layer protocol */ @@ -1058,7 +1083,7 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); + SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -1066,7 +1091,7 @@ sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, /* * Skip past the param header and then we will find the chunk that caused the - * problem. There are two possiblities ASCONF or FWD-TSN other than that and + * problem. There are two possibilities ASCONF or FWD-TSN other than that and * our peer must be broken. */ static void @@ -1081,8 +1106,9 @@ sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr, case SCTP_ASCONF: sctp_asconf_cleanup(stcb, net); break; + case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: - stcb->asoc.peer_supports_prsctp = 0; + stcb->asoc.prsctp_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, @@ -1096,6 +1122,7 @@ sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr, * Skip past the param header and then we will find the param that caused the * problem. There are a number of param's in a ASCONF OR the prsctp param * these will turn of specific features. + * XXX: Is this the right thing to do? 
*/ static void sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) @@ -1106,7 +1133,7 @@ sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) switch (ntohs(pbad->param_type)) { /* pr-sctp draft */ case SCTP_PRSCTP_SUPPORTED: - stcb->asoc.peer_supports_prsctp = 0; + stcb->asoc.prsctp_supported = 0; break; case SCTP_SUPPORTED_CHUNK_EXT: break; @@ -1117,14 +1144,14 @@ sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: - stcb->asoc.peer_supports_asconf = 0; + stcb->asoc.asconf_supported = 0; break; case SCTP_SUCCESS_REPORT: case SCTP_ERROR_CAUSE_IND: SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "Turning off ASCONF to this strange peer\n"); - stcb->asoc.peer_supports_asconf = 0; + stcb->asoc.asconf_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, @@ -1217,7 +1244,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch, atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); + SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -1238,7 +1265,7 @@ sctp_handle_error(struct sctp_chunkhdr *ch, * (or IPv4 for that matter) it does not matter. If * they don't support that type of address, they can * NOT possibly get that packet type... i.e. with no - * IPv6 you can't recieve a IPv6 packet. so we can + * IPv6 you can't receive a IPv6 packet. so we can * safely ignore this one. If we ever added support * for HOSTNAME Addresses, then we would need to do * something here. @@ -1295,7 +1322,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_init_ack *init_ack; @@ -1314,7 +1341,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); @@ -1326,7 +1353,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); @@ -1336,7 +1363,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); @@ -1346,7 +1373,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); @@ -1356,7 +1383,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, op_err = 
sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); @@ -1381,7 +1408,7 @@ sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, } if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb, net, abort_no_unlock, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id) < 0) { /* error in parsing parameters */ return (-1); @@ -1438,7 +1465,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port); @@ -1455,7 +1482,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; @@ -1468,6 +1495,11 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, int spec_flag = 0; uint32_t how_indx; +#if defined(SCTP_DETAILED_STR_STATS) + int j; + +#endif + net = *netp; /* I know that the TCB is non-NULL from the caller */ asoc = &stcb->asoc; @@ -1483,7 +1515,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, ""); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; @@ -1564,9 +1596,12 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, return (NULL); } /* we have already processed the INIT so no problem */ - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, - net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); - sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, + stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, + stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); /* update current state */ if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); @@ -1646,7 +1681,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src)) { + initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; return (NULL); @@ -1690,7 +1725,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, */ op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, ""); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); return (NULL); } @@ -1726,7 +1761,8 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 8; - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, 
inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); sctp_stop_all_cookie_timers(stcb); /* * since we did not send a HB make sure we don't double @@ -1772,7 +1808,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, } if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src)) { + initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; return (NULL); @@ -1862,7 +1898,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, sh, cookie, cookie_len, inp, netp, init_src, notification, auth_skipped, auth_offset, auth_len, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port)); } /* @@ -1871,8 +1907,10 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; - sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; @@ -1930,8 +1968,18 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].chunks_on_queues = 0; +#if defined(SCTP_DETAILED_STR_STATS) + for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { + asoc->strmout[i].abandoned_sent[j] = 0; + asoc->strmout[i].abandoned_unsent[j] = 0; + } +#else + asoc->strmout[i].abandoned_sent[0] = 0; + asoc->strmout[i].abandoned_unsent[0] = 0; +#endif stcb->asoc.strmout[i].stream_no = i; - stcb->asoc.strmout[i].next_sequence_send = 0; + stcb->asoc.strmout[i].next_mid_ordered = 0; + stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].last_msg_incomplete = 0; } /* process the INIT-ACK info (my info) */ @@ -1973,7 +2021,7 @@ sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), - initack_offset, src, dst, init_src)) { + initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; @@ -2009,28 +2057,19 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_tcb *stcb; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; - struct sockaddr_storage sa_store; - struct sockaddr *initack_src = (struct sockaddr *)&sa_store; + union sctp_sockstore store; struct sctp_association *asoc; int init_offset, initack_offset, initack_limit; int retval; int error = 0; uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE]; -#ifdef INET - struct sockaddr_in *sin; - -#endif -#ifdef INET6 - struct sockaddr_in6 *sin6; - -#endif #if defined(__APPLE__) || 
defined(SCTP_SO_LOCK_TESTING) struct socket *so; @@ -2093,6 +2132,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, */ stcb = sctp_aloc_assoc(inp, init_src, &error, ntohl(initack_cp->init.initiate_tag), vrf_id, + ntohs(initack_cp->init.num_outbound_streams), + port, (struct thread *)NULL ); if (stcb == NULL) { @@ -2104,7 +2145,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); return (NULL); } @@ -2132,7 +2173,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); @@ -2140,7 +2181,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, - SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); + SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -2171,7 +2212,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -2181,14 +2223,15 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, /* load all addresses */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, - src, dst, init_src)) { + src, dst, init_src, port)) { atomic_add_int(&stcb->asoc.refcnt, 1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_TCB_UNLOCK(stcb); SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -2217,7 +2260,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_21); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -2254,23 +2298,20 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, #ifdef INET case SCTP_IPV4_ADDRESS: /* source addr is IPv4 */ - sin = (struct sockaddr_in *)initack_src; - memset(sin, 0, sizeof(*sin)); - sin->sin_family = AF_INET; - sin->sin_len = sizeof(struct sockaddr_in); - sin->sin_addr.s_addr = cookie->laddress[0]; + memset(&store.sin, 0, sizeof(struct sockaddr_in)); + store.sin.sin_family = AF_INET; + store.sin.sin_len = sizeof(struct sockaddr_in); + store.sin.sin_addr.s_addr = cookie->laddress[0]; break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: /* source addr is IPv6 */ - sin6 = (struct sockaddr_in6 *)initack_src; - memset(sin6, 0, 
sizeof(*sin6)); - sin6->sin6_family = AF_INET6; - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_scope_id = cookie->scope_id; - memcpy(&sin6->sin6_addr, cookie->laddress, - sizeof(sin6->sin6_addr)); + memset(&store.sin6, 0, sizeof(struct sockaddr_in6)); + store.sin6.sin6_family = AF_INET6; + store.sin6.sin6_len = sizeof(struct sockaddr_in6); + store.sin6.sin6_scope_id = cookie->scope_id; + memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr)); break; #endif default: @@ -2280,7 +2321,8 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, SCTP_SOCKET_LOCK(so, 1); SCTP_TCB_LOCK(stcb); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -2334,9 +2376,9 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } - /* calculate the RTT */ (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); - if ((netp) && (*netp)) { + if ((netp != NULL) && (*netp != NULL)) { + /* calculate the RTT and set the encaps port */ (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, &cookie->time_entered, sctp_align_unsafe_makecopy, SCTP_RTT_FROM_NON_DATA); @@ -2351,7 +2393,7 @@ sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, sctp_check_address_list(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)), - initack_src, cookie->local_scope, cookie->site_scope, + &store.sa, cookie->local_scope, cookie->site_scope, cookie->ipv4_scope, cookie->loopback_scope); @@ -2382,7 +2424,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, struct sctp_tcb **locked_tcb, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_state_cookie *cookie; @@ -2422,8 +2464,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, cookie_offset = offset + sizeof(struct sctp_chunkhdr); cookie_len = ntohs(cp->ch.chunk_length); - if ((cookie->peerport != sh->src_port) && - (cookie->myport != sh->dest_port) && + if ((cookie->peerport != sh->src_port) || + (cookie->myport != sh->dest_port) || (cookie->my_vtag != sh->v_tag)) { /* * invalid ports or bad tag. Note that we always leave the @@ -2445,20 +2487,14 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, * calculated in the sctp_hmac_m() call). */ sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE; - m_sig = m_split(m, sig_offset, M_DONTWAIT); + m_sig = m_split(m, sig_offset, M_NOWAIT); if (m_sig == NULL) { /* out of memory or ?? */ return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = m_sig; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_SPLIT); - } - } + sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT); } #endif @@ -2547,29 +2583,29 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, if (timevalcmp(&now, &time_expires, >)) { /* cookie is stale! 
*/ struct mbuf *op_err; - struct sctp_stale_cookie_msg *scm; + struct sctp_error_stale_cookie *cause; uint32_t tim; - op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_stale_cookie_msg), - 0, M_DONTWAIT, 1, MT_DATA); + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie), + 0, M_NOWAIT, 1, MT_DATA); if (op_err == NULL) { /* FOOBAR */ return (NULL); } /* Set the len */ - SCTP_BUF_LEN(op_err) = sizeof(struct sctp_stale_cookie_msg); - scm = mtod(op_err, struct sctp_stale_cookie_msg *); - scm->ph.param_type = htons(SCTP_CAUSE_STALE_COOKIE); - scm->ph.param_length = htons((sizeof(struct sctp_paramhdr) + + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie); + cause = mtod(op_err, struct sctp_error_stale_cookie *); + cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE); + cause->cause.length = htons((sizeof(struct sctp_paramhdr) + (sizeof(uint32_t)))); /* seconds to usec */ tim = (now.tv_sec - time_expires.tv_sec) * 1000000; /* add in usec */ if (tim == 0) tim = now.tv_usec - cookie->time_entered.tv_usec; - scm->time_usec = htonl(tim); + cause->stale_time = htonl(tim); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, l_inp->fibnum, vrf_id, port); return (NULL); } @@ -2610,7 +2646,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, /* This should not happen */ return (NULL); } - if ((*stcb == NULL) && to) { + if (*stcb == NULL) { /* Yep, lets check */ *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL); if (*stcb == NULL) { @@ -2649,9 +2685,6 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, } } } - if (to == NULL) { - return (NULL); - } cookie_len -= SCTP_SIGNATURE_SIZE; if (*stcb == NULL) { /* this is the "normal" case... get a new TCB */ @@ -2659,7 +2692,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, cookie, cookie_len, *inp_p, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); } else { /* this is abnormal... cookie-echo on existing TCB */ @@ -2668,7 +2701,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, src, dst, sh, cookie, cookie_len, *inp_p, *stcb, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); } @@ -2676,11 +2709,9 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, /* still no TCB... must be bad cookie-echo */ return (NULL); } - if ((*netp != NULL) && (use_mflowid != 0)) { + if (*netp != NULL) { + (*netp)->flowtype = mflowtype; (*netp)->flowid = mflowid; -#ifdef INVARIANTS - (*netp)->flowidset = 1; -#endif } /* * Ok, we built an association so confirm the address we sent the @@ -2692,7 +2723,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, */ if (netl == NULL) { /* TSNH! Huh, why do I need to add this address here? 
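One of the quieter fixes above turns the cookie sanity check from && into ||: a COOKIE-ECHO must be rejected when any of the embedded ports or the verification tag disagrees with the packet, not only when all three do. The corrected predicate, in isolation (names are illustrative):

#include <stdbool.h>
#include <stdint.h>

/* Accept a cookie only if both ports and the verification tag match. */
static bool
cookie_matches_packet(uint16_t cookie_peerport, uint16_t cookie_myport,
    uint32_t cookie_vtag, uint16_t sh_src_port, uint16_t sh_dest_port,
    uint32_t sh_v_tag)
{
	return (cookie_peerport == sh_src_port &&
	    cookie_myport == sh_dest_port &&
	    cookie_vtag == sh_v_tag);
}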
*/ - if (sctp_add_remote_addr(*stcb, to, NULL, SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) { + if (sctp_add_remote_addr(*stcb, to, NULL, port, + SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) { return (NULL); } netl = sctp_findnet(*stcb, to); @@ -2751,7 +2783,7 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(*inp_p, NULL, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) pcb_so = SCTP_INP_SO(*inp_p); @@ -2761,7 +2793,8 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); + (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(pcb_so, 1); #endif @@ -2784,11 +2817,19 @@ sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features; inp->sctp_socket = so; inp->sctp_frag_point = (*inp_p)->sctp_frag_point; + inp->max_cwnd = (*inp_p)->max_cwnd; inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off; - inp->sctp_ecn_enable = (*inp_p)->sctp_ecn_enable; + inp->ecn_supported = (*inp_p)->ecn_supported; + inp->prsctp_supported = (*inp_p)->prsctp_supported; + inp->auth_supported = (*inp_p)->auth_supported; + inp->asconf_supported = (*inp_p)->asconf_supported; + inp->reconfig_supported = (*inp_p)->reconfig_supported; + inp->nrsack_supported = (*inp_p)->nrsack_supported; + inp->pktdrop_supported = (*inp_p)->pktdrop_supported; inp->partial_delivery_point = (*inp_p)->partial_delivery_point; inp->sctp_context = (*inp_p)->sctp_context; inp->local_strreset_support = (*inp_p)->local_strreset_support; + inp->fibnum = (*inp_p)->fibnum; inp->inp_starting_point_for_iterator = NULL; /* * copy in the authentication parameters from the @@ -2885,9 +2926,9 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie_ack: handling COOKIE-ACK\n"); - if (stcb == NULL) + if ((stcb == NULL) || (net == NULL)) { return; - + } asoc = &stcb->asoc; sctp_stop_all_cookie_timers(stcb); @@ -2962,7 +3003,7 @@ sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, * in flight) */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && - (stcb->asoc.peer_supports_asconf) && + (stcb->asoc.asconf_supported == 1) && (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, @@ -3123,7 +3164,6 @@ sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sct uint32_t cwr_tsn; cwr_tsn = ntohl(cp->tsn); - override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE; TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) { @@ -3139,10 +3179,8 @@ sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sct stcb->asoc.ecn_echo_cnt_onq--; TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next); - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; - } + sctp_m_freem(chk->data); + chk->data = NULL; stcb->asoc.ctrl_queue_cnt--; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); if (override == 0) { @@ -3184,12 +3222,13 @@ sctp_handle_shutdown_complete(struct 
sctp_shutdown_complete_chunk *cp SCTP_UNUSE #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || - !stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { + sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); } #endif /* stop the timer */ - sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB */ SCTPDBG(SCTP_DEBUG_INPUT2, @@ -3202,7 +3241,8 @@ sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSE SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); + (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -3310,7 +3350,8 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, /* restart the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, - stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); + stcb, tp1->whoTo, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo); @@ -3319,7 +3360,7 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP, tp1->whoTo->flight_size, tp1->book_size, - (uintptr_t) stcb, + (uint32_t) (uintptr_t) stcb, tp1->rec.data.TSN_seq); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { @@ -3378,7 +3419,8 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, * this, otherwise we let the timer fire. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, - stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); + stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; @@ -3429,6 +3471,7 @@ process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, /* resend last asconf ack */ sctp_send_asconf_ack(stcb); break; + case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: send_forward_tsn(stcb, &stcb->asoc); break; @@ -3454,8 +3497,8 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * uint16_t temp; /* - * We set things to 0xffff since this is the last delivered sequence - * and we will be sending in 0 after the reset. + * We set things to 0xffffffff since this is the last delivered + * sequence and we will be sending in 0 after the reset. 
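The widening from 0xffff to 0xffffffff here, and the split of next_sequence_send into next_mid_ordered and next_mid_unordered in the surrounding hunks, reflect the move to 32-bit message identifiers for I-DATA support. A stand-in sketch of the reset bookkeeping, assuming the all-ones sentinel means "nothing delivered yet":

#include <stdint.h>

/* Stand-ins for the widened stream bookkeeping. */
struct stream_in {
	uint32_t last_mid_delivered;	/* 0xffffffff == nothing yet */
};

struct stream_out {
	uint32_t next_mid_ordered;
	uint32_t next_mid_unordered;
};

static void
reset_stream_pair(struct stream_in *in, struct stream_out *out)
{
	/* After a reset, delivery restarts as if nothing was received... */
	in->last_mid_delivered = 0xffffffffU;
	/* ...and both outgoing MID spaces restart at zero. */
	out->next_mid_ordered = 0;
	out->next_mid_unordered = 0;
}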
*/ if (number_entries) { @@ -3464,12 +3507,12 @@ sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * if (temp >= stcb->asoc.streamincnt) { continue; } - stcb->asoc.strmin[temp].last_sequence_delivered = 0xffff; + stcb->asoc.strmin[temp].last_sequence_delivered = 0xffffffff; } } else { list = NULL; for (i = 0; i < stcb->asoc.streamincnt; i++) { - stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; + stcb->asoc.strmin[i].last_sequence_delivered = 0xffffffff; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); @@ -3488,23 +3531,47 @@ sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t /* no such stream */ continue; } - stcb->asoc.strmout[temp].next_sequence_send = 0; + stcb->asoc.strmout[temp].next_mid_ordered = 0; + stcb->asoc.strmout[temp].next_mid_unordered = 0; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { - stcb->asoc.strmout[i].next_sequence_send = 0; + stcb->asoc.strmout[i].next_mid_ordered = 0; + stcb->asoc.strmout[i].next_mid_unordered = 0; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } +static void +sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t * list) +{ + uint32_t i; + uint16_t temp; -struct sctp_stream_reset_out_request * + if (number_entries > 0) { + for (i = 0; i < number_entries; i++) { + temp = ntohs(list[i]); + if (temp >= stcb->asoc.streamoutcnt) { + /* no such stream */ + continue; + } + stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN; + } + } else { + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN; + } + } +} + + +struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk) { struct sctp_association *asoc; struct sctp_chunkhdr *ch; - struct sctp_stream_reset_out_request *r; + struct sctp_stream_reset_request *r; struct sctp_tmit_chunk *chk; int len, clen; @@ -3527,7 +3594,7 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu } clen = chk->send_size; ch = mtod(chk->data, struct sctp_chunkhdr *); - r = (struct sctp_stream_reset_out_request *)(ch + 1); + r = (struct sctp_stream_reset_request *)(ch + 1); if (ntohl(r->request_seq) == seq) { /* found it */ return (r); @@ -3535,7 +3602,7 @@ sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chu len = SCTP_SIZE32(ntohs(r->ph.param_length)); if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) { /* move to the next one, there can only be a max of two */ - r = (struct sctp_stream_reset_out_request *)((caddr_t)r + len); + r = (struct sctp_stream_reset_request *)((caddr_t)r + len); if (ntohl(r->request_seq) == seq) { return (r); } @@ -3555,7 +3622,8 @@ sctp_clean_up_stream_reset(struct sctp_tcb *stcb) } asoc = &stcb->asoc; - sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); + sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, + chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28); TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); @@ -3579,7 +3647,9 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, int lparm_len; struct sctp_association *asoc = &stcb->asoc; struct sctp_tmit_chunk *chk; - struct sctp_stream_reset_out_request *srparam; + struct sctp_stream_reset_request *req_param; + struct sctp_stream_reset_out_request *req_out_param; + struct 
sctp_stream_reset_in_request *req_in_param; uint32_t number_entries; if (asoc->stream_reset_outstanding == 0) { @@ -3587,35 +3657,50 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, return (0); } if (seq == stcb->asoc.str_reset_seq_out) { - srparam = sctp_find_stream_reset(stcb, seq, &chk); - if (srparam) { + req_param = sctp_find_stream_reset(stcb, seq, &chk); + if (req_param != NULL) { stcb->asoc.str_reset_seq_out++; - type = ntohs(srparam->ph.param_type); - lparm_len = ntohs(srparam->ph.param_length); + type = ntohs(req_param->ph.param_type); + lparm_len = ntohs(req_param->ph.param_length); if (type == SCTP_STR_RESET_OUT_REQUEST) { + int no_clear = 0; + + req_out_param = (struct sctp_stream_reset_out_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t); asoc->stream_reset_out_is_outstanding = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* do it */ - sctp_reset_out_streams(stcb, number_entries, srparam->list_of_streams); + sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { - sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); + } else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) { + /* + * Set it up so we don't stop + * retransmitting + */ + asoc->stream_reset_outstanding++; + stcb->asoc.str_reset_seq_out--; + asoc->stream_reset_out_is_outstanding = 1; + no_clear = 1; } else { - sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); + } + if (no_clear == 0) { + sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams); } } else if (type == SCTP_STR_RESET_IN_REQUEST) { - /* Answered my request */ + req_in_param = (struct sctp_stream_reset_in_request *)req_param; number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t); if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb, - number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, - number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } } else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) { /* Ok we now may have more streams */ @@ -3631,7 +3716,12 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* Put the new streams into effect */ - stcb->asoc.streamoutcnt += num_stream; + int i; + + for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) { + asoc->strmout[i].state = SCTP_STREAM_OPEN; + } + asoc->streamoutcnt += num_stream; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { 
sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, @@ -3708,6 +3798,9 @@ sctp_handle_stream_reset_response(struct sctp_tcb *stcb, } } } + if (asoc->stream_reset_outstanding == 0) { + sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); + } return (0); } @@ -3738,22 +3831,33 @@ sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t)); - for (i = 0; i < number_entries; i++) { - temp = ntohs(req->list_of_streams[i]); - req->list_of_streams[i] = temp; + if (number_entries) { + for (i = 0; i < number_entries; i++) { + temp = ntohs(req->list_of_streams[i]); + if (temp >= stcb->asoc.streamoutcnt) { + asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; + goto bad_boy; + } + req->list_of_streams[i] = temp; + } + for (i = 0; i < number_entries; i++) { + if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) { + stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING; + } + } + } else { + /* It's all */ + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) + stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; + } } asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; - sctp_add_stream_reset_out(chk, number_entries, req->list_of_streams, - asoc->str_reset_seq_out, - seq, (asoc->sending_seq - 1)); - asoc->stream_reset_out_is_outstanding = 1; - asoc->str_reset = chk; - sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); - stcb->asoc.stream_reset_outstanding++; } else { /* Can't do it, since we have sent one out */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } +bad_boy: sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { @@ -3763,6 +3867,7 @@ sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } + sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } static int @@ -3881,11 +3986,12 @@ sctp_handle_str_reset_request_out(struct sctp_tcb *stcb, sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); return; } + liste->seq = seq; liste->tsn = tsn; liste->number_entries = number_entries; memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t)); TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp); - asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; + asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS; } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; @@ -3949,20 +4055,28 @@ sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *ch /* copy off the old data */ for (i = 0; i < stcb->asoc.streamincnt; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); + TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].stream_no = i; stcb->asoc.strmin[i].last_sequence_delivered = oldstrm[i].last_sequence_delivered; stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started; + stcb->asoc.strmin[i].pd_api_started = oldstrm[i].pd_api_started; /* now anything on those queues?
*/ - TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next, nctl) { - TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next); - TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next); + TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next_instrm, nctl) { + TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next_instrm); + TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next_instrm); + } + TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].uno_inqueue, next_instrm, nctl) { + TAILQ_REMOVE(&oldstrm[i].uno_inqueue, ctl, next_instrm); + TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].uno_inqueue, ctl, next_instrm); } } /* Init the new streams */ for (i = stcb->asoc.streamincnt; i < num_stream; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); + TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].stream_no = i; - stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; + stcb->asoc.strmin[i].last_sequence_delivered = 0xffffffff; + stcb->asoc.strmin[i].pd_api_started = 0; stcb->asoc.strmin[i].delivery_started = 0; } SCTP_FREE(oldstrm, SCTP_M_STRMI); @@ -4022,7 +4136,7 @@ sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk mychk += num_stream; if (mychk < 0x10000) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; - if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 0, 1, num_stream, 0, 1)) { + if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } else { @@ -4075,13 +4189,15 @@ __attribute__((noinline)) if (chk == NULL) { return (ret_code); } + chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->asoc = &stcb->asoc; chk->no_fr_allowed = 0; chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr); chk->book_size_scale = 0; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { strres_nochunk: if (chk->data) { @@ -4366,7 +4482,7 @@ sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, (stcb->asoc.sat_t3_loss_recovery == 0) && (stcb->asoc.sat_network)) { /* - * This is debateable but for sat networks it makes sense + * This is debatable but for sat networks it makes sense * Note if a T3 timer has went off, we will prohibit any * changes to cwnd until we exit the t3 loss recovery. */ @@ -4392,7 +4508,7 @@ __attribute__((noinline)) struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; @@ -4461,7 +4577,7 @@ __attribute__((noinline)) */ if ((ch->chunk_type == SCTP_AUTHENTICATION) && (stcb == NULL) && - !SCTP_BASE_SYSCTL(sctp_auth_disable)) { + (inp->auth_supported == 1)) { /* save this chunk for later processing */ auth_skipped = 1; auth_offset = *offset; @@ -4551,12 +4667,12 @@ __attribute__((noinline)) } } if (stcb == NULL) { - snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); /* no association, so it's out of the blue... 
*/ sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); *offset = length; if (locked_tcb) { @@ -4595,12 +4711,12 @@ __attribute__((noinline)) if (locked_tcb) { SCTP_TCB_UNLOCK(locked_tcb); } - snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); return (NULL); } @@ -4728,7 +4844,7 @@ process_control_chunks: /* check to see if this chunk required auth, but isn't */ if ((stcb != NULL) && - !SCTP_BASE_SYSCTL(sctp_auth_disable) && + (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ @@ -4741,13 +4857,11 @@ process_control_chunks: /* The INIT chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { - op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), - "INIT not the only chunk"); - sctp_abort_association(inp, stcb, m, iphlen, - src, dst, sh, op_err, - use_mflowid, mflowid, - vrf_id, port); + /* RFC 4960 requires that no ABORT is sent */ *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } return (NULL); } /* Honor our resource limit. */ @@ -4755,15 +4869,15 @@ process_control_chunks: op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); *offset = length; return (NULL); } sctp_handle_init(m, iphlen, *offset, src, dst, sh, (struct sctp_init_chunk *)ch, inp, - stcb, &abort_no_unlock, - use_mflowid, mflowid, + stcb, *netp, &abort_no_unlock, + mflowtype, mflowid, vrf_id, port); *offset = length; if ((!abort_no_unlock) && (locked_tcb)) { @@ -4780,7 +4894,7 @@ process_control_chunks: if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else { - if (locked_tcb != stcb) { + if ((locked_tcb != NULL) && (locked_tcb != stcb)) { /* Very unlikely */ SCTP_TCB_UNLOCK(locked_tcb); } @@ -4794,7 +4908,8 @@ process_control_chunks: SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -4817,7 +4932,7 @@ process_control_chunks: (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id); } else { ret = -1; @@ -4936,8 +5051,7 @@ process_control_chunks: SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n"); break; } - if ((stcb->asoc.sctp_nr_sack_on_off == 0) || - (stcb->asoc.peer_supports_nr_sack == 0)) { + if (stcb->asoc.nrsack_supported == 0) { goto unknown_chunk; } if (chk_length < sizeof(struct sctp_nr_sack_chunk)) { @@ -5123,7 +5237,7 @@ process_control_chunks: op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, vrf_id, port); } *offset = length; @@ -5158,7 +5272,7 @@ process_control_chunks: 
auth_offset, auth_len, &locked_tcb, - use_mflowid, + mflowtype, mflowid, vrf_id, port); @@ -5215,7 +5329,8 @@ process_control_chunks: SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_30); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -5248,6 +5363,9 @@ process_control_chunks: return (NULL); } if (stcb) { + if (stcb->asoc.ecn_supported == 0) { + goto unknown_chunk; + } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, @@ -5273,6 +5391,9 @@ process_control_chunks: return (NULL); } if (stcb) { + if (stcb->asoc.ecn_supported == 0) { + goto unknown_chunk; + } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, @@ -5306,6 +5427,9 @@ process_control_chunks: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); /* He's alive so give him credit */ if (stcb) { + if (stcb->asoc.asconf_supported == 0) { + goto unknown_chunk; + } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, @@ -5330,6 +5454,9 @@ process_control_chunks: return (NULL); } if ((stcb) && netp && *netp) { + if (stcb->asoc.asconf_supported == 0) { + goto unknown_chunk; + } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, @@ -5346,6 +5473,7 @@ process_control_chunks: } break; case SCTP_FORWARD_CUM_TSN: + case SCTP_IFORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n"); if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { /* Its not ours */ @@ -5359,6 +5487,9 @@ process_control_chunks: if (stcb) { int abort_flag = 0; + if (stcb->asoc.prsctp_supported == 0) { + goto unknown_chunk; + } stcb->asoc.overall_error_count = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, @@ -5378,7 +5509,8 @@ process_control_chunks: SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -5413,13 +5545,8 @@ process_control_chunks: *offset = length; return (NULL); } - if (stcb->asoc.peer_supports_strreset == 0) { - /* - * hmm, peer should have announced this, but - * we will turn it on since he is sending us - * a stream reset. 
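A pattern repeated across the chunk dispatcher above and below: instead of trusting sysctls or turning features on implicitly, each extension chunk is now gated on the *_supported flag negotiated for the association, and falls through to the unknown-chunk path otherwise. A reduced sketch with illustrative chunk-type values (IANA: ECNE = 12, ASCONF = 193):

#include <stdbool.h>
#include <stdint.h>

#define CHUNK_ECN_ECHO	0x0c
#define CHUNK_ASCONF	0xc1

struct assoc_caps {
	bool ecn_supported;
	bool asconf_supported;
};

/* Returns false when the chunk should take the unknown-chunk path. */
static bool
chunk_is_negotiated(const struct assoc_caps *caps, uint8_t chunk_type)
{
	switch (chunk_type) {
	case CHUNK_ECN_ECHO:
		return (caps->ecn_supported);
	case CHUNK_ASCONF:
		return (caps->asconf_supported);
	default:
		return (true);	/* not gated by negotiation */
	}
}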
- */ - stcb->asoc.peer_supports_strreset = 1; + if (stcb->asoc.reconfig_supported == 0) { + goto unknown_chunk; } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ @@ -5439,18 +5566,17 @@ process_control_chunks: return (NULL); } if (ch && (stcb) && netp && (*netp)) { + if (stcb->asoc.pktdrop_supported == 0) { + goto unknown_chunk; + } sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, stcb, *netp, min(chk_length, (sizeof(chunk_buf) - 4))); } break; - case SCTP_AUTHENTICATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n"); - if (SCTP_BASE_SYSCTL(sctp_auth_disable)) - goto unknown_chunk; - if (stcb == NULL) { /* save the first AUTH for later processing */ if (auth_skipped == 0) { @@ -5461,6 +5587,9 @@ process_control_chunks: /* skip this chunk (temporarily) */ goto next_chunk; } + if (stcb->asoc.auth_supported == 0) { + goto unknown_chunk; + } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + SCTP_AUTH_DIGEST_LEN_MAX))) { @@ -5491,43 +5620,27 @@ process_control_chunks: unknown_chunk: /* it's an unknown chunk! */ if ((ch->chunk_type & 0x40) && (stcb != NULL)) { - struct mbuf *mm; - struct sctp_paramhdr *phd; - - mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), - 0, M_DONTWAIT, 1, MT_DATA); - if (mm) { - phd = mtod(mm, struct sctp_paramhdr *); - /* - * We cheat and use param type since - * we did not bother to define a - * error cause struct. They are the - * same basic format with different - * names. - */ - phd->param_type = htons(SCTP_CAUSE_UNRECOG_CHUNK); - phd->param_length = htons(chk_length + sizeof(*phd)); - SCTP_BUF_LEN(mm) = sizeof(*phd); - SCTP_BUF_NEXT(mm) = SCTP_M_COPYM(m, *offset, chk_length, M_DONTWAIT); - if (SCTP_BUF_NEXT(mm)) { - if (sctp_pad_lastmbuf(SCTP_BUF_NEXT(mm), SCTP_SIZE32(chk_length) - chk_length, NULL)) { - sctp_m_freem(mm); - } else { + struct sctp_gen_error_cause *cause; + int len; + + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), + 0, M_NOWAIT, 1, MT_DATA); + if (op_err != NULL) { + len = min(SCTP_SIZE32(chk_length), (uint32_t) (length - *offset)); + cause = mtod(op_err, struct sctp_gen_error_cause *); + cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); + cause->length = htons((uint16_t) (len + sizeof(struct sctp_gen_error_cause))); + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); + SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT); + if (SCTP_BUF_NEXT(op_err) != NULL) { #ifdef SCTP_MBUF_LOGGING - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = SCTP_BUF_NEXT(mm); mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } - } -#endif - sctp_queue_op_err(stcb, mm); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + sctp_log_mbc(SCTP_BUF_NEXT(op_err), SCTP_MBUF_ICOPY); } +#endif + sctp_queue_op_err(stcb, op_err); } else { - sctp_m_freem(mm); + sctp_m_freem(op_err); } } } @@ -5565,30 +5678,6 @@ next_chunk: } -#ifdef INVARIANTS -#ifdef __GNUC__ -__attribute__((noinline)) -#endif - void - sctp_validate_no_locks(struct sctp_inpcb *inp) -{ - struct sctp_tcb *lstcb; - - LIST_FOREACH(lstcb, &inp->sctp_asoc_list, sctp_tcblist) { - if (mtx_owned(&lstcb->tcb_mtx)) { - panic("Own lock on stcb at return from input"); - } - } - if (mtx_owned(&inp->inp_create_mtx)) { - panic("Own create lock on inp"); - } - if (mtx_owned(&inp->inp_mtx)) { - panic("Own inp lock on inp"); - } -} - -#endif - /* * common 
input chunk processing (v4 and v6) */ @@ -5600,7 +5689,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt uint8_t compute_crc, #endif uint8_t ecn_bits, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { uint32_t high_tsn; @@ -5631,17 +5720,26 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt calc_check, check, (void *)m, length, iphlen); stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); - if ((net != NULL) && (port != 0)) { +#if defined(INET) || defined(INET6) + if ((ch->chunk_type != SCTP_INITIATION) && + (net != NULL) && (net->port != port)) { if (net->port == 0) { - sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); + /* UDP encapsulation turned on. */ + net->mtu -= sizeof(struct udphdr); + if (stcb->asoc.smallest_mtu > net->mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } + } else if (port == 0) { + /* UDP encapsulation turned off. */ + net->mtu += sizeof(struct udphdr); + /* XXX Update smallest_mtu */ } net->port = port; } - if ((net != NULL) && (use_mflowid != 0)) { - net->flowid = mflowid; -#ifdef INVARIANTS - net->flowidset = 1; #endif + if (net != NULL) { + net->flowtype = mflowtype; + net->flowid = mflowid; } if ((inp != NULL) && (stcb != NULL)) { sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1); @@ -5662,17 +5760,26 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); - if ((net != NULL) && (port != 0)) { +#if defined(INET) || defined(INET6) + if ((ch->chunk_type != SCTP_INITIATION) && + (net != NULL) && (net->port != port)) { if (net->port == 0) { - sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); + /* UDP encapsulation turned on. */ + net->mtu -= sizeof(struct udphdr); + if (stcb->asoc.smallest_mtu > net->mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } + } else if (port == 0) { + /* UDP encapsulation turned off. 
*/ + net->mtu += sizeof(struct udphdr); + /* XXX Update smallest_mtu */ } net->port = port; } - if ((net != NULL) && (use_mflowid != 0)) { - net->flowid = mflowid; -#ifdef INVARIANTS - net->flowidset = 1; #endif + if (net != NULL) { + net->flowtype = mflowtype; + net->flowid = mflowid; } if (inp == NULL) { SCTP_STAT_INCR(sctps_noport); @@ -5681,7 +5788,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { sctp_send_shutdown_complete2(src, dst, sh, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); goto out; } @@ -5696,7 +5803,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt "Out of the blue"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); } } @@ -5714,7 +5821,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET case AF_INET: if (ipsec4_in_reject(m, &inp->ip_inp.inp)) { - IPSECSTAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } @@ -5723,7 +5829,6 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt #ifdef INET6 case AF_INET6: if (ipsec6_in_reject(m, &inp->ip_inp.inp)) { - IPSEC6STAT_INC(in_polvio); SCTP_STAT_INCR(sctps_hdrops); goto out; } @@ -5753,11 +5858,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt */ SCTP_TCB_UNLOCK(stcb); stcb = NULL; - snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; } @@ -5768,7 +5873,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt stcb = sctp_process_control(m, iphlen, &offset, length, src, dst, sh, ch, inp, stcb, &net, &fwd_tsn_seen, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); if (stcb) { /* @@ -5776,12 +5881,23 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt * it changes our INP. */ inp = stcb->sctp_ep; - if ((net) && (port)) { +#if defined(INET) || defined(INET6) + if ((ch->chunk_type != SCTP_INITIATION) && + (net != NULL) && (net->port != port)) { if (net->port == 0) { - sctp_pathmtu_adjustment(stcb, net->mtu - sizeof(struct udphdr)); + /* UDP encapsulation turned on. */ + net->mtu -= sizeof(struct udphdr); + if (stcb->asoc.smallest_mtu > net->mtu) { + sctp_pathmtu_adjustment(stcb, net->mtu); + } + } else if (port == 0) { + /* UDP encapsulation turned off. 
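The hunk above (and its twin in the checksum-error path) keeps the path MTU consistent when a peer toggles UDP encapsulation: turning it on costs one UDP header of MTU, turning it off gives it back. The same logic in isolation, assuming the usual 8-byte UDP header and stand-in names:

#include <stdint.h>

#define UDP_HDR_SIZE	8	/* sizeof(struct udphdr) */

struct net_path {
	uint32_t mtu;
	uint16_t port;	/* remote UDP encapsulation port, 0 == off */
};

static void
update_encaps_port(struct net_path *net, uint16_t port)
{
	if (net->port == 0 && port != 0)
		net->mtu -= UDP_HDR_SIZE;	/* encapsulation turned on */
	else if (net->port != 0 && port == 0)
		net->mtu += UDP_HDR_SIZE;	/* encapsulation turned off */
	net->port = port;
}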
*/ + net->mtu += sizeof(struct udphdr); + /* XXX Update smallest_mtu */ } net->port = port; } +#endif } } else { /* @@ -5795,7 +5911,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt * chunks */ if ((stcb != NULL) && - !SCTP_BASE_SYSCTL(sctp_auth_disable) && + (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); @@ -5803,11 +5919,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } if (stcb == NULL) { /* out of the blue DATA chunk */ - snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); goto out; } @@ -5837,7 +5953,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt */ if ((length > offset) && (stcb != NULL) && - !SCTP_BASE_SYSCTL(sctp_auth_disable) && + (stcb->asoc.auth_supported == 1) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ @@ -5875,11 +5991,11 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt /* * We consider OOTB any data sent during asoc setup. */ - snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; /* sa_ignore NOTREACHED */ @@ -5898,10 +6014,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } /* plow through the data chunks while length > offset */ retval = sctp_process_data(mm, iphlen, &offset, length, - src, dst, sh, - inp, stcb, net, &high_tsn, - use_mflowid, mflowid, - vrf_id, port); + inp, stcb, net, &high_tsn); if (retval == 2) { /* * The association aborted, NO UNLOCK needed since @@ -5918,7 +6031,7 @@ sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int lengt } /* take care of ecn */ if ((data_processed == 1) && - (stcb->asoc.ecn_allowed == 1) && + (stcb->asoc.ecn_supported == 1) && ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) { /* Yep, we need to add a ECNE */ sctp_send_ecn_echo(stcb, net, high_tsn); @@ -5953,7 +6066,9 @@ trigger_send: if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq; } - if (cnt_ctrl_ready || + if (!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue) || + cnt_ctrl_ready || + stcb->asoc.trigger_reset || ((un_sent) && (stcb->asoc.peers_rwnd > 0 || (stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) { @@ -5975,27 +6090,9 @@ out: SCTP_INP_DECR_REF(inp_decr); SCTP_INP_WUNLOCK(inp_decr); } -#ifdef INVARIANTS - if (inp != NULL) { - sctp_validate_no_locks(inp); - } -#endif return; } -#if 0 -static void -sctp_print_mbuf_chain(struct mbuf *m) -{ - for (; m; m = SCTP_BUF_NEXT(m)) { - SCTP_PRINTF("%p: m_len = %ld\n", (void *)m, SCTP_BUF_LEN(m)); - if (SCTP_BUF_IS_EXTENDED(m)) - SCTP_PRINTF("%p: extend_size = %d\n", (void *)m, SCTP_BUF_EXTEND_SIZE(m)); 
- } -} - -#endif - #ifdef INET void sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) @@ -6015,7 +6112,8 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) #endif uint32_t mflowid; - uint8_t use_mflowid; + uint8_t mflowtype; + uint16_t fibnum; iphlen = off; if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) { @@ -6026,13 +6124,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) #ifdef SCTP_MBUF_LOGGING /* Log in any input mbufs */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_INPUT); - } - } + sctp_log_mbc(m, SCTP_MBUF_INPUT); } #endif #ifdef SCTP_PACKET_LOGGING @@ -6041,17 +6133,13 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) } #endif SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, - "sctp_input(): Packet of length %d received on %s with csum_flags 0x%x.\n", + "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), - m->m_pkthdr.csum_flags); - if (m->m_flags & M_FLOWID) { - mflowid = m->m_pkthdr.flowid; - use_mflowid = 1; - } else { - mflowid = 0; - use_mflowid = 0; - } + (int)m->m_pkthdr.csum_flags, CSUM_BITS); + mflowid = m->m_pkthdr.flowid; + mflowtype = M_HASHTYPE_GET(m); + fibnum = M_GETFIB(m); SCTP_STAT_INCR(sctps_recvpackets); SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ @@ -6076,7 +6164,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = ip->ip_dst; - length = ip->ip_len + iphlen; + length = ntohs(ip->ip_len); /* Validate mbuf chain length with IP payload length. 
*/ if (SCTP_HEADER_LEN(m) != length) { SCTPDBG(SCTP_DEBUG_INPUT1, @@ -6111,7 +6199,7 @@ sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) compute_crc, #endif ecn_bits, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); out: if (m) { @@ -6125,18 +6213,23 @@ extern int *sctp_cpuarry; #endif -void -sctp_input(struct mbuf *m, int off) +int +sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED) { -#if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) - struct ip *ip; - struct sctphdr *sh; - int offset; - int cpu_to_use; - uint32_t flowid, tag; + struct mbuf *m; + int off; + m = *mp; + off = *offp; +#if defined(__FreeBSD__) && defined(SCTP_MCORE_INPUT) && defined(SMP) if (mp_ncpus > 1) { - if (m->m_flags & M_FLOWID) { + struct ip *ip; + struct sctphdr *sh; + int offset; + int cpu_to_use; + uint32_t flowid, tag; + + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { flowid = m->m_pkthdr.flowid; } else { /* @@ -6147,7 +6240,7 @@ sctp_input(struct mbuf *m, int off) if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); - return; + return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); @@ -6155,14 +6248,15 @@ sctp_input(struct mbuf *m, int off) tag = htonl(sh->v_tag); flowid = tag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port); m->m_pkthdr.flowid = flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH); } cpu_to_use = sctp_cpuarry[flowid % mp_ncpus]; sctp_queue_to_mcore(m, off, cpu_to_use); - return; + return (IPPROTO_DONE); } #endif sctp_input_with_port(m, off, 0); + return (IPPROTO_DONE); } #endif diff --git a/freebsd/sys/netinet/sctp_input.h b/freebsd/sys/netinet/sctp_input.h index 95208032..148864b6 100644 --- a/freebsd/sys/netinet/sctp_input.h +++ b/freebsd/sys/netinet/sctp_input.h @@ -45,10 +45,10 @@ sctp_common_input_processing(struct mbuf **, int, int, int, uint8_t, #endif uint8_t, - uint8_t, uint32_t, + uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); -struct sctp_stream_reset_out_request * +struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk); diff --git a/freebsd/sys/netinet/sctp_lock_bsd.h b/freebsd/sys/netinet/sctp_lock_bsd.h index 35cdf5f8..96e35214 100644 --- a/freebsd/sys/netinet/sctp_lock_bsd.h +++ b/freebsd/sys/netinet/sctp_lock_bsd.h @@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$"); * Most other locks (INP and INFO) attempt to localize the locking i.e. we try * to contain the lock and unlock within the function that needs to lock it. * This sometimes mean we do extra locks and unlocks and lose a bit of - * efficency, but if the performance statements about non-recursive locks are + * efficiency, but if the performance statements about non-recursive locks are * true this should not be a problem. One issue that arises with this only * lock when needed is that if an implicit association setup is done we have * a problem. 
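In the multicore dispatch above, packets without a hardware-supplied hash get a software flow id mixed from the verification tag and the port pair, and the id then selects the processing core. A simplified sketch of that idea (the exact mixing in the kernel differs slightly; soft_flowid is a name invented here):

#include <stdint.h>
#include <stdio.h>

/* When the NIC supplied no RSS hash, derive a stable flow id from
 * the association's verification tag and port pair so all packets of
 * one association land on the same core. */
static uint32_t
soft_flowid(uint32_t vtag, uint16_t sport, uint16_t dport)
{
    return vtag ^ sport ^ dport;
}

int
main(void)
{
    int mp_ncpus = 4;
    uint32_t flowid = soft_flowid(0x12345678, 5001, 5002);

    printf("flowid=%#x -> cpu %u\n", flowid, (unsigned)(flowid % mp_ncpus));
    return 0;
}
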
If at the time I lookup an association I have NULL in the tcb diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h index d33d1fd3..e87914e5 100644 --- a/freebsd/sys/netinet/sctp_os_bsd.h +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -95,7 +95,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -105,7 +104,7 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include #ifndef in6pcb #define in6pcb inpcb @@ -152,33 +151,27 @@ MALLOC_DECLARE(SCTP_M_MCORE); #define V_system_base_info VNET(system_base_info) #define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m #define SCTP_BASE_STATS V_system_base_info.sctpstat -#define SCTP_BASE_STATS_SYSCTL VNET_NAME(system_base_info.sctpstat) -#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m -#define SCTP_BASE_SYSCTL(__m) VNET_NAME(system_base_info.sctpsysctl.__m) +#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m +#define SCTP_BASE_SYSCTL(__m) V_system_base_info.sctpsysctl.__m #define SCTP_BASE_VAR(__m) V_system_base_info.__m -/* - * - */ -#define USER_ADDR_NULL (NULL) /* FIX ME: temp */ - #define SCTP_PRINTF(params...) printf(params) #if defined(SCTP_DEBUG) #define SCTPDBG(level, params...) \ { \ - do { \ - if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ - SCTP_PRINTF(params); \ - } \ - } while (0); \ + do { \ + if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ + SCTP_PRINTF(params); \ + } \ + } while (0); \ } #define SCTPDBG_ADDR(level, addr) \ { \ - do { \ - if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ - sctp_print_address(addr); \ - } \ - } while (0); \ + do { \ + if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ + sctp_print_address(addr); \ + } \ + } while (0); \ } #else #define SCTPDBG(level, params...) 
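The reindented SCTPDBG/SCTPDBG_ADDR definitions above keep the classic do { } while (0) wrapper. A small self-contained example of why the idiom matters (DBG_PRINT is a hypothetical macro in the same style):

#include <stdio.h>

/* The body expands to exactly one statement, so an unbraced if/else
 * around the macro still parses the way it reads. */
#define DBG_PRINT(on, msg)          \
    do {                            \
        if (on)                     \
            printf("%s\n", msg);    \
    } while (0)

int
main(void)
{
    int debug = 1;

    if (debug)
        DBG_PRINT(debug, "debugging enabled");
    else
        printf("quiet\n");
    return 0;
}
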
@@ -194,11 +187,11 @@ MALLOC_DECLARE(SCTP_M_MCORE); #ifdef SCTP_LTRACE_ERRORS #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) \ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ - SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ + SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ m, inp, stcb, net, file, __LINE__, err); #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) \ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ - SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ + SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ inp, stcb, net, file, __LINE__, err); #else #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) @@ -232,16 +225,16 @@ MALLOC_DECLARE(SCTP_M_MCORE); * general memory allocation */ #define SCTP_MALLOC(var, type, size, name) \ - do { \ - var = (type)malloc(size, name, M_NOWAIT); \ - } while (0) + do { \ + var = (type)malloc(size, name, M_NOWAIT); \ + } while (0) #define SCTP_FREE(var, type) free(var, type) #define SCTP_MALLOC_SONAME(var, type, size) \ - do { \ - var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \ - } while (0) + do { \ + var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \ + } while (0) #define SCTP_FREE_SONAME(var) free(var, M_SONAME) @@ -305,16 +298,12 @@ typedef struct callout sctp_os_timer_t; #define SCTP_BUF_RESV_UF(m, size) m->m_data += size #define SCTP_BUF_AT(m, size) m->m_data + size #define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT) -#define SCTP_BUF_EXTEND_SIZE(m) (m->m_ext.ext_size) +#define SCTP_BUF_SIZE M_SIZE #define SCTP_BUF_TYPE(m) (m->m_type) #define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif) #define SCTP_BUF_PREPEND M_PREPEND -#define SCTP_ALIGN_TO_END(m, len) if(m->m_flags & M_PKTHDR) { \ - MH_ALIGN(m, len); \ - } else if ((m->m_flags & M_EXT) == 0) { \ - M_ALIGN(m, len); \ - } +#define SCTP_ALIGN_TO_END(m, len) M_ALIGN(m, len) /* We make it so if you have up to 4 threads * writing based on the default size of @@ -328,11 +317,11 @@ typedef struct callout sctp_os_timer_t; /* MTU */ /*************************/ #define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu -#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0) +#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((uint32_t)((rt != NULL) ? rt->rt_mtu : 0)) #define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0) #define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \ if (rt != NULL) \ - rt->rt_rmx.rmx_mtu = mtu; \ + rt->rt_mtu = mtu; \ } while(0) /* (de-)register interface event notifications */ @@ -346,7 +335,7 @@ typedef struct callout sctp_os_timer_t; /* return the base ext data pointer */ #define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf) /* return the refcnt of the data pointer */ -#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ref_cnt) +#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ext_cnt) /* return any buffer related flags, this is * used beyond logging for apple only. */ @@ -399,6 +388,11 @@ typedef struct callout sctp_os_timer_t; #define SCTP_CLEAR_SO_NBIO(so) ((so)->so_state &= ~SS_NBIO) /* get the socket type */ #define SCTP_SO_TYPE(so) ((so)->so_type) +/* Use a macro for renaming sb_cc to sb_acc. + * Initially sb_ccc was used, but this broke select() when used + * with SCTP sockets. 
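Several hunks above track the route-metrics rework: the MTU moved from rt->rt_rmx.rmx_mtu to plain rt->rt_mtu, read through a NULL guard and an explicit cast. A sketch under simplified types (rtentry trimmed to the one field this uses):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct rtentry { unsigned long rt_mtu; }; /* trimmed stand-in */

/* 0 means "unknown, fall back to the interface MTU". */
static uint32_t
route_mtu(const struct rtentry *rt)
{
    return (uint32_t)((rt != NULL) ? rt->rt_mtu : 0);
}

int
main(void)
{
    struct rtentry rt = { .rt_mtu = 9000 };

    printf("mtu=%u, no-route mtu=%u\n", route_mtu(&rt), route_mtu(NULL));
    return 0;
}
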
+ */ +#define sb_cc sb_acc /* reserve sb space for a socket */ #define SCTP_SORESERVE(so, send, recv) soreserve(so, send, recv) /* wakeup a socket */ @@ -418,19 +412,19 @@ typedef struct callout sctp_os_timer_t; typedef struct route sctp_route_t; typedef struct rtentry sctp_rtentry_t; -/* - * XXX multi-FIB support was backed out in r179783 and it seems clear that the - * VRF support as currently in FreeBSD is not ready to support multi-FIB. - * It might be best to implement multi-FIB support for both v4 and v6 indepedent - * of VRFs and leave those to a real MPLS stack. - */ -#define SCTP_RTALLOC(ro, vrf_id) rtalloc_ign((struct route *)ro, 0UL) +#define SCTP_RTALLOC(ro, vrf_id, fibnum) \ + rtalloc_ign_fib((struct route *)ro, 0UL, fibnum) /* Future zero copy wakeup/send function */ #define SCTP_ZERO_COPY_EVENT(inp, so) /* This is re-pulse ourselves for sendbuf */ #define SCTP_ZERO_COPY_SENDQ_EVENT(inp, so) +/* + * SCTP protocol specific mbuf flags. + */ +#define M_NOTIFICATION M_PROTO1 /* SCTP notification */ + /* * IP output routines */ @@ -442,12 +436,14 @@ typedef struct rtentry sctp_rtentry_t; local_stcb->sctp_ep && \ local_stcb->sctp_ep->sctp_socket) \ o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \ + m_clrprotoflags(o_pak); \ result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \ } #define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \ { \ struct sctp_tcb *local_stcb = stcb; \ + m_clrprotoflags(o_pak); \ if (local_stcb && local_stcb->sctp_ep) \ result = ip6_output(o_pak, \ ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c index cbc25b9c..9e12e775 100644 --- a/freebsd/sys/netinet/sctp_output.c +++ b/freebsd/sys/netinet/sctp_output.c @@ -52,7 +52,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if defined(INET) || defined(INET6) #include +#endif #include #include @@ -67,7 +69,7 @@ struct sack_track { struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY]; }; -struct sack_track sack_array[256] = { +const struct sack_track sack_array[256] = { {0, 0, 0, 0, /* 0x00 */ {{0, 0}, {0, 0}, @@ -1881,7 +1883,7 @@ sctp_is_address_in_scope(struct sctp_ifa *ifa, if (scope->ipv4_addr_legal) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&ifa->address.sin; + sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* not in scope , unspecified */ return (0); @@ -1912,7 +1914,7 @@ sctp_is_address_in_scope(struct sctp_ifa *ifa, return (0); } /* ok to use deprecated addresses? 
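The new SCTP_IP_OUTPUT/SCTP_IP6_OUTPUT macros call m_clrprotoflags() before handing the packet to IP, because M_PROTO1 is layer-private: SCTP reuses it as M_NOTIFICATION, and a lower layer may assign it a different meaning. A toy model of that rule (the flag value and struct layout below are illustrative, not the real mbuf definitions):

#include <stdint.h>
#include <stdio.h>

#define M_PROTO1        0x00000010u   /* illustrative layer-private bit */
#define M_NOTIFICATION  M_PROTO1      /* SCTP's reuse of M_PROTO1 */

struct mbuf { uint32_t m_flags; };

/* Clear protocol-private flags before passing the mbuf down. */
static void
m_clrprotoflags(struct mbuf *m)
{
    m->m_flags &= ~M_PROTO1;
}

int
main(void)
{
    struct mbuf m = { .m_flags = M_NOTIFICATION };

    m_clrprotoflags(&m);
    printf("flags after clear: %#x\n", m.m_flags);
    return 0;
}
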
*/ - sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* skip unspecifed addresses */ return (0); @@ -1971,7 +1973,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len) while (SCTP_BUF_NEXT(mret) != NULL) { mret = SCTP_BUF_NEXT(mret); } - SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_DONTWAIT, 1, MT_DATA); + SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA); if (SCTP_BUF_NEXT(mret) == NULL) { /* We are hosed, can't add more addresses */ return (m); @@ -1987,7 +1989,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len) struct sctp_ipv4addr_param *ipv4p; struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&ifa->address.sin; + sin = &ifa->address.sin; ipv4p = (struct sctp_ipv4addr_param *)parmh; parmh->param_type = htons(SCTP_IPV4_ADDRESS); parmh->param_length = htons(plen); @@ -2002,7 +2004,7 @@ sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t * len) struct sctp_ipv6addr_param *ipv6p; struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + sin6 = &ifa->address.sin6; ipv6p = (struct sctp_ipv6addr_param *)parmh; parmh->param_type = htons(SCTP_IPV6_ADDRESS); parmh->param_length = htons(plen); @@ -2417,7 +2419,7 @@ sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", - __FUNCTION__); + __func__); continue; } if (laddr->ifa == ifa) { @@ -2439,7 +2441,7 @@ sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", - __FUNCTION__); + __func__); continue; } if ((laddr->ifa == ifa) && laddr->action == 0) @@ -3071,7 +3073,7 @@ bound_all_plan_b: ifn, num_preferred); if (num_preferred == 0) { /* None on this interface. */ - SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefered -- skipping to next\n"); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n"); continue; } SCTPDBG(SCTP_DEBUG_OUTPUT2, @@ -3156,12 +3158,10 @@ again_with_private_addresses_allowed: * It is restricted for some reason.. * probably not yet added. */ - SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its resticted\n"); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its restricted\n"); sifa = NULL; continue; } - } else { - SCTP_PRINTF("Stcb is null - no print\n"); } atomic_add_int(&sifa->refcount, 1); goto out; @@ -3224,12 +3224,14 @@ plan_d: } } #ifdef INET - if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) { - stcb->asoc.scope.ipv4_local_scope = 1; - retried = 1; - goto again_with_private_addresses_allowed; - } else if (retried == 1) { - stcb->asoc.scope.ipv4_local_scope = 0; + if (stcb) { + if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) { + stcb->asoc.scope.ipv4_local_scope = 1; + retried = 1; + goto again_with_private_addresses_allowed; + } else if (retried == 1) { + stcb->asoc.scope.ipv4_local_scope = 0; + } } #endif out: @@ -3326,10 +3328,11 @@ sctp_source_address_selection(struct sctp_inpcb *inp, #endif /** - * Rules: - Find the route if needed, cache if I can. - Look at - * interface address in route, Is it in the bound list. If so we - * have the best source. - If not we must rotate amongst the - * addresses. + * Rules: + * - Find the route if needed, cache if I can. 
+ * - Look at interface address in route, Is it in the bound list. If so we + * have the best source. + * - If not we must rotate amongst the addresses. * * Cavets and issues * @@ -3391,7 +3394,7 @@ sctp_source_address_selection(struct sctp_inpcb *inp, /* * Need a route to cache. */ - SCTP_RTALLOC(ro, vrf_id); + SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } if (ro->ro_rt == NULL) { return (NULL); @@ -3508,7 +3511,7 @@ sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize) return (found); } /* It is exactly what we want. Copy it out. */ - m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), cpsize, (caddr_t)data); + m_copydata(control, at + CMSG_ALIGN(sizeof(cmh)), (int)cpsize, (caddr_t)data); return (1); } else { struct sctp_sndrcvinfo *sndrcvinfo; @@ -3618,6 +3621,11 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er struct sctp_stream_out *tmp_str; unsigned int i; +#if defined(SCTP_DETAILED_STR_STATS) + int j; + +#endif + /* Default is NOT correct */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n", stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams); @@ -3637,10 +3645,21 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; - stcb->asoc.strmout[i].next_sequence_send = 0; + stcb->asoc.strmout[i].next_mid_ordered = 0; + stcb->asoc.strmout[i].next_mid_unordered = 0; +#if defined(SCTP_DETAILED_STR_STATS) + for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { + stcb->asoc.strmout[i].abandoned_sent[j] = 0; + stcb->asoc.strmout[i].abandoned_unsent[j] = 0; + } +#else + stcb->asoc.strmout[i].abandoned_sent[0] = 0; + stcb->asoc.strmout[i].abandoned_unsent[0] = 0; +#endif stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; - stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL); + stcb->asoc.strmout[i].state = SCTP_STREAM_OPENING; + stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); } } break; @@ -3661,7 +3680,7 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er *error = EINVAL; return (1); } - if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); @@ -3693,14 +3712,14 @@ sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *er *error = EINVAL; return (1); } - if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } } else #endif - if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, + if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); @@ -3821,28 +3840,22 @@ sctp_add_cookie(struct mbuf *init, int init_offset, mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) + sizeof(struct sctp_paramhdr)), 0, - M_DONTWAIT, 1, MT_DATA); + M_NOWAIT, 1, MT_DATA); if (mret == NULL) { return (NULL); } - copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_DONTWAIT); + copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_NOWAIT); if (copy_init == NULL) { sctp_m_freem(mret); return (NULL); } #ifdef 
SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = copy_init; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(copy_init, SCTP_MBUF_ICOPY); } #endif copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL, - M_DONTWAIT); + M_NOWAIT); if (copy_initack == NULL) { sctp_m_freem(mret); sctp_m_freem(copy_init); @@ -3850,13 +3863,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset, } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = copy_initack; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(copy_initack, SCTP_MBUF_ICOPY); } #endif /* easy side we just drop it on the end */ @@ -3892,7 +3899,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset, break; } } - sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_DONTWAIT, 1, MT_DATA); + sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_NOWAIT, 1, MT_DATA); if (sig == NULL) { /* no space, so free the entire chain */ sctp_m_freem(mret); @@ -3914,7 +3921,7 @@ sctp_add_cookie(struct mbuf *init, int init_offset, static uint8_t sctp_get_ect(struct sctp_tcb *stcb) { - if ((stcb != NULL) && (stcb->asoc.ecn_allowed == 1)) { + if ((stcb != NULL) && (stcb->asoc.ecn_supported == 1)) { return (SCTP_ECT0_BIT); } else { return (0); @@ -3985,7 +3992,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, uint32_t v_tag, uint16_t port, union sctp_sockstore *over_addr, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, #if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) int so_locked SCTP_UNUSED #else @@ -4061,11 +4068,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, sctp_route_t iproute; int len; - len = sizeof(struct ip) + sizeof(struct sctphdr); + len = SCTP_MIN_V4_OVERHEAD; if (port) { len += sizeof(struct udphdr); } - newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA); + newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); @@ -4076,18 +4083,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { -#ifdef INVARIANTS - if (net->flowidset == 0) { - panic("Flow ID not set"); - } -#endif m->m_pkthdr.flowid = net->flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, net->flowtype); } else { - if (use_mflowid != 0) { - m->m_pkthdr.flowid = mflowid; - m->m_flags |= M_FLOWID; - } + m->m_pkthdr.flowid = mflowid; + M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); ip = mtod(m, struct ip *); @@ -4106,15 +4106,15 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, tos_value |= sctp_get_ect(stcb); } if ((nofragment_flag) && (port == 0)) { - ip->ip_off = IP_DF; + ip->ip_off = htons(IP_DF); } else { - ip->ip_off = 0; + ip->ip_off = htons(0); } /* FreeBSD has a function for ip_id's */ - ip->ip_id = ip_newid(); + ip_fillid(ip); ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl; - ip->ip_len = packet_length; + ip->ip_len = htons(packet_length); ip->ip_tos = tos_value; if (port) { ip->ip_p = IPPROTO_UDP; @@ -4177,7 +4177,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, sctp_free_ifa(_lsrc); } else { ip->ip_src = over_addr->sin.sin_addr; - SCTP_RTALLOC(ro, vrf_id); + SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } } if (port) { @@ 
-4190,7 +4190,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; - udp->uh_ulen = htons(packet_length - sizeof(struct ip)); + udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip))); if (V_udp_cksum) { udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); } else { @@ -4350,11 +4350,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, flowlabel = ntohl(((struct in6pcb *)inp)->in6p_flowinfo); } flowlabel &= 0x000fffff; - len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr); + len = SCTP_MIN_OVERHEAD; if (port) { len += sizeof(struct udphdr); } - newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA); + newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); @@ -4365,18 +4365,11 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { -#ifdef INVARIANTS - if (net->flowidset == 0) { - panic("Flow ID not set"); - } -#endif m->m_pkthdr.flowid = net->flowid; - m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, net->flowtype); } else { - if (use_mflowid != 0) { - m->m_pkthdr.flowid = mflowid; - m->m_flags |= M_FLOWID; - } + m->m_pkthdr.flowid = mflowid; + M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); @@ -4425,7 +4418,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, } else { ip6h->ip6_nxt = IPPROTO_SCTP; } - ip6h->ip6_plen = (packet_length - sizeof(struct ip6_hdr)); + ip6h->ip6_plen = (uint16_t) (packet_length - sizeof(struct ip6_hdr)); ip6h->ip6_dst = sin6->sin6_addr; /* @@ -4498,7 +4491,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, sctp_free_ifa(_lsrc); } else { lsa6->sin6_addr = over_addr->sin6.sin6_addr; - SCTP_RTALLOC(ro, vrf_id); + SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } (void)sa6_recoverscope(sin6); } @@ -4544,7 +4537,7 @@ sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; - udp->uh_ulen = htons(packet_length - sizeof(struct ip6_hdr)); + udp->uh_ulen = htons((uint16_t) (packet_length - sizeof(struct ip6_hdr))); udp->uh_sum = 0; sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); } else { @@ -4700,7 +4693,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked #endif ) { - struct mbuf *m; + struct mbuf *m, *m_last; struct sctp_nets *net; struct sctp_init_chunk *init; struct sctp_supported_addr_param *sup_addr; @@ -4745,7 +4738,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked /* start the INIT timer */ sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net); - m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_DONTWAIT, 1, MT_DATA); + m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n"); @@ -4753,12 +4746,6 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked } chunk_len = (uint16_t) sizeof(struct sctp_init_chunk); padding_len = 0; - /* - * assume peer supports asconf in order to be able to queue local - * address changes while an INIT is in flight and before the assoc - * is established. 
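Both the IPv4 and IPv6 output paths now stamp the outgoing mbuf with the peer's cached flowid/flowtype, or with the values carried in from the received packet when no destination structure exists yet, so replies hash to the same queue as the inbound direction. A sketch with invented stand-in types:

#include <stdint.h>
#include <stdio.h>

enum hashtype { HT_NONE, HT_OPAQUE };

struct pkthdr    { uint32_t flowid; enum hashtype hashtype; };
struct net_cache { uint32_t flowid; enum hashtype flowtype; };

/* Outbound packets inherit either the destination's cached flow
 * identity or the one observed on the received packet. */
static void
stamp_flow(struct pkthdr *ph, const struct net_cache *net,
    uint32_t mflowid, enum hashtype mflowtype)
{
    if (net != NULL) {
        ph->flowid = net->flowid;
        ph->hashtype = net->flowtype;
    } else {
        ph->flowid = mflowid;
        ph->hashtype = mflowtype;
    }
}

int
main(void)
{
    struct pkthdr ph;

    stamp_flow(&ph, NULL, 0xabcd, HT_OPAQUE);
    printf("flowid=%#x\n", ph.flowid);
    return 0;
}
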
- */ - stcb->asoc.peer_supports_asconf = 1; /* Now lets put the chunk header in place */ init = mtod(m, struct sctp_init_chunk *); /* now the chunk header */ @@ -4775,120 +4762,76 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams); init->init.initial_tsn = htonl(stcb->asoc.init_seq_number); - if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) { - uint8_t i; - - parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); - if (stcb->asoc.scope.ipv4_addr_legal) { - parameter_len += (uint16_t) sizeof(uint16_t); - } - if (stcb->asoc.scope.ipv6_addr_legal) { - parameter_len += (uint16_t) sizeof(uint16_t); - } - sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len); - sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE); - sup_addr->ph.param_length = htons(parameter_len); - i = 0; - if (stcb->asoc.scope.ipv4_addr_legal) { - sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS); - } - if (stcb->asoc.scope.ipv6_addr_legal) { - sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS); - } - padding_len = 4 - 2 * i; - chunk_len += parameter_len; - } /* Adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { - if (padding_len > 0) { - memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); - chunk_len += padding_len; - padding_len = 0; - } parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication); ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); ali->ph.param_length = htons(parameter_len); - ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator); + ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } - if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { - /* Add NAT friendly parameter. 
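The rebuilt INIT writer appends each optional parameter as a TLV at mtod(m, caddr_t)+chunk_len, converting type and length with htons(); note the hunk also fixes the adaptation-layer indication to convert with htonl() (host to network) rather than ntohl(). A minimal sketch of one header-only parameter append (add_param is a name invented here; 0x8000 is the ECN-capable parameter type):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct paramhdr { uint16_t type, len; };

/* Append a header-only TLV parameter and advance the running chunk
 * length, as the INIT builder does for ECN/PR-SCTP/NAT parameters. */
static uint16_t
add_param(uint8_t *chunk, uint16_t chunk_len, uint16_t type)
{
    struct paramhdr ph;
    uint16_t parameter_len = (uint16_t)sizeof(ph);

    ph.type = htons(type);
    ph.len = htons(parameter_len); /* length covers the header itself */
    memcpy(chunk + chunk_len, &ph, sizeof(ph));
    return chunk_len + parameter_len;
}

int
main(void)
{
    uint8_t chunk[64];
    uint16_t len = 4;                    /* pretend chunk header */

    len = add_param(chunk, len, 0x8000); /* ECN capable */
    printf("chunk_len now %u\n", (unsigned)len);
    return 0;
}
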
*/ - if (padding_len > 0) { - memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); - chunk_len += padding_len; - padding_len = 0; - } + /* ECN parameter */ + if (stcb->asoc.ecn_supported == 1) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); - ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); + ph->param_type = htons(SCTP_ECN_CAPABLE); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } - /* now any cookie time extensions */ - if (stcb->asoc.cookie_preserve_req) { - struct sctp_cookie_perserve_param *cookie_preserve; - - if (padding_len > 0) { - memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); - chunk_len += padding_len; - padding_len = 0; - } - parameter_len = (uint16_t) sizeof(struct sctp_cookie_perserve_param); - cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len); - cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE); - cookie_preserve->ph.param_length = htons(parameter_len); - cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req); - stcb->asoc.cookie_preserve_req = 0; + /* PR-SCTP supported parameter */ + if (stcb->asoc.prsctp_supported == 1) { + parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); + ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); + ph->param_length = htons(parameter_len); chunk_len += parameter_len; } - /* ECN parameter */ - if (stcb->asoc.ecn_allowed == 1) { - if (padding_len > 0) { - memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); - chunk_len += padding_len; - padding_len = 0; - } + /* Add NAT friendly parameter. */ + if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); - ph->param_type = htons(SCTP_ECN_CAPABLE); + ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } - /* And now tell the peer we do support PR-SCTP. 
*/ - if (padding_len > 0) { - memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); - chunk_len += padding_len; - padding_len = 0; - } - parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); - ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); - ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); - ph->param_length = htons(parameter_len); - chunk_len += parameter_len; - - /* And now tell the peer we do all the extensions */ - pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); - pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + /* And now tell the peer which extensions we support */ num_ext = 0; - pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; - pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; - pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; - pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; - pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; - if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { + pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); + if (stcb->asoc.prsctp_supported == 1) { + pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; + if (stcb->asoc.idata_supported) { + pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; + } + } + if (stcb->asoc.auth_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; } - if (stcb->asoc.sctp_nr_sack_on_off == 1) { + if (stcb->asoc.asconf_supported == 1) { + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; + } + if (stcb->asoc.reconfig_supported == 1) { + pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; + } + if (stcb->asoc.idata_supported) { + pr_supported->chunk_types[num_ext++] = SCTP_IDATA; + } + if (stcb->asoc.nrsack_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; } - parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext; - pr_supported->ph.param_length = htons(parameter_len); - padding_len = SCTP_SIZE32(parameter_len) - parameter_len; - chunk_len += parameter_len; - + if (stcb->asoc.pktdrop_supported == 1) { + pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; + } + if (num_ext > 0) { + parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext; + pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + pr_supported->ph.param_length = htons(parameter_len); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; + } /* add authentication parameters */ - if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { + if (stcb->asoc.auth_supported) { /* attach RANDOM parameter, if available */ if (stcb->asoc.authinfo.random != NULL) { struct sctp_auth_random *randp; @@ -4906,8 +4849,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked chunk_len += parameter_len; } /* add HMAC_ALGO parameter */ - if ((stcb->asoc.local_hmacs != NULL) && - (stcb->asoc.local_hmacs->num_algo > 0)) { + if (stcb->asoc.local_hmacs != NULL) { struct sctp_auth_hmac_algo *hmacs; if (padding_len > 0) { @@ -4925,7 +4867,7 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked chunk_len += parameter_len; } /* add CHUNKS parameter */ - if (sctp_auth_get_chklist_size(stcb->asoc.local_auth_chunks) > 0) { + if (stcb->asoc.local_auth_chunks != NULL) { struct sctp_auth_chunk_list *chunks; if (padding_len > 0) { @@ -4943,8 +4885,52 @@ sctp_send_initiate(struct 
sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked chunk_len += parameter_len; } } - SCTP_BUF_LEN(m) = chunk_len; + /* now any cookie time extensions */ + if (stcb->asoc.cookie_preserve_req) { + struct sctp_cookie_perserve_param *cookie_preserve; + + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + padding_len = 0; + } + parameter_len = (uint16_t) sizeof(struct sctp_cookie_perserve_param); + cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len); + cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE); + cookie_preserve->ph.param_length = htons(parameter_len); + cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req); + stcb->asoc.cookie_preserve_req = 0; + chunk_len += parameter_len; + } + if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) { + uint8_t i; + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + padding_len = 0; + } + parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); + if (stcb->asoc.scope.ipv4_addr_legal) { + parameter_len += (uint16_t) sizeof(uint16_t); + } + if (stcb->asoc.scope.ipv6_addr_legal) { + parameter_len += (uint16_t) sizeof(uint16_t); + } + sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len); + sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE); + sup_addr->ph.param_length = htons(parameter_len); + i = 0; + if (stcb->asoc.scope.ipv4_addr_legal) { + sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS); + } + if (stcb->asoc.scope.ipv6_addr_legal) { + sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS); + } + padding_len = 4 - 2 * i; + chunk_len += parameter_len; + } + SCTP_BUF_LEN(m) = chunk_len; /* now the addresses */ /* * To optimize this we could put the scoping stuff into a structure @@ -4952,18 +4938,13 @@ sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked * we could just sifa in the address within the stcb. But for now * this is a quick hack to get the address stuff teased apart. 
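Parameter lengths exclude their 4-byte alignment padding; the writer keeps a running padding_len computed as SCTP_SIZE32(parameter_len) - parameter_len and zero-fills it before starting the next parameter. The arithmetic in isolation:

#include <stdint.h>
#include <stdio.h>

/* TLV parameters are padded to a 4-byte boundary; the pad bytes are
 * not included in the advertised parameter length. */
#define SIZE32(x) (((x) + 3u) & ~3u)

int
main(void)
{
    uint16_t parameter_len = 6; /* e.g. header plus two address types */
    uint16_t padding_len = (uint16_t)(SIZE32(parameter_len) - parameter_len);

    printf("param %u bytes, pad %u bytes\n",
        (unsigned)parameter_len, (unsigned)padding_len);
    return 0;
}
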
*/ - sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope, m, cnt_inits_to, &padding_len, &chunk_len); + m_last = sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope, + m, cnt_inits_to, + &padding_len, &chunk_len); init->ch.chunk_length = htons(chunk_len); if (padding_len > 0) { - struct mbuf *m_at, *mp_last; - - mp_last = NULL; - for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) { - if (SCTP_BUF_NEXT(m_at) == NULL) - mp_last = m_at; - } - if ((mp_last == NULL) || sctp_add_pad_tombuf(mp_last, padding_len)) { + if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } @@ -5100,7 +5081,6 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, *nat_friendly = 1; /* fall through */ case SCTP_PRSCTP_SUPPORTED: - if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error prsctp/nat support %d\n", plen); goto invalid_size; @@ -5108,7 +5088,7 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, at += padded_size; break; case SCTP_ECN_CAPABLE: - if (padded_size != sizeof(struct sctp_ecn_supported_param)) { + if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen); goto invalid_size; } @@ -5138,13 +5118,14 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, if (op_err == NULL) { /* Ok need to try to get a mbuf */ #ifdef INET6 - l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_OVERHEAD; #else - l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_V4_OVERHEAD; #endif + l_len += sizeof(struct sctp_chunkhdr); l_len += plen; l_len += sizeof(struct sctp_paramhdr); - op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; /* @@ -5207,13 +5188,14 @@ sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, /* Ok need to try to get an mbuf */ #ifdef INET6 - l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_OVERHEAD; #else - l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_V4_OVERHEAD; #endif + l_len += sizeof(struct sctp_chunkhdr); l_len += plen; l_len += sizeof(struct sctp_paramhdr); - op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 @@ -5282,12 +5264,13 @@ invalid_size: int l_len; #ifdef INET6 - l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_OVERHEAD; #else - l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); + l_len = SCTP_MIN_V4_OVERHEAD; #endif + l_len += sizeof(struct sctp_chunkhdr); l_len += (2 * sizeof(struct sctp_paramhdr)); - op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 @@ -5336,6 +5319,7 @@ sctp_are_there_new_addresses(struct sctp_association *asoc, uint16_t ptype, plen; uint8_t fnd; struct sctp_nets *net; + int check_src; #ifdef INET struct sockaddr_in sin4, *sa4; @@ -5357,39 +5341,61 @@ sctp_are_there_new_addresses(struct sctp_association *asoc, sin6.sin6_len = sizeof(sin6); #endif /* First what about the src address of the pkt ? 
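The rewritten source-address check first decides, per address family, whether the comparison is meaningful at all for this association's scope, and only then walks the peer address list. A sketch of the gating switch (should_check_src is a name invented here):

#include <stdio.h>
#include <sys/socket.h>

struct scope { int ipv4_addr_legal, ipv6_addr_legal; };

/* Only compare the packet's source against known peer addresses if
 * that address family is usable on this association at all. */
static int
should_check_src(const struct scope *sc, int family)
{
    switch (family) {
    case AF_INET:  return sc->ipv4_addr_legal;
    case AF_INET6: return sc->ipv6_addr_legal;
    default:       return 0; /* TSNH: unexpected family */
    }
}

int
main(void)
{
    struct scope sc = { 1, 0 };

    printf("check v6 src? %d\n", should_check_src(&sc, AF_INET6));
    return 0;
}
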
*/ - fnd = 0; - TAILQ_FOREACH(net, &asoc->nets, sctp_next) { - sa = (struct sockaddr *)&net->ro._l_addr; - if (sa->sa_family == src->sa_family) { + check_src = 0; + switch (src->sa_family) { +#ifdef INET + case AF_INET: + if (asoc->scope.ipv4_addr_legal) { + check_src = 1; + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if (asoc->scope.ipv6_addr_legal) { + check_src = 1; + } + break; +#endif + default: + /* TSNH */ + break; + } + if (check_src) { + fnd = 0; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + sa = (struct sockaddr *)&net->ro._l_addr; + if (sa->sa_family == src->sa_family) { #ifdef INET - if (sa->sa_family == AF_INET) { - struct sockaddr_in *src4; + if (sa->sa_family == AF_INET) { + struct sockaddr_in *src4; - sa4 = (struct sockaddr_in *)sa; - src4 = (struct sockaddr_in *)src; - if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) { - fnd = 1; - break; + sa4 = (struct sockaddr_in *)sa; + src4 = (struct sockaddr_in *)src; + if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) { + fnd = 1; + break; + } } - } #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { - struct sockaddr_in6 *src6; + if (sa->sa_family == AF_INET6) { + struct sockaddr_in6 *src6; - sa6 = (struct sockaddr_in6 *)sa; - src6 = (struct sockaddr_in6 *)src; - if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) { - fnd = 1; - break; + sa6 = (struct sockaddr_in6 *)sa; + src6 = (struct sockaddr_in6 *)src; + if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) { + fnd = 1; + break; + } } - } #endif + } + } + if (fnd == 0) { + /* New address added! no need to look further. */ + return (1); } - } - if (fnd == 0) { - /* New address added! no need to look futher. */ - return (1); } /* Ok so far lets munge through the rest of the packet */ offset += sizeof(struct sctp_init_chunk); @@ -5410,9 +5416,11 @@ sctp_are_there_new_addresses(struct sctp_association *asoc, phdr == NULL) { return (1); } - p4 = (struct sctp_ipv4addr_param *)phdr; - sin4.sin_addr.s_addr = p4->addr; - sa_touse = (struct sockaddr *)&sin4; + if (asoc->scope.ipv4_addr_legal) { + p4 = (struct sctp_ipv4addr_param *)phdr; + sin4.sin_addr.s_addr = p4->addr; + sa_touse = (struct sockaddr *)&sin4; + } break; } #endif @@ -5427,10 +5435,12 @@ sctp_are_there_new_addresses(struct sctp_association *asoc, phdr == NULL) { return (1); } - p6 = (struct sctp_ipv6addr_param *)phdr; - memcpy((caddr_t)&sin6.sin6_addr, p6->addr, - sizeof(p6->addr)); - sa_touse = (struct sockaddr *)&sin6; + if (asoc->scope.ipv6_addr_legal) { + p6 = (struct sctp_ipv6addr_param *)phdr; + memcpy((caddr_t)&sin6.sin6_addr, p6->addr, + sizeof(p6->addr)); + sa_touse = (struct sockaddr *)&sin6; + } break; } #endif @@ -5486,20 +5496,21 @@ sctp_are_there_new_addresses(struct sctp_association *asoc, */ void sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, - struct mbuf *init_pkt, int iphlen, int offset, + struct sctp_nets *src_net, struct mbuf *init_pkt, + int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *init_chk, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port, int hold_inp_lock) { struct sctp_association *asoc; - struct mbuf *m, *m_at, *m_tmp, *m_cookie, *op_err, *mp_last; + struct mbuf *m, *m_tmp, *m_last, *m_cookie, *op_err; struct sctp_init_ack_chunk *initack; struct sctp_adaptation_layer_indication *ali; - struct sctp_ecn_supported_param *ecn; - struct sctp_prsctp_supported_param *prsctp; struct sctp_supported_chunk_types_param *pr_supported; + struct sctp_paramhdr *ph; union 
sctp_sockstore *over_addr; + struct sctp_scoping scp; #ifdef INET struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; @@ -5519,33 +5530,50 @@ sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t *signature = NULL; int cnt_inits_to = 0; uint16_t his_limit, i_want; - int abort_flag, padval; - int num_ext; - int p_len; + int abort_flag; int nat_friendly = 0; struct socket *so; + uint16_t num_ext, chunk_len, padding_len, parameter_len; if (stcb) { asoc = &stcb->asoc; } else { asoc = NULL; } - mp_last = NULL; if ((asoc != NULL) && - (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && - (sctp_are_there_new_addresses(asoc, init_pkt, offset, src))) { - /* new addresses, out of here in non-cookie-wait states */ - /* - * Send a ABORT, we don't add the new address error clause - * though we even set the T bit and copy in the 0 tag.. this - * looks no different than if no listener was present. - */ - op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), - "Address added"); - sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, - use_mflowid, mflowid, - vrf_id, port); - return; + (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT)) { + if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) { + /* + * new addresses, out of here in non-cookie-wait + * states + * + * Send an ABORT, without the new address error cause. + * This looks no different than if no listener was + * present. + */ + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Address added"); + sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, + mflowtype, mflowid, inp->fibnum, + vrf_id, port); + return; + } + if (src_net != NULL && (src_net->port != port)) { + /* + * change of remote encapsulation port, out of here + * in non-cookie-wait states + * + * Send an ABORT, without an specific error cause. This + * looks no different than if no listener was + * present. + */ + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Remote encapsulation port changed"); + sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, + mflowtype, mflowid, inp->fibnum, + vrf_id, port); + return; + } } abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(init_pkt, @@ -5556,24 +5584,25 @@ do_a_abort: if (op_err == NULL) { char msg[SCTP_DIAG_INFO_LEN]; - snprintf(msg, sizeof(msg), "%s:%d at %s\n", __FILE__, __LINE__, __FUNCTION__); + snprintf(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); } sctp_send_abort(init_pkt, iphlen, src, dst, sh, init_chk->init.initiate_tag, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } - m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ if (op_err) sctp_m_freem(op_err); return; } - SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk); + chunk_len = (uint16_t) sizeof(struct sctp_init_ack_chunk); + padding_len = 0; /* * We might not overwrite the identification[] completely and on @@ -5605,7 +5634,7 @@ do_a_abort: stc.peerport = sh->src_port; /* - * If we wanted to honor cookie life extentions, we would add to + * If we wanted to honor cookie life extensions, we would add to * stc.cookie_life. 
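sctp_send_initiate_ack() now aborts in non-COOKIE-WAIT states not only when the INIT lists unknown addresses but also when the remote UDP encapsulation port changed; either abort looks no different than if no listener were present. The decision logic, reduced to a predicate with invented names:

#include <stdint.h>
#include <stdio.h>

struct net { uint16_t port; };

/* Abort on new addresses or a changed remote encapsulation port,
 * but never while still in COOKIE-WAIT. */
static int
must_abort(int cookie_wait, const struct net *src_net, uint16_t port,
    int new_addresses)
{
    if (cookie_wait)
        return 0;
    if (new_addresses)
        return 1;
    if (src_net != NULL && src_net->port != port)
        return 1;
    return 0;
}

int
main(void)
{
    struct net n = { .port = 9899 };

    printf("abort? %d\n", must_abort(0, &n, 0, 0)); /* port change */
    return 0;
}
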
For now we should NOT honor any extension */ stc.site_scope = stc.local_scope = stc.loopback_scope = 0; @@ -5620,11 +5649,7 @@ do_a_abort: stc.ipv6_addr_legal = 0; stc.ipv4_addr_legal = 1; } -#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE - stc.ipv4_scope = 1; -#else stc.ipv4_scope = 0; -#endif if (net == NULL) { to = src; switch (dst->sa_family) { @@ -5645,13 +5670,10 @@ do_a_abort: stc.laddr_type = SCTP_IPV4_ADDRESS; /* scope_id is only for v6 */ stc.scope_id = 0; -#ifndef SCTP_DONT_DO_PRIVADDR_SCOPE - if (IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) { + if ((IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) || + (IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) { stc.ipv4_scope = 1; } -#else - stc.ipv4_scope = 1; -#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */ /* Must use the address in this case */ if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; @@ -5667,22 +5689,24 @@ do_a_abort: { stc.addr_type = SCTP_IPV6_ADDRESS; memcpy(&stc.address, &src6->sin6_addr, sizeof(struct in6_addr)); - stc.scope_id = in6_getscope(&src6->sin6_addr); + stc.scope_id = ntohs(in6_getscope(&src6->sin6_addr)); if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; stc.local_scope = 0; stc.site_scope = 1; stc.ipv4_scope = 1; - } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr)) { + } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr) || + IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) { /* - * If the new destination is a - * LINK_LOCAL we must have common - * both site and local scope. Don't - * set local scope though since we - * must depend on the source to be - * added implicitly. We cannot - * assure just because we share one - * link that all links are common. + * If the new destination or source + * is a LINK_LOCAL we must have + * common both site and local scope. + * Don't set local scope though + * since we must depend on the + * source to be added implicitly. We + * cannot assure just because we + * share one link that all links are + * common. */ stc.local_scope = 0; stc.site_scope = 1; @@ -5698,11 +5722,12 @@ do_a_abort: * pull out the scope_id from * incoming pkt */ - } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr)) { + } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr) || + IN6_IS_ADDR_SITELOCAL(&dst6->sin6_addr)) { /* - * If the new destination is - * SITE_LOCAL then we must have site - * scope in common. + * If the new destination or source + * is SITE_LOCAL then we must have + * site scope in common. 
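The scope deduction now considers both endpoints: the IPv4 private scope is set if either the source or the destination is an RFC 1918 address, and the IPv6 link-local/site-local cases test both addresses the same way. A userland approximation of the private-address test (the kernel uses the IN4_ISPRIVATE_ADDRESS macro; the bit tests below are my own rendering of the three RFC 1918 blocks):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* a is in host byte order. */
static int
is_private(uint32_t a)
{
    return ((a >> 24) == 10) ||                  /* 10/8 */
        ((a >> 20) == ((172u << 4) | 1)) ||      /* 172.16/12 */
        ((a >> 16) == ((192u << 8) | 168));      /* 192.168/16 */
}

int
main(void)
{
    uint32_t src = ntohl(inet_addr("192.168.1.5"));
    uint32_t dst = ntohl(inet_addr("203.0.113.9"));
    int ipv4_scope = is_private(src) || is_private(dst);

    printf("ipv4_scope=%d\n", ipv4_scope);
    return 0;
}
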
*/ stc.site_scope = 1; } @@ -5806,7 +5831,7 @@ do_a_abort: /* Now lets put the SCTP header in place */ initack = mtod(m, struct sctp_init_ack_chunk *); /* Save it off for quick ref */ - stc.peers_vtag = init_chk->init.initiate_tag; + stc.peers_vtag = ntohl(init_chk->init.initiate_tag); /* who are we */ memcpy(stc.identification, SCTP_VERSION_STRING, min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification))); @@ -5876,10 +5901,10 @@ do_a_abort: his_limit = ntohs(init_chk->init.num_inbound_streams); /* choose what I want */ if (asoc != NULL) { - if (asoc->streamoutcnt > inp->sctp_ep.pre_open_stream_count) { + if (asoc->streamoutcnt > asoc->pre_open_streams) { i_want = asoc->streamoutcnt; } else { - i_want = inp->sctp_ep.pre_open_stream_count; + i_want = asoc->pre_open_streams; } } else { i_want = inp->sctp_ep.pre_open_stream_count; @@ -5897,161 +5922,182 @@ do_a_abort: /* adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { - ali = (struct sctp_adaptation_layer_indication *)((caddr_t)initack + sizeof(*initack)); + parameter_len = (uint16_t) sizeof(struct sctp_adaptation_layer_indication); + ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); - ali->ph.param_length = htons(sizeof(*ali)); - ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator); - SCTP_BUF_LEN(m) += sizeof(*ali); - ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali)); - } else { - ecn = (struct sctp_ecn_supported_param *)((caddr_t)initack + sizeof(*initack)); + ali->ph.param_length = htons(parameter_len); + ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); + chunk_len += parameter_len; } - /* ECN parameter */ - if (((asoc != NULL) && (asoc->ecn_allowed == 1)) || - (inp->sctp_ecn_enable == 1)) { - ecn->ph.param_type = htons(SCTP_ECN_CAPABLE); - ecn->ph.param_length = htons(sizeof(*ecn)); - SCTP_BUF_LEN(m) += sizeof(*ecn); - - prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn + - sizeof(*ecn)); - } else { - prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn); + if (((asoc != NULL) && (asoc->ecn_supported == 1)) || + ((asoc == NULL) && (inp->ecn_supported == 1))) { + parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); + ph->param_type = htons(SCTP_ECN_CAPABLE); + ph->param_length = htons(parameter_len); + chunk_len += parameter_len; + } + /* PR-SCTP supported parameter */ + if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || + ((asoc == NULL) && (inp->prsctp_supported == 1))) { + parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); + ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); + ph->param_length = htons(parameter_len); + chunk_len += parameter_len; } - /* And now tell the peer we do pr-sctp */ - prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED); - prsctp->ph.param_length = htons(sizeof(*prsctp)); - SCTP_BUF_LEN(m) += sizeof(*prsctp); + /* Add NAT friendly parameter */ if (nat_friendly) { - /* Add NAT friendly parameter */ - struct sctp_paramhdr *ph; - - ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + parameter_len = (uint16_t) sizeof(struct sctp_paramhdr); + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); - ph->param_length = htons(sizeof(struct sctp_paramhdr)); - SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr); + ph->param_length 
= htons(parameter_len); + chunk_len += parameter_len; } - /* And now tell the peer we do all the extensions */ - pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); - pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + /* And now tell the peer which extensions we support */ num_ext = 0; - pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; - pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; - pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; - pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; - pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; - if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) + pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); + if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || + ((asoc == NULL) && (inp->prsctp_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; + if (((asoc != NULL) && (asoc->idata_supported == 1)) || + ((asoc == NULL) && (inp->idata_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; + } + } + if (((asoc != NULL) && (asoc->auth_supported == 1)) || + ((asoc == NULL) && (inp->auth_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; - if (SCTP_BASE_SYSCTL(sctp_nr_sack_on_off)) + } + if (((asoc != NULL) && (asoc->asconf_supported == 1)) || + ((asoc == NULL) && (inp->asconf_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; + } + if (((asoc != NULL) && (asoc->reconfig_supported == 1)) || + ((asoc == NULL) && (inp->reconfig_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; + } + if (((asoc != NULL) && (asoc->idata_supported == 1)) || + ((asoc == NULL) && (inp->idata_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_IDATA; + } + if (((asoc != NULL) && (asoc->nrsack_supported == 1)) || + ((asoc == NULL) && (inp->nrsack_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; - p_len = sizeof(*pr_supported) + num_ext; - pr_supported->ph.param_length = htons(p_len); - bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len); - SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); - - /* add authentication parameters */ - if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { - struct sctp_auth_random *randp; + } + if (((asoc != NULL) && (asoc->pktdrop_supported == 1)) || + ((asoc == NULL) && (inp->pktdrop_supported == 1))) { + pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; + } + if (num_ext > 0) { + parameter_len = (uint16_t) sizeof(struct sctp_supported_chunk_types_param) + num_ext; + pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + pr_supported->ph.param_length = htons(parameter_len); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; + } + /* add authentication parameters */ + if (((asoc != NULL) && (asoc->auth_supported == 1)) || + ((asoc == NULL) && (inp->auth_supported == 1))) { + struct sctp_auth_random *randp; struct sctp_auth_hmac_algo *hmacs; struct sctp_auth_chunk_list *chunks; - uint16_t random_len; + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + padding_len = 0; + } /* generate and add RANDOM parameter */ - random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT; - randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + randp = (struct sctp_auth_random *)(mtod(m, 
caddr_t)+chunk_len); + parameter_len = (uint16_t) sizeof(struct sctp_auth_random) + + SCTP_AUTH_RANDOM_SIZE_DEFAULT; randp->ph.param_type = htons(SCTP_RANDOM); - p_len = sizeof(*randp) + random_len; - randp->ph.param_length = htons(p_len); - SCTP_READ_RANDOM(randp->random_data, random_len); - /* zero out any padding required */ - bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len); - SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + randp->ph.param_length = htons(parameter_len); + SCTP_READ_RANDOM(randp->random_data, SCTP_AUTH_RANDOM_SIZE_DEFAULT); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + padding_len = 0; + } /* add HMAC_ALGO parameter */ - hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); - p_len = sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs, + hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len); + parameter_len = (uint16_t) sizeof(struct sctp_auth_hmac_algo) + + sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs, (uint8_t *) hmacs->hmac_ids); - if (p_len > 0) { - p_len += sizeof(*hmacs); - hmacs->ph.param_type = htons(SCTP_HMAC_LIST); - hmacs->ph.param_length = htons(p_len); - /* zero out any padding required */ - bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len); - SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + hmacs->ph.param_type = htons(SCTP_HMAC_LIST); + hmacs->ph.param_length = htons(parameter_len); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; + + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + padding_len = 0; } /* add CHUNKS parameter */ - chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); - p_len = sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks, + chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len); + parameter_len = (uint16_t) sizeof(struct sctp_auth_chunk_list) + + sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks, chunks->chunk_types); - if (p_len > 0) { - p_len += sizeof(*chunks); - chunks->ph.param_type = htons(SCTP_CHUNK_LIST); - chunks->ph.param_length = htons(p_len); - /* zero out any padding required */ - bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len); - SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); - } + chunks->ph.param_type = htons(SCTP_CHUNK_LIST); + chunks->ph.param_length = htons(parameter_len); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; } - m_at = m; + SCTP_BUF_LEN(m) = chunk_len; + m_last = m; /* now the addresses */ - { - struct sctp_scoping scp; - - /* - * To optimize this we could put the scoping stuff into a - * structure and remove the individual uint8's from the stc - * structure. Then we could just sifa in the address within - * the stc.. but for now this is a quick hack to get the - * address stuff teased apart. - */ - scp.ipv4_addr_legal = stc.ipv4_addr_legal; - scp.ipv6_addr_legal = stc.ipv6_addr_legal; - scp.loopback_scope = stc.loopback_scope; - scp.ipv4_local_scope = stc.ipv4_scope; - scp.local_scope = stc.local_scope; - scp.site_scope = stc.site_scope; - m_at = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_at, cnt_inits_to, NULL, NULL); + /* + * To optimize this we could put the scoping stuff into a structure + * and remove the individual uint8's from the stc structure. Then we + * could just sifa in the address within the stc.. 
but for now this + * is a quick hack to get the address stuff teased apart. + */ + scp.ipv4_addr_legal = stc.ipv4_addr_legal; + scp.ipv6_addr_legal = stc.ipv6_addr_legal; + scp.loopback_scope = stc.loopback_scope; + scp.ipv4_local_scope = stc.ipv4_scope; + scp.local_scope = stc.local_scope; + scp.site_scope = stc.site_scope; + m_last = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_last, + cnt_inits_to, + &padding_len, &chunk_len); + /* padding_len can only be positive, if no addresses have been added */ + if (padding_len > 0) { + memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); + chunk_len += padding_len; + SCTP_BUF_LEN(m) += padding_len; + padding_len = 0; } - /* tack on the operational error if present */ if (op_err) { - struct mbuf *ol; - int llen; - - llen = 0; - ol = op_err; - - while (ol) { - llen += SCTP_BUF_LEN(ol); - ol = SCTP_BUF_NEXT(ol); + parameter_len = 0; + for (m_tmp = op_err; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { + parameter_len += SCTP_BUF_LEN(m_tmp); } - if (llen % 4) { - /* must add a pad to the param */ - uint32_t cpthis = 0; - int padlen; - - padlen = 4 - (llen % 4); - m_copyback(op_err, llen, padlen, (caddr_t)&cpthis); - } - while (SCTP_BUF_NEXT(m_at) != NULL) { - m_at = SCTP_BUF_NEXT(m_at); - } - SCTP_BUF_NEXT(m_at) = op_err; - while (SCTP_BUF_NEXT(m_at) != NULL) { - m_at = SCTP_BUF_NEXT(m_at); + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + SCTP_BUF_NEXT(m_last) = op_err; + while (SCTP_BUF_NEXT(m_last) != NULL) { + m_last = SCTP_BUF_NEXT(m_last); } + chunk_len += parameter_len; } - /* pre-calulate the size and update pkt header and chunk header */ - p_len = 0; - for (m_tmp = m; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) { - p_len += SCTP_BUF_LEN(m_tmp); - if (SCTP_BUF_NEXT(m_tmp) == NULL) { - /* m_tmp should now point to last one */ - break; + if (padding_len > 0) { + m_last = sctp_add_pad_tombuf(m_last, padding_len); + if (m_last == NULL) { + /* Houston we have a problem, no space */ + sctp_m_freem(m); + return; } + chunk_len += padding_len; + padding_len = 0; } - /* Now we must build a cookie */ m_cookie = sctp_add_cookie(init_pkt, offset, m, 0, &stc, &signature); if (m_cookie == NULL) { @@ -6060,21 +6106,22 @@ do_a_abort: return; } /* Now append the cookie to the end and update the space/size */ - SCTP_BUF_NEXT(m_tmp) = m_cookie; - - for (m_tmp = m_cookie; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) { - p_len += SCTP_BUF_LEN(m_tmp); + SCTP_BUF_NEXT(m_last) = m_cookie; + parameter_len = 0; + for (m_tmp = m_cookie; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { + parameter_len += SCTP_BUF_LEN(m_tmp); if (SCTP_BUF_NEXT(m_tmp) == NULL) { - /* m_tmp should now point to last one */ - mp_last = m_tmp; - break; + m_last = m_tmp; } } + padding_len = SCTP_SIZE32(parameter_len) - parameter_len; + chunk_len += parameter_len; + /* * Place in the size, but we don't include the last pad (if any) in * the INIT-ACK. */ - initack->ch.chunk_length = htons(p_len); + initack->ch.chunk_length = htons(chunk_len); /* * Time to sign the cookie, we don't sign over the cookie signature @@ -6088,11 +6135,8 @@ do_a_abort: * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return * here since the timer will drive a retranmission. 
*/ - padval = p_len % 4; - if ((padval) && (mp_last)) { - /* see my previous comments on mp_last */ - if (sctp_add_pad_tombuf(mp_last, (4 - padval))) { - /* Houston we have a problem, no space */ + if (padding_len > 0) { + if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } @@ -6107,7 +6151,7 @@ do_a_abort: 0, 0, inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag, port, over_addr, - use_mflowid, mflowid, + mflowtype, mflowid, SCTP_SO_NOT_LOCKED); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } @@ -6123,7 +6167,7 @@ sctp_prune_prsctp(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, *nchk; SCTP_TCB_LOCK_ASSERT(stcb); - if ((asoc->peer_supports_prsctp) && + if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { /* @@ -6165,7 +6209,7 @@ sctp_prune_prsctp(struct sctp_tcb *stcb, return; } } /* if chunk was present */ - } /* if of sufficent priority */ + } /* if of sufficient priority */ } /* if chunk has enabled */ } /* tailqforeach */ @@ -6206,11 +6250,15 @@ sctp_get_frag_point(struct sctp_tcb *stcb, * we use a larger frag point. */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MED_OVERHEAD; + ovh = SCTP_MIN_OVERHEAD; } else { - ovh = SCTP_MED_V4_OVERHEAD; + ovh = SCTP_MIN_V4_OVERHEAD; + } + if (stcb->asoc.idata_supported) { + ovh += sizeof(struct sctp_idata_chunk); + } else { + ovh += sizeof(struct sctp_data_chunk); } - if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu) siz = asoc->smallest_mtu - ovh; else @@ -6335,6 +6383,7 @@ sctp_msg_append(struct sctp_tcb *stcb, sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; + sp->fsn = 0; if (sp->sinfo_flags & SCTP_ADDR_OVER) { sp->net = net; atomic_add_int(&sp->net->ref_count, 1); @@ -6420,7 +6469,7 @@ error_out: if (outchain == NULL) { /* This is the general case */ new_mbuf: - outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER); + outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (outchain == NULL) { goto error_out; } @@ -6453,10 +6502,10 @@ error_out: } } /* get the new end of length */ - len = M_TRAILINGSPACE(*endofchain); + len = (int)M_TRAILINGSPACE(*endofchain); } else { /* how much is left at the end? 
*/ - len = M_TRAILINGSPACE(*endofchain); + len = (int)M_TRAILINGSPACE(*endofchain); } /* Find the end of the data, for appending */ cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain))); @@ -6474,7 +6523,7 @@ error_out: /* now we need another one */ sizeofcpy -= len; } - m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER); + m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (m == NULL) { /* We failed */ goto error_out; @@ -6488,16 +6537,10 @@ error_out: return (outchain); } else { /* copy the old fashion way */ - appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_DONTWAIT); + appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_NOWAIT); #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = appendchain; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(appendchain, SCTP_MBUF_ICOPY); } #endif } @@ -6523,7 +6566,7 @@ error_out: } } /* - * save off the end and update the end-chain postion + * save off the end and update the end-chain position */ m = appendchain; while (m) { @@ -6535,7 +6578,7 @@ error_out: } return (outchain); } else { - /* save off the end and update the end-chain postion */ + /* save off the end and update the end-chain position */ m = appendchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { @@ -6582,7 +6625,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, return; } if (ca->sndlen > 0) { - m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_DONTWAIT); + m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_NOWAIT); if (m == NULL) { /* can't copy so we are done */ ca->cnt_failed++; @@ -6590,13 +6633,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(m, SCTP_MBUF_ICOPY); } #endif } else { @@ -6622,7 +6659,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, ph = mtod(m, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); - ph->param_length = htons(sizeof(struct sctp_paramhdr) + ca->sndlen); + ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + ca->sndlen)); } /* * We add one here to keep the assoc from dis-appearing on @@ -6652,14 +6689,10 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, asoc = &stcb->asoc; if (ca->sndrcv.sinfo_flags & SCTP_EOF) { /* shutdown this assoc */ - int cnt; - - cnt = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED); - if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && - (cnt == 0)) { - if (asoc->locked_on_sending) { + sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED) == 0) { + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* @@ -6701,27 +6734,24 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { - if (asoc->locked_on_sending) { - /* - * Locked to send out the - * data - */ - struct sctp_stream_queue_pending *sp; - - sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, 
sctp_streamhead); - if (sp) { - if ((sp->length == 0) && (sp->msg_is_complete == 0)) - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - } + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + abort_anyway: + snprintf(msg, sizeof(msg), + "%s:%d at %s", __FILE__, __LINE__, __func__); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + msg); atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(stcb->sctp_ep, stcb, - NULL, SCTP_SO_NOT_LOCKED); + op_err, SCTP_SO_NOT_LOCKED); atomic_add_int(&stcb->asoc.refcnt, -1); goto no_chunk_output; } @@ -6743,7 +6773,7 @@ sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, if (do_chunk_output) sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED); else if (added_control) { - int num_out = 0, reason = 0, now_filled = 0; + int num_out, reason, now_filled = 0; struct timeval now; int frag_point; @@ -6768,7 +6798,7 @@ sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED) /* * Do a notify here? Kacheong suggests that the notify be done at * the send time.. so you would push up a notification if any send - * failed. Don't know if this is feasable since the only failures we + * failed. Don't know if this is feasible since the only failures we * have is "memory" related and if you cannot get an mbuf to send * the data you surely can't get an mbuf to send up to notify the * user you can't send the data :-> @@ -6779,20 +6809,13 @@ sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED) SCTP_FREE(ca, SCTP_M_COPYAL); } - -#define MC_ALIGN(m, len) do { \ - SCTP_BUF_RESV_UF(m, ((MCLBYTES - (len)) & ~(sizeof(long) - 1)); \ -} while (0) - - - static struct mbuf * sctp_copy_out_all(struct uio *uio, int len) { struct mbuf *ret, *at; int left, willcpy, cancpy, error; - ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAIT, 1, MT_DATA); + ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA); if (ret == NULL) { /* TSNH */ return (NULL); @@ -6800,7 +6823,7 @@ sctp_copy_out_all(struct uio *uio, int len) left = len; SCTP_BUF_LEN(ret) = 0; /* save space for the data chunk header */ - cancpy = M_TRAILINGSPACE(ret); + cancpy = (int)M_TRAILINGSPACE(ret); willcpy = min(cancpy, left); at = ret; while (left > 0) { @@ -6815,13 +6838,13 @@ sctp_copy_out_all(struct uio *uio, int len) SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0; left -= willcpy; if (left > 0) { - SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAIT, 1, MT_DATA); + SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAITOK, 1, MT_DATA); if (SCTP_BUF_NEXT(at) == NULL) { goto err_out_now; } at = SCTP_BUF_NEXT(at); SCTP_BUF_LEN(at) = 0; - cancpy = M_TRAILINGSPACE(at); + cancpy = (int)M_TRAILINGSPACE(at); willcpy = min(cancpy, left); } } @@ -6855,7 +6878,7 @@ sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; /* get length and mbuf chain */ if (uio) { - ca->sndlen = uio->uio_resid; + ca->sndlen = (int)uio->uio_resid; ca->m = sctp_copy_out_all(uio, ca->sndlen); if (ca->m == NULL) { SCTP_FREE(ca, SCTP_M_COPYAL); @@ -7005,7 +7028,7 @@ all_done: sctp_misc_ints(SCTP_FLIGHT_LOG_UP, data_list[i]->whoTo->flight_size, data_list[i]->book_size, - (uintptr_t) data_list[i]->whoTo, + (uint32_t) 
(uintptr_t) data_list[i]->whoTo, data_list[i]->rec.data.TSN_seq); } sctp_flight_size_increase(data_list[i]); @@ -7135,7 +7158,6 @@ sctp_move_to_outqueue(struct sctp_tcb *stcb, struct sctp_stream_out *strq, uint32_t goal_mtu, uint32_t frag_point, - int *locked, int *giveup, int eeor_mode, int *bail, @@ -7149,8 +7171,10 @@ sctp_move_to_outqueue(struct sctp_tcb *stcb, struct sctp_association *asoc; struct sctp_stream_queue_pending *sp; struct sctp_tmit_chunk *chk; - struct sctp_data_chunk *dchkh; + struct sctp_data_chunk *dchkh = NULL; + struct sctp_idata_chunk *ndchkh = NULL; uint32_t to_move, length; + int leading; uint8_t rcv_flags = 0; uint8_t some_taken; uint8_t send_lock_up = 0; @@ -7161,7 +7185,6 @@ one_more_time: /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&strq->outqueue); if (sp == NULL) { - *locked = 0; if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; @@ -7170,7 +7193,9 @@ one_more_time: if (sp) { goto one_more_time; } - if (strq->last_msg_incomplete) { + if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_EXPLICIT_EOR) == 0) && + (stcb->asoc.idata_supported == 0) && + (strq->last_msg_incomplete)) { SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n", strq->stream_no, strq->last_msg_incomplete); @@ -7206,6 +7231,11 @@ one_more_time: atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); + if ((strq->state == SCTP_STREAM_RESET_PENDING) && + (strq->chunks_on_queues == 0) && + TAILQ_EMPTY(&strq->outqueue)) { + stcb->asoc.trigger_reset = 1; + } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; @@ -7216,8 +7246,6 @@ one_more_time: } sctp_free_a_strmoq(stcb, sp, so_locked); /* we can't be locked to it */ - *locked = 0; - stcb->asoc.locked_on_sending = NULL; if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); send_lock_up = 0; @@ -7229,7 +7257,6 @@ one_more_time: * sender just finished this but still holds a * reference */ - *locked = 1; *giveup = 1; to_move = 0; goto out_of; @@ -7238,7 +7265,6 @@ one_more_time: /* is there some to get */ if (sp->length == 0) { /* no */ - *locked = 1; *giveup = 1; to_move = 0; goto out_of; @@ -7249,7 +7275,7 @@ one_more_time: } /* Whack down the size */ atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length); - if ((stcb->sctp_socket != NULL) && \ + if ((stcb->sctp_socket != NULL) && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length); @@ -7261,16 +7287,12 @@ one_more_time: } sp->length = 0; sp->some_taken = 1; - *locked = 1; *giveup = 1; to_move = 0; goto out_of; } } some_taken = sp->some_taken; - if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { - sp->msg_is_complete = 1; - } re_look: length = sp->length; if (sp->msg_is_complete) { @@ -7280,10 +7302,12 @@ re_look: /* All of it fits in the MTU */ if (sp->some_taken) { rcv_flags |= SCTP_DATA_LAST_FRAG; - sp->put_last_out = 1; } else { rcv_flags |= SCTP_DATA_NOT_FRAG; - sp->put_last_out = 1; + } + sp->put_last_out = 1; + if (sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) { + rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } } else { /* Not all of it fits, we fragment */ @@ -7326,9 +7350,6 @@ re_look: } } else { /* Nothing to take. 
*/ - if (sp->some_taken) { - *locked = 1; - } *giveup = 1; to_move = 0; goto out_of; @@ -7350,8 +7371,8 @@ re_look: if (sp->sinfo_flags & SCTP_UNORDERED) { rcv_flags |= SCTP_DATA_UNORDERED; } - if ((SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && ((sp->sinfo_flags & SCTP_EOF) == SCTP_EOF)) || - ((sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) == SCTP_SACK_IMMEDIATELY)) { + if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && + (sp->sinfo_flags & SCTP_EOF) == SCTP_EOF) { rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } /* clear out the chunk before setting up */ @@ -7376,7 +7397,7 @@ re_look: struct mbuf *m; dont_do_it: - chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_DONTWAIT); + chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_NOWAIT); chk->last_mbuf = NULL; if (chk->data == NULL) { sp->some_taken = some_taken; @@ -7387,13 +7408,7 @@ dont_do_it: } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = chk->data; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(chk->data, SCTP_MBUF_ICOPY); } #endif /* Pull off the data */ @@ -7428,7 +7443,7 @@ dont_do_it: chk->copy_by_ref = 0; } /* - * get last_mbuf and counts of mb useage This is ugly but hopefully + * get last_mbuf and counts of mb usage This is ugly but hopefully * its only one mbuf. */ if (chk->last_mbuf == NULL) { @@ -7451,11 +7466,16 @@ dont_do_it: } else { atomic_subtract_int(&sp->length, to_move); } - if (M_LEADINGSPACE(chk->data) < (int)sizeof(struct sctp_data_chunk)) { + if (stcb->asoc.idata_supported == 0) { + leading = sizeof(struct sctp_data_chunk); + } else { + leading = sizeof(struct sctp_idata_chunk); + } + if (M_LEADINGSPACE(chk->data) < leading) { /* Not enough room for a chunk header, get some */ struct mbuf *m; - m = sctp_get_mbuf_for_msg(1, 0, M_DONTWAIT, 0, MT_DATA); + m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 0, MT_DATA); if (m == NULL) { /* * we're in trouble here. _PREPEND below will free @@ -7466,7 +7486,7 @@ dont_do_it: SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } - if (chk->data == NULL) { + if (sp->data == NULL) { /* unsteal the data */ sp->data = chk->data; sp->tail_mbuf = chk->last_mbuf; @@ -7492,7 +7512,11 @@ dont_do_it: M_ALIGN(chk->data, 4); } } - SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_DONTWAIT); + if (stcb->asoc.idata_supported == 0) { + SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_NOWAIT); + } else { + SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_idata_chunk), M_NOWAIT); + } if (chk->data == NULL) { /* HELP, TSNH since we assured it would not above? 
*/ #ifdef INVARIANTS @@ -7505,8 +7529,13 @@ dont_do_it: to_move = 0; goto out_of; } - sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk)); - chk->book_size = chk->send_size = (to_move + sizeof(struct sctp_data_chunk)); + if (stcb->asoc.idata_supported == 0) { + sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk)); + chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_data_chunk)); + } else { + sctp_snd_sb_alloc(stcb, sizeof(struct sctp_idata_chunk)); + chk->book_size = chk->send_size = (uint16_t) (to_move + sizeof(struct sctp_idata_chunk)); + } chk->book_size_scale = 0; chk->sent = SCTP_DATAGRAM_UNSENT; @@ -7514,10 +7543,28 @@ dont_do_it: chk->asoc = &stcb->asoc; chk->pad_inplace = 0; chk->no_fr_allowed = 0; - chk->rec.data.stream_seq = strq->next_sequence_send; - if ((rcv_flags & SCTP_DATA_LAST_FRAG) && - !(rcv_flags & SCTP_DATA_UNORDERED)) { - strq->next_sequence_send++; + if (stcb->asoc.idata_supported == 0) { + if (rcv_flags & SCTP_DATA_UNORDERED) { + /* Just use 0. The receiver ignores the values. */ + chk->rec.data.stream_seq = 0; + } else { + chk->rec.data.stream_seq = strq->next_mid_ordered; + if (rcv_flags & SCTP_DATA_LAST_FRAG) { + strq->next_mid_ordered++; + } + } + } else { + if (rcv_flags & SCTP_DATA_UNORDERED) { + chk->rec.data.stream_seq = strq->next_mid_unordered; + if (rcv_flags & SCTP_DATA_LAST_FRAG) { + strq->next_mid_unordered++; + } + } else { + chk->rec.data.stream_seq = strq->next_mid_ordered; + if (rcv_flags & SCTP_DATA_LAST_FRAG) { + strq->next_mid_ordered++; + } + } } chk->rec.data.stream_number = sp->stream; chk->rec.data.payloadtype = sp->ppid; @@ -7541,11 +7588,15 @@ dont_do_it: chk->rec.data.TSN_seq = atomic_fetchadd_int(&asoc->sending_seq, 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) { sctp_misc_ints(SCTP_STRMOUT_LOG_SEND, - (uintptr_t) stcb, sp->length, + (uint32_t) (uintptr_t) stcb, sp->length, (uint32_t) ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq), chk->rec.data.TSN_seq); } - dchkh = mtod(chk->data, struct sctp_data_chunk *); + if (stcb->asoc.idata_supported == 0) { + dchkh = mtod(chk->data, struct sctp_data_chunk *); + } else { + ndchkh = mtod(chk->data, struct sctp_idata_chunk *); + } /* * Put the rest of the things in place now. Size was done earlier in * previous loop prior to padding. 
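The hunk below fills in one of two wire headers, depending on whether I-DATA support (RFC 8260) was negotiated for the association. As a reading aid, here is a standalone sketch of the two chunk-header layouts; the struct and field names follow RFC terminology rather than the kernel's exact definitions, so treat them as illustrative only.

#include <stdint.h>
#include <stdio.h>

/* RFC 4960 DATA chunk header: a 16-bit stream sequence number (SSN),
 * meaningful only for ordered delivery. */
struct data_hdr {
	uint8_t  type;		/* 0 = DATA */
	uint8_t  flags;		/* E=0x01, B=0x02, U=0x04, I (SACK now)=0x08 */
	uint16_t length;
	uint32_t tsn;
	uint16_t stream_id;
	uint16_t ssn;
	uint32_t ppid;
};

/* RFC 8260 I-DATA chunk header: a 32-bit message identifier (MID)
 * replaces the SSN and is assigned to unordered messages as well,
 * which is why the code above keeps separate next_mid_ordered and
 * next_mid_unordered counters.  One 32-bit word doubles as the PPID on
 * the first fragment and the FSN on later ones -- the sp->fsn == 0
 * test in the hunk below picks between the two. */
struct idata_hdr {
	uint8_t  type;		/* 64 = I-DATA */
	uint8_t  flags;
	uint16_t length;
	uint32_t tsn;
	uint16_t stream_id;
	uint16_t reserved;
	uint32_t mid;
	union {
		uint32_t ppid;	/* first fragment only */
		uint32_t fsn;	/* fragment sequence number otherwise */
	} ppid_fsn;
};

int
main(void)
{
	/* 16 and 20 bytes on the usual ABIs; real wire code would pack or
	 * serialize field by field rather than rely on struct layout. */
	printf("DATA %zu, I-DATA %zu\n",
	    sizeof(struct data_hdr), sizeof(struct idata_hdr));
	return (0);
}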
@@ -7567,14 +7618,28 @@ dont_do_it: asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2; asoc->tsn_out_at++; #endif - - dchkh->ch.chunk_type = SCTP_DATA; - dchkh->ch.chunk_flags = chk->rec.data.rcv_flags; - dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); - dchkh->dp.stream_id = htons(strq->stream_no); - dchkh->dp.stream_sequence = htons(chk->rec.data.stream_seq); - dchkh->dp.protocol_id = chk->rec.data.payloadtype; - dchkh->ch.chunk_length = htons(chk->send_size); + if (stcb->asoc.idata_supported == 0) { + dchkh->ch.chunk_type = SCTP_DATA; + dchkh->ch.chunk_flags = chk->rec.data.rcv_flags; + dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); + dchkh->dp.stream_id = htons((strq->stream_no & 0x0000ffff)); + dchkh->dp.stream_sequence = htons((uint16_t) chk->rec.data.stream_seq); + dchkh->dp.protocol_id = chk->rec.data.payloadtype; + dchkh->ch.chunk_length = htons(chk->send_size); + } else { + ndchkh->ch.chunk_type = SCTP_IDATA; + ndchkh->ch.chunk_flags = chk->rec.data.rcv_flags; + ndchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); + ndchkh->dp.stream_id = htons(strq->stream_no); + ndchkh->dp.reserved = htons(0); + ndchkh->dp.msg_id = htonl(chk->rec.data.stream_seq); + if (sp->fsn == 0) + ndchkh->dp.ppid_fsn.protocol_id = chk->rec.data.payloadtype; + else + ndchkh->dp.ppid_fsn.fsn = htonl(sp->fsn); + sp->fsn++; + ndchkh->ch.chunk_length = htons(chk->send_size); + } /* Now advance the chk->send_size by the actual pad needed. */ if (chk->send_size < SCTP_SIZE32(chk->book_size)) { /* need a pad */ @@ -7582,12 +7647,10 @@ dont_do_it: int pads; pads = SCTP_SIZE32(chk->book_size) - chk->send_size; - if (sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf) == 0) { - chk->pad_inplace = 1; - } - if ((lm = SCTP_BUF_NEXT(chk->last_mbuf)) != NULL) { - /* pad added an mbuf */ + lm = sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf); + if (lm != NULL) { chk->last_mbuf = lm; + chk->pad_inplace = 1; } chk->send_size += pads; } @@ -7596,7 +7659,6 @@ dont_do_it: } if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) { /* All done pull and kill the message */ - atomic_subtract_int(&asoc->stream_queue_cnt, 1); if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", @@ -7610,8 +7672,14 @@ dont_do_it: SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } + atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); + if ((strq->state == SCTP_STREAM_RESET_PENDING) && + (strq->chunks_on_queues == 0) && + TAILQ_EMPTY(&strq->outqueue)) { + stcb->asoc.trigger_reset = 1; + } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; @@ -7621,13 +7689,6 @@ dont_do_it: sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); - - /* we can't be locked to it */ - *locked = 0; - stcb->asoc.locked_on_sending = NULL; - } else { - /* more to go, we are locked */ - *locked = 1; } asoc->chunks_on_out_queue++; strq->chunks_on_queues++; @@ -7652,7 +7713,7 @@ sctp_fill_outqueue(struct sctp_tcb *stcb, struct sctp_association *asoc; struct sctp_stream_out *strq; int goal_mtu, moved_how_much, total_moved = 0, bail = 0; - int locked, giveup; + int giveup; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; @@ -7673,40 +7734,28 @@ sctp_fill_outqueue(struct sctp_tcb *stcb, break; } /* Need an allowance for the data chunk header too */ - goal_mtu -= sizeof(struct sctp_data_chunk); + if (stcb->asoc.idata_supported == 0) { 
+ goal_mtu -= sizeof(struct sctp_data_chunk); + } else { + goal_mtu -= sizeof(struct sctp_idata_chunk); + } /* must make even word boundary */ goal_mtu &= 0xfffffffc; - if (asoc->locked_on_sending) { - /* We are stuck on one stream until the message completes. */ - strq = asoc->locked_on_sending; - locked = 1; - } else { - strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); - locked = 0; - } + strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); while ((goal_mtu > 0) && strq) { giveup = 0; bail = 0; - moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &locked, + moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &giveup, eeor_mode, &bail, so_locked); - if (moved_how_much) - stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved_how_much); + stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved_how_much); - if (locked) { - asoc->locked_on_sending = strq; - if ((moved_how_much == 0) || (giveup) || bail) - /* no more to move for now */ - break; - } else { - asoc->locked_on_sending = NULL; - if ((giveup) || bail) { - break; - } - strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); - if (strq == NULL) { - break; - } + if ((giveup) || bail) { + break; + } + strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); + if (strq == NULL) { + break; } total_moved += moved_how_much; goal_mtu -= (moved_how_much + sizeof(struct sctp_data_chunk)); @@ -7784,12 +7833,15 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, { /** * Ok this is the generic chunk service queue. we must do the - * following: - Service the stream queue that is next, moving any - * message (note I must get a complete message i.e. FIRST/MIDDLE and - * LAST to the out queue in one pass) and assigning TSN's - Check to - * see if the cwnd/rwnd allows any output, if so we go ahead and - * fomulate and send the low level chunks. Making sure to combine - * any control in the control chunk queue also. + * following: + * - Service the stream queue that is next, moving any + * message (note I must get a complete message i.e. FIRST/MIDDLE and + * LAST to the out queue in one pass) and assigning TSN's. This + * only applys though if the peer does not support NDATA. For NDATA + * chunks its ok to not send the entire message ;-) + * - Check to see if the cwnd/rwnd allows any output, if so we go ahead and + * fomulate and send the low level chunks. Making sure to combine + * any control in the control chunk queue also. 
*/ struct sctp_nets *net, *start_at, *sack_goes_to = NULL, *old_start_at = NULL; struct mbuf *outchain, *endoutchain; @@ -7818,8 +7870,8 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, int quit_now = 0; *num_out = 0; + *reason_code = 0; auth_keyid = stcb->asoc.authinfo.active_keyid; - if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) || (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { @@ -7838,7 +7890,7 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, #endif SCTP_TCB_LOCK_ASSERT(stcb); hbflag = 0; - if ((control_only) || (asoc->stream_reset_outstanding)) + if (control_only) no_data_chunks = 1; else no_data_chunks = 0; @@ -7848,7 +7900,7 @@ sctp_med_chunk_output(struct sctp_inpcb *inp, (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) && TAILQ_EMPTY(&asoc->asconf_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && - stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { + sctp_is_there_unsent_data(stcb, so_locked) == 0) { nothing_to_send: *reason_code = 9; return (0); @@ -8006,31 +8058,15 @@ again_one_more_time: } else { skip_data_for_this_net = 0; } - if ((net->ro.ro_rt) && (net->ro.ro_rt->rt_ifp)) { - /* - * if we have a route and an ifp check to see if we - * have room to send to this guy - */ - struct ifnet *ifp; - - ifp = net->ro.ro_rt->rt_ifp; - if ((ifp->if_snd.ifq_len + 2) >= ifp->if_snd.ifq_maxlen) { - SCTP_STAT_INCR(sctps_ifnomemqueued); - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { - sctp_log_maxburst(stcb, net, ifp->if_snd.ifq_len, ifp->if_snd.ifq_maxlen, SCTP_MAX_IFP_APPLIED); - } - continue; - } - } switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: - mtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr)); + mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: - mtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)); + mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: @@ -8052,6 +8088,7 @@ again_one_more_time: } else { r_mtu = mtu; } + error = 0; /************************/ /* ASCONF transmission */ /************************/ @@ -8175,6 +8212,12 @@ again_one_more_time: * it is used to do appropriate * source address selection. 
*/ + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(now); + *now_filled = 1; + } + net->last_sent_time = *now; + hbflag = 0; if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, @@ -8185,21 +8228,18 @@ again_one_more_time: net->port, NULL, 0, 0, so_locked))) { - if (error == ENOBUFS) { - asoc->ifp_had_enobuf = 1; - SCTP_STAT_INCR(sctps_lowlevelerr); - } + /* + * error, we could not + * output + */ + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; - } else { - net->last_sent_time = *now; + if (error == ENOBUFS) { + asoc->ifp_had_enobuf = 1; + SCTP_STAT_INCR(sctps_lowlevelerr); } - hbflag = 0; /* error, could not output */ if (error == EHOSTUNREACH) { /* @@ -8210,17 +8250,10 @@ again_one_more_time: sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; - continue; - } else - asoc->ifp_had_enobuf = 0; - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; + break; } else { - net->last_sent_time = *now; + asoc->ifp_had_enobuf = 0; } - hbflag = 0; /* * increase the number we sent, if a * cookie is sent we don't tell them @@ -8253,6 +8286,10 @@ again_one_more_time: } } } + if (error != 0) { + /* try next net */ + continue; + } /************************/ /* Control transmission */ /************************/ @@ -8391,7 +8428,8 @@ again_one_more_time: /* turn off the timer */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, - inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1); + inp, stcb, net, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1); } } ctl_cnt++; @@ -8448,6 +8486,15 @@ again_one_more_time: sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); cookie = 0; } + /* Only HB or ASCONF advances time */ + if (hbflag) { + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(now); + *now_filled = 1; + } + net->last_sent_time = *now; + hbflag = 0; + } if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, @@ -8459,23 +8506,17 @@ again_one_more_time: net->port, NULL, 0, 0, so_locked))) { - if (error == ENOBUFS) { - asoc->ifp_had_enobuf = 1; - SCTP_STAT_INCR(sctps_lowlevelerr); - } + /* + * error, we could not + * output + */ + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } - /* error, could not output */ - if (hbflag) { - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; - } else { - net->last_sent_time = *now; - } - hbflag = 0; + if (error == ENOBUFS) { + asoc->ifp_had_enobuf = 1; + SCTP_STAT_INCR(sctps_lowlevelerr); } if (error == EHOSTUNREACH) { /* @@ -8486,19 +8527,9 @@ again_one_more_time: sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; - continue; - } else + break; + } else { asoc->ifp_had_enobuf = 0; - /* Only HB or ASCONF advances time */ - if (hbflag) { - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; - } else { - net->last_sent_time = *now; - } - hbflag = 0; } /* * increase the number we sent, if a @@ -8532,6 +8563,10 @@ again_one_more_time: } } } + if (error != 0) { + /* try next net */ + continue; + } /* JRI: if dest is 
in PF state, do not send data to it */ if ((asoc->sctp_cmt_on_off > 0) && (net != stcb->asoc.alternate) && @@ -8576,16 +8611,16 @@ again_one_more_time: switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: - if (net->mtu > (sizeof(struct ip) + sizeof(struct sctphdr))) - omtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr)); + if (net->mtu > SCTP_MIN_V4_OVERHEAD) + omtu = net->mtu - SCTP_MIN_V4_OVERHEAD; else omtu = 0; break; #endif #ifdef INET6 case AF_INET6: - if (net->mtu > (sizeof(struct ip6_hdr) + sizeof(struct sctphdr))) - omtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)); + if (net->mtu > SCTP_MIN_OVERHEAD) + omtu = net->mtu - SCTP_MIN_OVERHEAD; else omtu = 0; break; @@ -8595,7 +8630,8 @@ again_one_more_time: omtu = 0; break; } - if ((((asoc->state & SCTP_STATE_OPEN) == SCTP_STATE_OPEN) && + if ((((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (skip_data_for_this_net == 0)) || (cookie)) { TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { @@ -8785,6 +8821,14 @@ no_data_fill: */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } + if (bundle_at || hbflag) { + /* For data/asconf and hb set time */ + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(now); + *now_filled = 1; + } + net->last_sent_time = *now; + } /* Now send it, if there is anything to send :> */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, @@ -8803,23 +8847,13 @@ no_data_fill: 0, 0, so_locked))) { /* error, we could not output */ - if (error == ENOBUFS) { - SCTP_STAT_INCR(sctps_lowlevelerr); - asoc->ifp_had_enobuf = 1; - } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } - SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); - if (hbflag) { - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; - } else { - net->last_sent_time = *now; - } - hbflag = 0; + if (error == ENOBUFS) { + SCTP_STAT_INCR(sctps_lowlevelerr); + asoc->ifp_had_enobuf = 1; } if (error == EHOSTUNREACH) { /* @@ -8844,16 +8878,6 @@ no_data_fill: endoutchain = NULL; auth = NULL; auth_offset = 0; - if (bundle_at || hbflag) { - /* For data/asconf and hb set time */ - if (*now_filled == 0) { - (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); - *now_filled = 1; - *now = net->last_sent_time; - } else { - net->last_sent_time = *now; - } - } if (!no_out_cnt) { *num_out += (ctl_cnt + bundle_at); } @@ -8914,9 +8938,37 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) */ struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; - struct mbuf *mat; + struct mbuf *mat, *last_mbuf; + uint32_t chunk_length; + uint16_t padding_length; SCTP_TCB_LOCK_ASSERT(stcb); + SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_NOWAIT); + if (op_err == NULL) { + return; + } + last_mbuf = NULL; + chunk_length = 0; + for (mat = op_err; mat != NULL; mat = SCTP_BUF_NEXT(mat)) { + chunk_length += SCTP_BUF_LEN(mat); + if (SCTP_BUF_NEXT(mat) == NULL) { + last_mbuf = mat; + } + } + if (chunk_length > SCTP_MAX_CHUNK_LENGTH) { + sctp_m_freem(op_err); + return; + } + padding_length = chunk_length % 4; + if (padding_length != 0) { + padding_length = 4 - padding_length; + } + if (padding_length != 0) { + if (sctp_add_pad_tombuf(last_mbuf, padding_length) == NULL) { + sctp_m_freem(op_err); + return; + } + } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ @@ -8924,32 
+8976,19 @@ sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) return; } chk->copy_by_ref = 0; - SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_DONTWAIT); - if (op_err == NULL) { - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); - return; - } - chk->send_size = 0; - mat = op_err; - while (mat != NULL) { - chk->send_size += SCTP_BUF_LEN(mat); - mat = SCTP_BUF_NEXT(mat); - } - chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; - chk->rec.chunk_id.can_take_data = 1; + chk->send_size = (uint16_t) chunk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = op_err; chk->whoTo = NULL; + chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; + chk->rec.chunk_id.can_take_data = 0; hdr = mtod(op_err, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_OPERATION_ERROR; hdr->chunk_flags = 0; hdr->chunk_length = htons(chk->send_size); - TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, - chk, - sctp_next); + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } @@ -8970,12 +9009,11 @@ sctp_send_cookie_echo(struct mbuf *m, struct sctp_tmit_chunk *chk; uint16_t ptype, plen; + SCTP_TCB_LOCK_ASSERT(stcb); /* First find the cookie in the param area */ cookie = NULL; at = offset + sizeof(struct sctp_init_chunk); - - SCTP_TCB_LOCK_ASSERT(stcb); - do { + for (;;) { phdr = sctp_get_next_param(m, at, &parm, sizeof(parm)); if (phdr == NULL) { return (-3); @@ -8989,32 +9027,21 @@ sctp_send_cookie_echo(struct mbuf *m, if ((pad = (plen % 4))) { plen += 4 - pad; } - cookie = SCTP_M_COPYM(m, at, plen, M_DONTWAIT); + cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT); if (cookie == NULL) { /* No memory */ return (-2); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = cookie; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(cookie, SCTP_MBUF_ICOPY); } #endif break; } at += SCTP_SIZE32(plen); - } while (phdr); - if (cookie == NULL) { - /* Did not find the cookie */ - return (-3); } /* ok, we got the cookie lets change it into a cookie echo chunk */ - /* first the change from param to cookie */ hdr = mtod(cookie, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_COOKIE_ECHO; @@ -9027,12 +9054,12 @@ sctp_send_cookie_echo(struct mbuf *m, return (-5); } chk->copy_by_ref = 0; - chk->send_size = plen; chk->rec.chunk_id.id = SCTP_COOKIE_ECHO; chk->rec.chunk_id.can_take_data = 0; + chk->flags = CHUNK_FLAGS_FRAGMENT_OK; + chk->send_size = plen; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->asoc = &stcb->asoc; chk->data = cookie; chk->whoTo = net; @@ -9061,20 +9088,14 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb, /* must have a net pointer */ return; - outchain = SCTP_M_COPYM(m, offset, chk_length, M_DONTWAIT); + outchain = SCTP_M_COPYM(m, offset, chk_length, M_NOWAIT); if (outchain == NULL) { /* gak out of memory */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = outchain; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(outchain, SCTP_MBUF_ICOPY); } #endif chdr = mtod(outchain, struct sctp_chunkhdr *); @@ -9095,12 +9116,12 @@ sctp_send_heartbeat_ack(struct sctp_tcb *stcb, return; } chk->copy_by_ref = 0; - chk->send_size = chk_length; 
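Stepping back to the sctp_queue_op_err() rework above: it now walks the error-cause mbuf chain once, summing SCTP_BUF_LEN() while remembering the tail mbuf (where any pad must land), rejects anything over SCTP_MAX_CHUNK_LENGTH before a chunk is allocated, and only then pads to a 4-byte boundary. The INIT-ACK builder earlier in this patch plays the same game with its chunk_len/padding_len pair, deferring each parameter's pad until the next write so the trailing pad never inflates the advertised length. A minimal userland sketch of the walk and the mod-4 pad arithmetic, with a plain linked list standing in for mbufs (the types and names here are hypothetical, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct buf {			/* stand-in for struct mbuf */
	struct buf *next;
	int len;
};

/* Sum the chain, remember its tail, and compute the pad that rounds
 * the chunk up to a 4-byte boundary (RFC 4960 padding rule). */
static uint32_t
chunk_length_and_pad(struct buf *m, struct buf **lastp, uint16_t *padp)
{
	uint32_t chunk_length = 0;
	struct buf *last = NULL;

	for (; m != NULL; m = m->next) {
		chunk_length += (uint32_t)m->len;
		last = m;
	}
	*lastp = last;
	*padp = (uint16_t)((4 - (chunk_length % 4)) % 4);
	return (chunk_length);
}

int
main(void)
{
	struct buf b2 = { NULL, 7 }, b1 = { &b2, 6 };
	struct buf *last;
	uint16_t pad;

	/* 6 + 7 = 13 bytes -> pad of 3 reaches the 16-byte boundary */
	printf("len %u pad %u tail %d\n",
	    chunk_length_and_pad(&b1, &last, &pad), (unsigned)pad, last->len);
	return (0);
}

The kernel spells the pad as chunk_length % 4 followed by a conditional subtraction from 4; the (4 - len % 4) % 4 form above is the same arithmetic folded into one expression.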
chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; + chk->send_size = chk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = outchain; chk->whoTo = net; @@ -9119,7 +9140,7 @@ sctp_send_cookie_ack(struct sctp_tcb *stcb) SCTP_TCB_LOCK_ASSERT(stcb); - cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER); + cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (cookie_ack == NULL) { /* no mbuf's */ return; @@ -9132,12 +9153,12 @@ sctp_send_cookie_ack(struct sctp_tcb *stcb) return; } chk->copy_by_ref = 0; - chk->send_size = sizeof(struct sctp_chunkhdr); chk->rec.chunk_id.id = SCTP_COOKIE_ACK; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; + chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = 0; chk->asoc = &stcb->asoc; chk->data = cookie_ack; if (chk->asoc->last_control_chunk_from != NULL) { @@ -9165,7 +9186,7 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) struct sctp_shutdown_ack_chunk *ack_cp; struct sctp_tmit_chunk *chk; - m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_ack == NULL) { /* no mbuf's */ return; @@ -9178,9 +9199,10 @@ sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) return; } chk->copy_by_ref = 0; - chk->send_size = sizeof(struct sctp_chunkhdr); chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; + chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->flags = 0; @@ -9208,7 +9230,7 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) struct sctp_shutdown_chunk *shutdown_cp; struct sctp_tmit_chunk *chk; - m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown == NULL) { /* no mbuf's */ return; @@ -9221,9 +9243,10 @@ sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) return; } chk->copy_by_ref = 0; - chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->rec.chunk_id.id = SCTP_SHUTDOWN; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; + chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->flags = 0; @@ -9274,13 +9297,13 @@ sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked) return; } chk->copy_by_ref = 0; - chk->data = m_asconf; - chk->send_size = len; chk->rec.chunk_id.id = SCTP_ASCONF; chk->rec.chunk_id.can_take_data = 0; + chk->flags = CHUNK_FLAGS_FRAGMENT_OK; + chk->data = m_asconf; + chk->send_size = len; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->asoc = &stcb->asoc; chk->whoTo = net; if (chk->whoTo) { @@ -9344,20 +9367,14 @@ sctp_send_asconf_ack(struct sctp_tcb *stcb) continue; } /* copy the asconf_ack */ - m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_DONTWAIT); + m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT); if (m_ack == NULL) { /* couldn't copy it */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & 
SCTP_MBUF_LOGGING_ENABLE) { - struct mbuf *mat; - - for (mat = m_ack; mat; mat = SCTP_BUF_NEXT(mat)) { - if (SCTP_BUF_IS_EXTENDED(mat)) { - sctp_log_mb(mat, SCTP_MBUF_ICOPY); - } - } + sctp_log_mbc(m_ack, SCTP_MBUF_ICOPY); } #endif @@ -9369,20 +9386,17 @@ sctp_send_asconf_ack(struct sctp_tcb *stcb) return; } chk->copy_by_ref = 0; - + chk->rec.chunk_id.id = SCTP_ASCONF_ACK; + chk->rec.chunk_id.can_take_data = 1; + chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } chk->data = m_ack; - chk->send_size = 0; - /* Get size */ chk->send_size = ack->len; - chk->rec.chunk_id.id = SCTP_ASCONF_ACK; - chk->rec.chunk_id.can_take_data = 1; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; - chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; /* XXX */ chk->asoc = &stcb->asoc; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); @@ -9491,7 +9505,7 @@ sctp_chunk_retransmission(struct sctp_inpcb *inp, cnt_thru = 0; /* do we have control chunks to retransmit? */ if (m != NULL) { - /* Start a timer no matter if we suceed or fail */ + /* Start a timer no matter if we succeed or fail */ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo); } else if (chk->rec.chunk_id.id == SCTP_ASCONF) @@ -9555,12 +9569,16 @@ sctp_chunk_retransmission(struct sctp_inpcb *inp, } if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) && (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) { - /* Gak, we have exceeded max unlucky retran, abort! */ - SCTP_PRINTF("Gak, chk->snd_count:%d >= max:%d - send abort\n", - chk->snd_count, - SCTP_BASE_SYSCTL(sctp_max_retran_chunk)); + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + + snprintf(msg, sizeof(msg), "TSN %8.8x retransmitted %d times, giving up", + chk->rec.data.TSN_seq, chk->snd_count); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + msg); atomic_add_int(&stcb->asoc.refcnt, 1); - sctp_abort_an_association(stcb->sctp_ep, stcb, NULL, so_locked); + sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, + so_locked); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (SCTP_RETRAN_EXIT); @@ -9752,7 +9770,7 @@ one_chunk_around: /* Is there something to send for this destination? */ if (m) { /* - * No matter if we fail/or suceed we should start a + * No matter if we fail/or succeed we should start a * timer. A failure is like a lost IP packet :-) */ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { @@ -9850,7 +9868,7 @@ one_chunk_around: sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND, data_list[i]->whoTo->flight_size, data_list[i]->book_size, - (uintptr_t) data_list[i]->whoTo, + (uint32_t) (uintptr_t) data_list[i]->whoTo, data_list[i]->rec.data.TSN_seq); } sctp_flight_size_increase(data_list[i]); @@ -9874,7 +9892,7 @@ one_chunk_around: * t3-expiring. 
*/ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net, - SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4); + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2); sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } } @@ -9954,7 +9972,7 @@ sctp_chunk_output(struct sctp_inpcb *inp, */ struct sctp_association *asoc; struct sctp_nets *net; - int error = 0, num_out = 0, tot_out = 0, ret = 0, reason_code = 0; + int error = 0, num_out, tot_out = 0, ret = 0, reason_code; unsigned int burst_cnt = 0; struct timeval now; int now_filled = 0; @@ -9965,6 +9983,7 @@ sctp_chunk_output(struct sctp_inpcb *inp, unsigned int tot_frs = 0; asoc = &stcb->asoc; +do_it_again: /* The Nagle algorithm is only applied when handling a send call. */ if (from_where == SCTP_OUTPUT_FROM_USR_SEND) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) { @@ -9982,7 +10001,8 @@ sctp_chunk_output(struct sctp_inpcb *inp, if ((un_sent <= 0) && (TAILQ_EMPTY(&asoc->control_send_queue)) && (TAILQ_EMPTY(&asoc->asconf_send_queue)) && - (asoc->sent_queue_retran_cnt == 0)) { + (asoc->sent_queue_retran_cnt == 0) && + (asoc->trigger_reset == 0)) { /* Nothing to do unless there is something to be sent left */ return; } @@ -10156,15 +10176,14 @@ sctp_chunk_output(struct sctp_inpcb *inp, un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) && - (stcb->asoc.total_flight > 0) && - ((stcb->asoc.locked_on_sending == NULL) || - sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { + (stcb->asoc.total_flight > 0)) { +/* && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {*/ break; } } if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && - stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { + sctp_is_there_unsent_data(stcb, so_locked) == 0) { /* Nothing left to send */ break; } @@ -10201,6 +10220,12 @@ sctp_chunk_output(struct sctp_inpcb *inp, */ if (stcb->asoc.ecn_echo_cnt_onq) sctp_fix_ecn_echo(asoc); + + if (stcb->asoc.trigger_reset) { + if (sctp_send_stream_reset_out_if_possible(stcb, so_locked) == 0) { + goto do_it_again; + } + } return; } @@ -10235,10 +10260,21 @@ void send_forward_tsn(struct sctp_tcb *stcb, struct sctp_association *asoc) { - struct sctp_tmit_chunk *chk; + struct sctp_tmit_chunk *chk, *at, *tp1, *last; struct sctp_forward_tsn_chunk *fwdtsn; + struct sctp_strseq *strseq; + struct sctp_strseq_mid *strseq_m; uint32_t advance_peer_ack_point; + unsigned int cnt_of_space, i, ovh; + unsigned int space_needed; + unsigned int cnt_of_skipped = 0; + int old; + if (asoc->idata_supported) { + old = 0; + } else { + old = 1; + } SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { @@ -10260,11 +10296,17 @@ send_forward_tsn(struct sctp_tcb *stcb, } asoc->fwd_tsn_cnt++; chk->copy_by_ref = 0; + /* + * We don't do the old thing here since this is used not for on-wire + * but to tell if we are sending a fwd-tsn by the stack during + * output. And if its a IFORWARD or a FORWARD it is a fwd-tsn. 
+ */ chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN; chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->asoc = asoc; chk->whoTo = NULL; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; @@ -10280,132 +10322,155 @@ sctp_fill_in_rest: * stream/seq of the ones we skip. */ SCTP_BUF_LEN(chk->data) = 0; - { - struct sctp_tmit_chunk *at, *tp1, *last; - struct sctp_strseq *strseq; - unsigned int cnt_of_space, i, ovh; - unsigned int space_needed; - unsigned int cnt_of_skipped = 0; - - TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { - if ((at->sent != SCTP_FORWARD_TSN_SKIP) && - (at->sent != SCTP_DATAGRAM_NR_ACKED)) { - /* no more to look at */ - break; - } - if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { - /* We don't report these */ - continue; - } - cnt_of_skipped++; + TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { + if ((at->sent != SCTP_FORWARD_TSN_SKIP) && + (at->sent != SCTP_DATAGRAM_NR_ACKED)) { + /* no more to look at */ + break; } + if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { + /* We don't report these */ + continue; + } + cnt_of_skipped++; + } + if (old) { space_needed = (sizeof(struct sctp_forward_tsn_chunk) + (cnt_of_skipped * sizeof(struct sctp_strseq))); + } else { + space_needed = (sizeof(struct sctp_forward_tsn_chunk) + + (cnt_of_skipped * sizeof(struct sctp_strseq_mid))); + } + cnt_of_space = (unsigned int)M_TRAILINGSPACE(chk->data); - cnt_of_space = M_TRAILINGSPACE(chk->data); - - if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MIN_OVERHEAD; - } else { - ovh = SCTP_MIN_V4_OVERHEAD; - } - if (cnt_of_space > (asoc->smallest_mtu - ovh)) { - /* trim to a mtu size */ - cnt_of_space = asoc->smallest_mtu - ovh; - } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MIN_OVERHEAD; + } else { + ovh = SCTP_MIN_V4_OVERHEAD; + } + if (cnt_of_space > (asoc->smallest_mtu - ovh)) { + /* trim to a mtu size */ + cnt_of_space = asoc->smallest_mtu - ovh; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xff, 0, cnt_of_skipped, + asoc->advanced_peer_ack_point); + } + advance_peer_ack_point = asoc->advanced_peer_ack_point; + if (cnt_of_space < space_needed) { + /*- + * ok we must trim down the chunk by lowering the + * advance peer ack point. + */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, - 0xff, 0, cnt_of_skipped, - asoc->advanced_peer_ack_point); - + 0xff, 0xff, cnt_of_space, + space_needed); } - advance_peer_ack_point = asoc->advanced_peer_ack_point; - if (cnt_of_space < space_needed) { - /*- - * ok we must trim down the chunk by lowering the - * advance peer ack point. - */ - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { - sctp_misc_ints(SCTP_FWD_TSN_CHECK, - 0xff, 0xff, cnt_of_space, - space_needed); - } + if (old) { cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); cnt_of_skipped /= sizeof(struct sctp_strseq); - /*- - * Go through and find the TSN that will be the one - * we report. 
- */ - at = TAILQ_FIRST(&asoc->sent_queue); - if (at != NULL) { - for (i = 0; i < cnt_of_skipped; i++) { - tp1 = TAILQ_NEXT(at, sctp_next); - if (tp1 == NULL) { - break; - } - at = tp1; + } else { + cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); + cnt_of_skipped /= sizeof(struct sctp_strseq_mid); + } + /*- + * Go through and find the TSN that will be the one + * we report. + */ + at = TAILQ_FIRST(&asoc->sent_queue); + if (at != NULL) { + for (i = 0; i < cnt_of_skipped; i++) { + tp1 = TAILQ_NEXT(at, sctp_next); + if (tp1 == NULL) { + break; } + at = tp1; } - if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { - sctp_misc_ints(SCTP_FWD_TSN_CHECK, - 0xff, cnt_of_skipped, at->rec.data.TSN_seq, - asoc->advanced_peer_ack_point); - } - last = at; - /*- - * last now points to last one I can report, update - * peer ack point - */ - if (last) - advance_peer_ack_point = last->rec.data.TSN_seq; + } + if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xff, cnt_of_skipped, at->rec.data.TSN_seq, + asoc->advanced_peer_ack_point); + } + last = at; + /*- + * last now points to last one I can report, update + * peer ack point + */ + if (last) { + advance_peer_ack_point = last->rec.data.TSN_seq; + } + if (old) { + space_needed = sizeof(struct sctp_forward_tsn_chunk) + + cnt_of_skipped * sizeof(struct sctp_strseq); + } else { space_needed = sizeof(struct sctp_forward_tsn_chunk) + - cnt_of_skipped * sizeof(struct sctp_strseq); + cnt_of_skipped * sizeof(struct sctp_strseq_mid); } - chk->send_size = space_needed; - /* Setup the chunk */ - fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *); - fwdtsn->ch.chunk_length = htons(chk->send_size); - fwdtsn->ch.chunk_flags = 0; + } + chk->send_size = space_needed; + /* Setup the chunk */ + fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *); + fwdtsn->ch.chunk_length = htons(chk->send_size); + fwdtsn->ch.chunk_flags = 0; + if (old) { fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN; - fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point); - SCTP_BUF_LEN(chk->data) = chk->send_size; - fwdtsn++; - /*- - * Move pointer to after the fwdtsn and transfer to the - * strseq pointer. - */ + } else { + fwdtsn->ch.chunk_type = SCTP_IFORWARD_CUM_TSN; + } + fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point); + SCTP_BUF_LEN(chk->data) = chk->send_size; + fwdtsn++; + /*- + * Move pointer to after the fwdtsn and transfer to the + * strseq pointer. + */ + if (old) { strseq = (struct sctp_strseq *)fwdtsn; - /*- - * Now populate the strseq list. This is done blindly - * without pulling out duplicate stream info. This is - * inefficent but won't harm the process since the peer will - * look at these in sequence and will thus release anything. - * It could mean we exceed the PMTU and chop off some that - * we could have included.. but this is unlikely (aka 1432/4 - * would mean 300+ stream seq's would have to be reported in - * one FWD-TSN. With a bit of work we can later FIX this to - * optimize and pull out duplcates.. but it does add more - * overhead. So for now... not! - */ - at = TAILQ_FIRST(&asoc->sent_queue); - for (i = 0; i < cnt_of_skipped; i++) { - tp1 = TAILQ_NEXT(at, sctp_next); - if (tp1 == NULL) - break; + } else { + strseq_m = (struct sctp_strseq_mid *)fwdtsn; + } + /*- + * Now populate the strseq list. This is done blindly + * without pulling out duplicate stream info. 
This is + * inefficent but won't harm the process since the peer will + * look at these in sequence and will thus release anything. + * It could mean we exceed the PMTU and chop off some that + * we could have included.. but this is unlikely (aka 1432/4 + * would mean 300+ stream seq's would have to be reported in + * one FWD-TSN. With a bit of work we can later FIX this to + * optimize and pull out duplicates.. but it does add more + * overhead. So for now... not! + */ + i = 0; + TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { + if (i >= cnt_of_skipped) { + break; + } + if (old && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { + /* We don't report these */ + continue; + } + if (at->rec.data.TSN_seq == advance_peer_ack_point) { + at->rec.data.fwd_tsn_cnt = 0; + } + if (old) { + strseq->stream = htons(at->rec.data.stream_number); + strseq->sequence = htons((uint16_t) at->rec.data.stream_seq); + strseq++; + } else { + strseq_m->stream = htons(at->rec.data.stream_number); if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { - /* We don't report these */ - i--; - at = tp1; - continue; - } - if (at->rec.data.TSN_seq == advance_peer_ack_point) { - at->rec.data.fwd_tsn_cnt = 0; + strseq_m->flags = htons(PR_SCTP_UNORDERED_FLAG); + } else { + strseq_m->flags = 0; } - strseq->stream = ntohs(at->rec.data.stream_number); - strseq->sequence = ntohs(at->rec.data.stream_seq); - strseq++; - at = tp1; + strseq_m->msg_id = htonl(at->rec.data.stream_seq); + strseq_m++; } + i++; } return; } @@ -10428,7 +10493,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked struct sctp_sack_chunk *sack; struct sctp_nr_sack_chunk *nr_sack; struct sctp_gap_ack_block *gap_descriptor; - struct sack_track *selector; + const struct sack_track *selector; int mergeable = 0; int offset; caddr_t limit; @@ -10443,8 +10508,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked uint8_t type; uint8_t tsn_map; - if ((stcb->asoc.sctp_nr_sack_on_off == 1) && - (stcb->asoc.peer_supports_nr_sack == 1)) { + if (stcb->asoc.nrsack_supported == 1) { type = SCTP_NR_SELECTIVE_ACK; } else { type = SCTP_SELECTIVE_ACK; @@ -10481,7 +10545,8 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked /* No memory so we drop the idea, and set a timer */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, - stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5); + stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { @@ -10496,38 +10561,24 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked /* Clear our pkt counts */ asoc->data_pkts_seen = 0; + a_chk->flags = 0; a_chk->asoc = asoc; a_chk->snd_count = 0; a_chk->send_size = 0; /* fill in later */ a_chk->sent = SCTP_DATAGRAM_UNSENT; a_chk->whoTo = NULL; - if ((asoc->numduptsns) || - (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE))) { + if (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE)) { /*- - * Ok, we have some duplicates or the destination for the - * sack is unreachable, lets see if we can select an - * alternate than asoc->last_data_chunk_from + * Ok, the destination for the SACK is unreachable, lets see if + * we can select an alternate to asoc->last_data_chunk_from */ - if ((asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE) && - (asoc->used_alt_onsack > asoc->numnets)) { - /* We used an alt last time, don't this time */ - a_chk->whoTo = NULL; - } else { - asoc->used_alt_onsack++; - a_chk->whoTo = sctp_find_alternate_net(stcb, 
asoc->last_data_chunk_from, 0); - } + a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0); if (a_chk->whoTo == NULL) { /* Nope, no alternate */ a_chk->whoTo = asoc->last_data_chunk_from; - asoc->used_alt_onsack = 0; } } else { - /* - * No duplicates so we use the last place we received data - * from. - */ - asoc->used_alt_onsack = 0; a_chk->whoTo = asoc->last_data_chunk_from; } if (a_chk->whoTo) { @@ -10550,7 +10601,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked space_req = MCLBYTES; } /* Ok now lets formulate a MBUF with our sack */ - a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_DONTWAIT, 1, MT_DATA); + a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_NOWAIT, 1, MT_DATA); if ((a_chk->data == NULL) || (a_chk->whoTo == NULL)) { /* rats, no mbuf memory */ @@ -10563,7 +10614,8 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked /* sa_ignore NO_NULL_CHK */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, - stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6); + stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { @@ -10573,7 +10625,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked } /* ok, lets go through and fill it in */ SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD); - space = M_TRAILINGSPACE(a_chk->data); + space = (unsigned int)M_TRAILINGSPACE(a_chk->data); if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) { space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD); } @@ -10642,7 +10694,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked * Clear all bits corresponding to TSNs * smaller or equal to the cumulative TSN. */ - tsn_map &= (~0 << (1 - offset)); + tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { @@ -10717,7 +10769,7 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked * TSNs smaller or equal to the * cumulative TSN. */ - tsn_map &= (~0 << (1 - offset)); + tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { @@ -10787,9 +10839,9 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked * queue. 
*/ if (type == SCTP_SELECTIVE_ACK) { - a_chk->send_size = sizeof(struct sctp_sack_chunk) + + a_chk->send_size = (uint16_t) (sizeof(struct sctp_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + - num_dups * sizeof(int32_t); + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); sack->sack.a_rwnd = htonl(asoc->my_rwnd); @@ -10799,9 +10851,9 @@ sctp_send_sack(struct sctp_tcb *stcb, int so_locked sack->ch.chunk_flags = flags; sack->ch.chunk_length = htons(a_chk->send_size); } else { - a_chk->send_size = sizeof(struct sctp_nr_sack_chunk) + + a_chk->send_size = (uint16_t) (sizeof(struct sctp_nr_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + - num_dups * sizeof(int32_t); + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd); @@ -10850,7 +10902,7 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked } else { m_out = NULL; } - m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_abort == NULL) { if (m_out) { sctp_m_freem(m_out); @@ -10900,7 +10952,8 @@ sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked abort->ch.chunk_length = htons(chunk_len); /* Add padding, if necessary. */ if (padding_len > 0) { - if ((m_last == NULL) || sctp_add_pad_tombuf(m_last, padding_len)) { + if ((m_last == NULL) || + (sctp_add_pad_tombuf(m_last, padding_len) == NULL)) { sctp_m_freem(m_out); return; } @@ -10926,7 +10979,7 @@ sctp_send_shutdown_complete(struct sctp_tcb *stcb, uint32_t vtag; uint8_t flags; - m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER); + m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_comp == NULL) { /* no mbuf's */ return; @@ -10959,20 +11012,21 @@ static void sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, uint8_t type, struct mbuf *cause, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct mbuf *o_pak; struct mbuf *mout; struct sctphdr *shout; struct sctp_chunkhdr *ch; - struct udphdr *udp; - int len, cause_len, padding_len; #if defined(INET) || defined(INET6) + struct udphdr *udp; int ret; #endif + int len, cause_len, padding_len; + #ifdef INET struct sockaddr_in *src_sin, *dst_sin; struct ip *ip; @@ -10999,7 +11053,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, padding_len = 4 - padding_len; } if (padding_len != 0) { - if (sctp_add_pad_tombuf(m_last, padding_len)) { + if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(cause); return; } @@ -11023,10 +11077,12 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, default: break; } +#if defined(INET) || defined(INET6) if (port) { len += sizeof(struct udphdr); } - mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA); +#endif + mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_NOWAIT, 1, MT_DATA); if (mout == NULL) { if (cause) { sctp_m_freem(cause); @@ -11036,10 +11092,9 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, 
SCTP_BUF_RESV_UF(mout, max_linkhdr); SCTP_BUF_LEN(mout) = len; SCTP_BUF_NEXT(mout) = cause; - if (use_mflowid != 0) { - mout->m_pkthdr.flowid = mflowid; - mout->m_flags |= M_FLOWID; - } + M_SETFIB(mout, fibnum); + mout->m_pkthdr.flowid = mflowid; + M_HASHTYPE_SET(mout, mflowtype); #ifdef INET ip = NULL; #endif @@ -11055,8 +11110,8 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, ip->ip_v = IPVERSION; ip->ip_hl = (sizeof(struct ip) >> 2); ip->ip_tos = 0; - ip->ip_id = ip_newid(); ip->ip_off = 0; + ip_fillid(ip); ip->ip_ttl = MODULE_GLOBAL(ip_defttl); if (port) { ip->ip_p = IPPROTO_UDP; @@ -11096,6 +11151,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, shout = mtod(mout, struct sctphdr *); break; } +#if defined(INET) || defined(INET6) if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_m_freem(mout); @@ -11105,15 +11161,16 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_sum = 0; - udp->uh_ulen = htons(sizeof(struct udphdr) + + udp->uh_ulen = htons((uint16_t) (sizeof(struct udphdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + - cause_len + padding_len); + cause_len + padding_len)); len += sizeof(struct udphdr); shout = (struct sctphdr *)((caddr_t)shout + sizeof(struct udphdr)); } else { udp = NULL; } +#endif shout->src_port = sh->dest_port; shout->dest_port = sh->src_port; shout->checksum = 0; @@ -11130,7 +11187,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, } else { ch->chunk_flags = SCTP_HAD_NO_TCB; } - ch->chunk_length = htons(sizeof(struct sctp_chunkhdr) + cause_len); + ch->chunk_length = htons((uint16_t) (sizeof(struct sctp_chunkhdr) + cause_len)); len += sizeof(struct sctp_chunkhdr); len += cause_len + padding_len; @@ -11149,7 +11206,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, udp->uh_sum = 0; } } - ip->ip_len = len; + ip->ip_len = htons(len); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); @@ -11179,7 +11236,7 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, #endif #ifdef INET6 case AF_INET6: - ip6->ip6_plen = len - sizeof(struct ip6_hdr); + ip6->ip6_plen = (uint16_t) (len - sizeof(struct ip6_hdr)); if (port) { #if defined(SCTP_WITH_NO_CSUM) SCTP_STAT_INCR(sctps_sendnocrc); @@ -11223,11 +11280,11 @@ sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, void sctp_send_shutdown_complete2(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, 0, SCTP_SHUTDOWN_COMPLETE, NULL, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); } @@ -11267,10 +11324,11 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_heartbeat_chunk); - chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, so_locked); return; @@ -11294,7 +11352,7 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked hb->heartbeat.hb_info.time_value_1 = now.tv_sec; 
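
A pattern worth noting in the sctp_send_resp_msg() hunks above: 16-bit wire fields such as uh_ulen and ch->chunk_length are now filled through an explicit (uint16_t) truncation inside htons(). The narrowing happened before as well, just implicitly; the cast makes it visible to readers and to compilers that warn on it. Below is a minimal userland sketch of the same idiom with an added range check; the helper name wire_len16 and the assert are illustrative only, not part of the patch.

    #include <arpa/inet.h>	/* htons, ntohs */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Sum the component sizes in a wide type, verify the result fits a
     * 16-bit field, then truncate and convert to network byte order.
     * The assert stands in for whatever error handling a caller needs;
     * without a check like it, oversized values are lost silently.
     */
    static uint16_t
    wire_len16(size_t hdr_len, size_t payload_len)
    {
        size_t total = hdr_len + payload_len;

        assert(total <= UINT16_MAX);
        return (htons((uint16_t)total));
    }

    int
    main(void)
    {
        /* 8-byte UDP header plus 1432 bytes of payload: prints 1440. */
        printf("%u\n", (unsigned)ntohs(wire_len16(8, 1432)));
        return (0);
    }
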
hb->heartbeat.hb_info.time_value_2 = now.tv_usec; /* Did our user request this one, put it in */ - hb->heartbeat.hb_info.addr_family = net->ro._l_addr.sa.sa_family; + hb->heartbeat.hb_info.addr_family = (uint8_t) net->ro._l_addr.sa.sa_family; hb->heartbeat.hb_info.addr_len = net->ro._l_addr.sa.sa_len; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* @@ -11323,6 +11381,11 @@ sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked break; #endif default: + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk, so_locked); return; break; } @@ -11368,13 +11431,14 @@ sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, if (chk == NULL) { return; } - chk->copy_by_ref = 0; SCTP_STAT_INCR(sctps_queue_upd_ecne); + chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_ECHO; chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_ecne_chunk); - chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; @@ -11417,7 +11481,7 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, } asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); - if (asoc->peer_supports_pktdrop == 0) { + if (asoc->pktdrop_supported == 0) { /*- * peer must declare support before I send one. */ @@ -11431,6 +11495,9 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, return; } chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_PACKET_DROPPED; + chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; len -= iphlen; chk->send_size = len; /* Validate that we do not have an ABORT in here. */ @@ -11473,7 +11540,7 @@ sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, was_trunc = 1; } chk->asoc = &stcb->asoc; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { jump_out: sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); @@ -11496,7 +11563,7 @@ jump_out: * Len is already adjusted to size minus overhead above take * out the pkt_drop chunk itself from it. 
*/ - chk->send_size = len - sizeof(struct sctp_pktdrop_chunk); + chk->send_size = (uint16_t) (len - sizeof(struct sctp_pktdrop_chunk)); len = chk->send_size; } else { /* no truncation needed */ @@ -11517,8 +11584,6 @@ jump_out: } else { chk->whoTo = NULL; } - chk->rec.chunk_id.id = SCTP_PACKET_DROPPED; - chk->rec.chunk_id.can_take_data = 1; drp->ch.chunk_type = SCTP_PACKET_DROPPED; drp->ch.chunk_length = htons(chk->send_size); spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket); @@ -11584,9 +11649,10 @@ sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, u chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_CWR; chk->rec.chunk_id.can_take_data = 1; + chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_cwr_chunk); - chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; @@ -11606,30 +11672,60 @@ sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, u asoc->ctrl_queue_cnt++; } -void -sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk, - int number_entries, uint16_t * list, +static int +sctp_add_stream_reset_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, uint32_t seq, uint32_t resp_seq, uint32_t last_sent) { uint16_t len, old_len, i; struct sctp_stream_reset_out_request *req_out; struct sctp_chunkhdr *ch; + int at; + int number_entries = 0; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); - /* get to new offset for the param. */ req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len); /* now how long will this param be? */ - len = (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries)); + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && + (stcb->asoc.strmout[i].chunks_on_queues == 0) && + TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { + number_entries++; + } + } + if (number_entries == 0) { + return (0); + } + if (number_entries == stcb->asoc.streamoutcnt) { + number_entries = 0; + } + if (number_entries > SCTP_MAX_STREAMS_AT_ONCE_RESET) { + number_entries = SCTP_MAX_STREAMS_AT_ONCE_RESET; + } + len = (uint16_t) (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries)); req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST); req_out->ph.param_length = htons(len); req_out->request_seq = htonl(seq); req_out->response_seq = htonl(resp_seq); req_out->send_reset_at_tsn = htonl(last_sent); + at = 0; if (number_entries) { - for (i = 0; i < number_entries; i++) { - req_out->list_of_streams[i] = htons(list[i]); + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && + (stcb->asoc.strmout[i].chunks_on_queues == 0) && + TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { + req_out->list_of_streams[at] = htons(i); + at++; + stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; + if (at >= number_entries) { + break; + } + } + } + } else { + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; } } if (SCTP_SIZE32(len) > len) { @@ -11646,7 +11742,7 @@ sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk, chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; - return; + return 
(1); } static void @@ -11664,7 +11760,7 @@ sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk, /* get to new offset for the param. */ req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len); /* now how long will this param be? */ - len = (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries)); + len = (uint16_t) (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries)); req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST); req_in->ph.param_length = htons(len); req_in->request_seq = htonl(seq); @@ -11747,6 +11843,68 @@ sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk, return; } +void +sctp_send_deferred_reset_response(struct sctp_tcb *stcb, + struct sctp_stream_reset_list *ent, + int response) +{ + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk; + struct sctp_chunkhdr *ch; + + asoc = &stcb->asoc; + + /* + * Reset our last reset action to the new one IP -> response + * (PERFORMED probably). This assures that if we fail to send, a + * retran from the peer will get the new response. + */ + asoc->last_reset_action[0] = response; + if (asoc->stream_reset_outstanding) { + return; + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return; + } + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_STREAM_RESET; + chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->book_size = sizeof(struct sctp_chunkhdr); + chk->send_size = SCTP_SIZE32(chk->book_size); + chk->book_size_scale = 0; + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return; + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + /* setup chunk parameters */ + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + if (stcb->asoc.alternate) { + chk->whoTo = stcb->asoc.alternate; + } else { + chk->whoTo = stcb->asoc.primary_destination; + } + ch = mtod(chk->data, struct sctp_chunkhdr *); + ch->chunk_type = SCTP_STREAM_RESET; + ch->chunk_flags = 0; + ch->chunk_length = htons(chk->book_size); + atomic_add_int(&chk->whoTo->ref_count, 1); + SCTP_BUF_LEN(chk->data) = chk->send_size; + sctp_add_stream_reset_result(chk, ent->seq, response); + /* insert the chunk for sending */ + TAILQ_INSERT_TAIL(&asoc->control_send_queue, + chk, + sctp_next); + asoc->ctrl_queue_cnt++; +} + void sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk, uint32_t resp_seq, uint32_t result, @@ -11844,20 +12002,91 @@ sctp_add_an_in_stream(struct sctp_tmit_chunk *chk, return; } +int +sctp_send_stream_reset_out_if_possible(struct sctp_tcb *stcb, int so_locked) +{ + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk; + struct sctp_chunkhdr *ch; + uint32_t seq; + + asoc = &stcb->asoc; + asoc->trigger_reset = 0; + if (asoc->stream_reset_outstanding) { + return (EALREADY); + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_STREAM_RESET; + chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->book_size = sizeof(struct sctp_chunkhdr); + chk->send_size = SCTP_SIZE32(chk->book_size); + chk->book_size_scale = 0; + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); + if (chk->data 
== NULL) { + sctp_free_a_chunk(stcb, chk, so_locked); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + + /* setup chunk parameters */ + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + if (stcb->asoc.alternate) { + chk->whoTo = stcb->asoc.alternate; + } else { + chk->whoTo = stcb->asoc.primary_destination; + } + ch = mtod(chk->data, struct sctp_chunkhdr *); + ch->chunk_type = SCTP_STREAM_RESET; + ch->chunk_flags = 0; + ch->chunk_length = htons(chk->book_size); + atomic_add_int(&chk->whoTo->ref_count, 1); + SCTP_BUF_LEN(chk->data) = chk->send_size; + seq = stcb->asoc.str_reset_seq_out; + if (sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1))) { + seq++; + asoc->stream_reset_outstanding++; + } else { + m_freem(chk->data); + chk->data = NULL; + sctp_free_a_chunk(stcb, chk, so_locked); + return (ENOENT); + } + asoc->str_reset = chk; + /* insert the chunk for sending */ + TAILQ_INSERT_TAIL(&asoc->control_send_queue, + chk, + sctp_next); + asoc->ctrl_queue_cnt++; + + if (stcb->asoc.send_sack) { + sctp_send_sack(stcb, so_locked); + } + sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); + return (0); +} + int sctp_send_str_reset_req(struct sctp_tcb *stcb, - int number_entries, uint16_t * list, - uint8_t send_out_req, + uint16_t number_entries, uint16_t * list, uint8_t send_in_req, uint8_t send_tsn_req, uint8_t add_stream, uint16_t adding_o, uint16_t adding_i, uint8_t peer_asked) { - struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; + int can_send_out_req = 0; uint32_t seq; asoc = &stcb->asoc; @@ -11868,16 +12097,26 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY); return (EBUSY); } - if ((send_out_req == 0) && (send_in_req == 0) && (send_tsn_req == 0) && + if ((send_in_req == 0) && (send_tsn_req == 0) && (add_stream == 0)) { /* nothing to do */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } - if (send_tsn_req && (send_out_req || send_in_req)) { + if (send_tsn_req && send_in_req) { /* error, can't do that */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); + } else if (send_in_req) { + can_send_out_req = 1; + } + if (number_entries > (MCLBYTES - + SCTP_MIN_OVERHEAD - + sizeof(struct sctp_chunkhdr) - + sizeof(struct sctp_stream_reset_out_request)) / + sizeof(uint16_t)) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { @@ -11887,12 +12126,13 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; + chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; - chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); @@ -11916,12 +12156,14 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, SCTP_BUF_LEN(chk->data) = chk->send_size; seq = stcb->asoc.str_reset_seq_out; - if (send_out_req) { - sctp_add_stream_reset_out(chk, 
number_entries, list, - seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1)); - asoc->stream_reset_out_is_outstanding = 1; - seq++; - asoc->stream_reset_outstanding++; + if (can_send_out_req) { + int ret; + + ret = sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1)); + if (ret) { + seq++; + asoc->stream_reset_outstanding++; + } } if ((add_stream & 1) && ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < adding_o)) { @@ -11930,10 +12172,15 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, struct sctp_stream_queue_pending *sp, *nsp; int i; +#if defined(SCTP_DETAILED_STR_STATS) + int j; + +#endif + oldstream = stcb->asoc.strmout; /* get some more */ SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *, - ((stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out)), + (stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out), SCTP_M_STRMO); if (stcb->asoc.strmout == NULL) { uint8_t x; @@ -11953,32 +12200,44 @@ sctp_send_str_reset_req(struct sctp_tcb *stcb, for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = oldstream[i].chunks_on_queues; - stcb->asoc.strmout[i].next_sequence_send = oldstream[i].next_sequence_send; + stcb->asoc.strmout[i].next_mid_ordered = oldstream[i].next_mid_ordered; + stcb->asoc.strmout[i].next_mid_unordered = oldstream[i].next_mid_unordered; stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete; stcb->asoc.strmout[i].stream_no = i; - stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], &oldstream[i]); + stcb->asoc.strmout[i].state = oldstream[i].state; + /* FIX ME FIX ME */ + /* + * This should be a SS_COPY operation FIX ME STREAM + * SCHEDULER EXPERT + */ + stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], &oldstream[i]); /* now anything on those queues? 
*/ TAILQ_FOREACH_SAFE(sp, &oldstream[i].outqueue, next, nsp) { TAILQ_REMOVE(&oldstream[i].outqueue, sp, next); TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next); } - /* Now move assoc pointers too */ - if (stcb->asoc.last_out_stream == &oldstream[i]) { - stcb->asoc.last_out_stream = &stcb->asoc.strmout[i]; - } - if (stcb->asoc.locked_on_sending == &oldstream[i]) { - stcb->asoc.locked_on_sending = &stcb->asoc.strmout[i]; - } + } /* now the new streams */ stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + adding_o); i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; - stcb->asoc.strmout[i].next_sequence_send = 0x0; +#if defined(SCTP_DETAILED_STR_STATS) + for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { + stcb->asoc.strmout[i].abandoned_sent[j] = 0; + stcb->asoc.strmout[i].abandoned_unsent[j] = 0; + } +#else + stcb->asoc.strmout[i].abandoned_sent[0] = 0; + stcb->asoc.strmout[i].abandoned_unsent[0] = 0; +#endif + stcb->asoc.strmout[i].next_mid_ordered = 0; + stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].stream_no = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; - stcb->asoc.ss_functions.sctp_ss_init_stream(&stcb->asoc.strmout[i], NULL); + stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); + stcb->asoc.strmout[i].state = SCTP_STREAM_CLOSED; } stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + adding_o; SCTP_FREE(oldstream, SCTP_M_STRMO); @@ -12012,6 +12271,9 @@ skip_stuff: chk, sctp_next); asoc->ctrl_queue_cnt++; + if (stcb->asoc.send_sack) { + sctp_send_sack(stcb, SCTP_SO_LOCKED); + } sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); return (0); } @@ -12019,7 +12281,7 @@ skip_stuff: void sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { /* Don't respond to an ABORT with an ABORT. 
*/ @@ -12029,7 +12291,7 @@ sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockadd return; } sctp_send_resp_msg(src, dst, sh, vtag, SCTP_ABORT_ASSOCIATION, cause, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); return; } @@ -12037,11 +12299,11 @@ sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockadd void sctp_send_operr_to(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, vtag, SCTP_OPERATION_ERROR, cause, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); return; } @@ -12073,9 +12335,6 @@ sctp_copy_one(struct sctp_stream_queue_pending *sp, struct uio *uio, int resv_upfront) { - int left; - - left = sp->length; sp->data = m_uiotombuf(uio, M_WAITOK, sp->length, resv_upfront, 0); if (sp->data == NULL) { @@ -12131,10 +12390,11 @@ sctp_copy_it_in(struct sctp_tcb *stcb, sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; + sp->fsn = 0; (void)SCTP_GETTIME_TIMEVAL(&sp->ts); sp->stream = srcv->sinfo_stream; - sp->length = min(uio->uio_resid, max_send_len); + sp->length = (uint32_t) min(uio->uio_resid, max_send_len); if ((sp->length == (uint32_t) uio->uio_resid) && ((user_marks_eor == 0) || (srcv->sinfo_flags & SCTP_EOF) || @@ -12293,7 +12553,7 @@ sctp_lower_sosend(struct socket *so, SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } - sndlen = uio->uio_resid; + sndlen = (unsigned int)uio->uio_resid; } else { top = SCTP_HEADER_TO_CHAIN(i_pak); sndlen = SCTP_HEADER_LEN(i_pak); @@ -12381,7 +12641,10 @@ sctp_lower_sosend(struct socket *so, } SCTP_INP_RUNLOCK(inp); } else if (sinfo_assoc_id) { - stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 0); + stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 1); + if (stcb != NULL) { + hold_tcblock = 1; + } } else if (addr) { /*- * Since we did not use findep we must @@ -12469,8 +12732,9 @@ sctp_lower_sosend(struct socket *so, } #endif stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, - p - ); + inp->sctp_ep.pre_open_stream_count, + inp->sctp_ep.port, + p); if (stcb == NULL) { /* Error is setup for us in the call */ goto out_unlocked; @@ -12504,7 +12768,8 @@ sctp_lower_sosend(struct socket *so, if (control) { if (sctp_process_cmsgs_for_init(stcb, control, &error)) { - sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_7); + sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5); hold_tcblock = 0; stcb = NULL; goto out_unlocked; @@ -12590,12 +12855,24 @@ sctp_lower_sosend(struct socket *so, SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; } - if (asoc->stream_reset_outstanding) { + /* Is the stream no. valid? */ + if (srcv->sinfo_stream >= asoc->streamoutcnt) { + /* Invalid stream number */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + if ((asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPEN) && + (asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPENING)) { /* * Can't queue any data while stream reset is underway. 
*/ - SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAGAIN); - error = EAGAIN; + if (asoc->strmout[srcv->sinfo_stream].state > SCTP_STREAM_OPEN) { + error = EAGAIN; + } else { + error = EINVAL; + } + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error); goto out_unlocked; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || @@ -12646,7 +12923,7 @@ sctp_lower_sosend(struct socket *so, if (top) { struct mbuf *cntm = NULL; - mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAIT, 1, MT_DATA); + mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAITOK, 1, MT_DATA); if (sndlen != 0) { for (cntm = top; cntm; cntm = SCTP_BUF_NEXT(cntm)) { tot_out += SCTP_BUF_LEN(cntm); @@ -12662,7 +12939,7 @@ sctp_lower_sosend(struct socket *so, error = EMSGSIZE; goto out; } - mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAIT, 1, MT_DATA); + mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAITOK, 1, MT_DATA); } if (mm == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); @@ -12680,7 +12957,7 @@ sctp_lower_sosend(struct socket *so, /* now move forward the data pointer */ ph = mtod(mm, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); - ph->param_length = htons(sizeof(struct sctp_paramhdr) + tot_out); + ph->param_length = htons((uint16_t) (sizeof(struct sctp_paramhdr) + tot_out)); ph++; SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr); if (top == NULL) { @@ -12736,13 +13013,6 @@ sctp_lower_sosend(struct socket *so, SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } - /* Is the stream no. valid? */ - if (srcv->sinfo_stream >= asoc->streamoutcnt) { - /* Invalid stream number */ - SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); - error = EINVAL; - goto out_unlocked; - } if (asoc->strmout == NULL) { /* huh? software error */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); @@ -12818,6 +13088,7 @@ sctp_lower_sosend(struct socket *so, asoc, stcb->asoc.total_output_queue_size); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); @@ -12879,8 +13150,10 @@ skip_preblock: * interrupt. */ strm->last_msg_incomplete = 1; - asoc->stream_locked = 1; - asoc->stream_locked_on = srcv->sinfo_stream; + if (stcb->asoc.idata_supported == 0) { + asoc->stream_locked = 1; + asoc->stream_locked_on = srcv->sinfo_stream; + } sp->sender_all_done = 0; } sctp_snd_sb_alloc(stcb, sp->length); @@ -12959,7 +13232,9 @@ skip_preblock: sctp_snd_sb_alloc(stcb, sndout); atomic_add_int(&sp->length, sndout); len += sndout; - + if (srcv->sinfo_flags & SCTP_SACK_IMMEDIATELY) { + sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY; + } /* Did we reach EOR? */ if ((uio->uio_resid == 0) && ((user_marks_eor == 0) || @@ -12976,7 +13251,7 @@ skip_preblock: continue; } /* PR-SCTP? */ - if ((asoc->peer_supports_prsctp) && (asoc->sent_queue_cnt_removeable > 0)) { + if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { /* * This is ugly but we must assure locking * order @@ -13038,7 +13313,7 @@ skip_preblock: /*- * Ok, Nagle is set on and we have data outstanding. * Don't send anything and let SACKs drive out the - * data unless wen have a "full" segment to send. + * data unless we have a "full" segment to send. 
*/ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); @@ -13107,7 +13382,7 @@ skip_preblock: min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK, - asoc, uio->uio_resid); + asoc, (size_t)uio->uio_resid); } be.error = 0; stcb->block_entry = &be; @@ -13136,11 +13411,17 @@ skip_preblock: } } SCTP_TCB_SEND_LOCK(stcb); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_SEND_UNLOCK(stcb); + goto out_unlocked; + } if (sp) { if (sp->msg_is_complete == 0) { strm->last_msg_incomplete = 1; - asoc->stream_locked = 1; - asoc->stream_locked_on = srcv->sinfo_stream; + if (stcb->asoc.idata_supported == 0) { + asoc->stream_locked = 1; + asoc->stream_locked_on = srcv->sinfo_stream; + } } else { sp->sender_all_done = 1; strm->last_msg_incomplete = 0; @@ -13176,19 +13457,16 @@ dataless_eof: /* EOF thing ? */ if ((srcv->sinfo_flags & SCTP_EOF) && (got_all_of_the_send == 1)) { - int cnt; - SCTP_STAT_INCR(sctps_sends_with_eof); error = 0; if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } - cnt = sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED); if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && - (cnt == 0)) { - if (asoc->locked_on_sending) { + sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED) == 0) { + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ @@ -13233,27 +13511,27 @@ dataless_eof: SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } - if (asoc->locked_on_sending) { - /* Locked to send out the data */ - struct sctp_stream_queue_pending *sp; - - sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); - if (sp) { - if ((sp->length == 0) && (sp->msg_is_complete == 0)) - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - } + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; } asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + abort_anyway: if (free_cnt_applied) { atomic_add_int(&stcb->asoc.refcnt, -1); free_cnt_applied = 0; } + snprintf(msg, sizeof(msg), + "%s:%d at %s", __FILE__, __LINE__, __func__); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + msg); sctp_abort_an_association(stcb->sctp_ep, stcb, - NULL, SCTP_SO_LOCKED); + op_err, SCTP_SO_LOCKED); /* * now relock the stcb so everything * is sane @@ -13392,13 +13670,6 @@ out_unlocked: panic("Leaving with tcb send mtx owned?"); } } -#endif -#ifdef INVARIANTS - if (inp) { - sctp_validate_no_locks(inp); - } else { - SCTP_PRINTF("Warning - inp is NULL so cant validate locks\n"); - } #endif if (top) { sctp_m_freem(top); @@ -13427,19 +13698,14 @@ sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, (stcb == NULL)) return (m); - /* sysctl disabled auth? */ - if (SCTP_BASE_SYSCTL(sctp_auth_disable)) - return (m); - - /* peer doesn't do auth... */ - if (!stcb->asoc.peer_supports_auth) { + if (stcb->asoc.auth_supported == 0) { return (m); } /* does the requested chunk require auth? 
*/ if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) { return (m); } - m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_DONTWAIT, 1, MT_HEADER); + m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_NOWAIT, 1, MT_HEADER); if (m_auth == NULL) { /* no mbuf's */ return (m); @@ -13538,7 +13804,7 @@ sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro) } ifa = (struct ifaddr *)sifa->ifa; mask = (struct sockaddr_in *)(ifa->ifa_netmask); - sin = (struct sockaddr_in *)&sifa->address.sin; + sin = &sifa->address.sin; srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h index 59af5af2..b2441a6f 100644 --- a/freebsd/sys/netinet/sctp_output.h +++ b/freebsd/sys/netinet/sctp_output.h @@ -80,7 +80,8 @@ sctp_send_initiate(struct sctp_inpcb *, struct sctp_tcb *, int ); void -sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf *, +sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *, + struct sctp_nets *, struct mbuf *, int, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct sctp_init_chunk *, @@ -117,7 +118,7 @@ void sctp_send_shutdown_complete(struct sctp_tcb *, struct sctp_nets *, int); void sctp_send_shutdown_complete2(struct sockaddr *, struct sockaddr *, struct sctphdr *, - uint8_t, uint32_t, + uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); void sctp_send_asconf(struct sctp_tcb *, struct sctp_nets *, int addr_locked); @@ -170,30 +171,33 @@ void sctp_send_cwr(struct sctp_tcb *, struct sctp_nets *, uint32_t, uint8_t); void -sctp_add_stream_reset_out(struct sctp_tmit_chunk *, - int, uint16_t *, uint32_t, uint32_t, uint32_t); + sctp_add_stream_reset_result(struct sctp_tmit_chunk *, uint32_t, uint32_t); void - sctp_add_stream_reset_result(struct sctp_tmit_chunk *, uint32_t, uint32_t); +sctp_send_deferred_reset_response(struct sctp_tcb *, + struct sctp_stream_reset_list *, + int); void sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *, uint32_t, uint32_t, uint32_t, uint32_t); +int + sctp_send_stream_reset_out_if_possible(struct sctp_tcb *, int); int -sctp_send_str_reset_req(struct sctp_tcb *, int, uint16_t *, uint8_t, uint8_t, - uint8_t, uint8_t, uint16_t, uint16_t, uint8_t); +sctp_send_str_reset_req(struct sctp_tcb *, uint16_t, uint16_t *, + uint8_t, uint8_t, uint8_t, uint16_t, uint16_t, uint8_t); void sctp_send_abort(struct mbuf *, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, uint32_t, struct mbuf *, - uint8_t, uint32_t, + uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); void sctp_send_operr_to(struct sockaddr *, struct sockaddr *, struct sctphdr *, uint32_t, struct mbuf *, - uint8_t, uint32_t, + uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); #endif /* _KERNEL || __Userspace__ */ diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c index 16dc231f..62ef1e3d 100644 --- a/freebsd/sys/netinet/sctp_pcb.c +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -48,7 +48,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if defined(INET) || defined(INET6) #include +#endif #ifdef INET6 #include #endif @@ -330,7 +332,7 @@ sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, goto out; } if (sctp_ifap->ifn_p == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n"); + SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); goto out; } 
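
The sctp_v4src_match_nexthop() hunk above derives the source network by AND-ing the interface address with its netmask before comparing. The underlying test — two IPv4 addresses are on the same subnet exactly when they agree after masking — is easy to get wrong with byte order, so here is a self-contained sketch; the addresses, the mask, and the same_subnet name are illustrative values, not taken from the patch.

    #include <sys/socket.h>	/* AF_INET */
    #include <netinet/in.h>	/* struct in_addr */
    #include <arpa/inet.h>	/* inet_pton */
    #include <stdio.h>

    /* Returns 1 if a and b share a subnet under mask, 0 if not, -1 on parse error. */
    static int
    same_subnet(const char *a, const char *b, const char *maskstr)
    {
        struct in_addr ia, ib, mask;

        if (inet_pton(AF_INET, a, &ia) != 1 ||
            inet_pton(AF_INET, b, &ib) != 1 ||
            inet_pton(AF_INET, maskstr, &mask) != 1)
            return (-1);
        return ((ia.s_addr & mask.s_addr) == (ib.s_addr & mask.s_addr));
    }

    int
    main(void)
    {
        printf("%d\n", same_subnet("192.0.2.10", "192.0.2.77", "255.255.255.0"));   /* 1 */
        printf("%d\n", same_subnet("192.0.2.10", "198.51.100.1", "255.255.255.0")); /* 0 */
        return (0);
    }

Because inet_pton() leaves all three values in network byte order, the masked comparison needs no byte swapping; the same holds for the s_addr arithmetic in the kernel function above.
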
if (if_name) { @@ -374,7 +376,7 @@ sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, goto out; } if (sctp_ifap->ifn_p == NULL) { - SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unuseable\n"); + SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); goto out; } if (if_name) { @@ -625,7 +627,7 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sctp_ifap->address.sin; + sin = &sctp_ifap->address.sin; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_loop = 1; @@ -645,7 +647,7 @@ sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, /* ok to use deprecated addresses? */ struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&sctp_ifap->address.sin6; + sin6 = &sctp_ifap->address.sin6; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { sctp_ifap->src_is_loop = 1; @@ -974,7 +976,7 @@ sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { struct sockaddr_in *sin, *rsin; - sin = (struct sockaddr_in *)&laddr->ifa->address.sin; + sin = &laddr->ifa->address.sin; rsin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); @@ -988,7 +990,7 @@ sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { struct sockaddr_in6 *sin6, *rsin6; - sin6 = (struct sockaddr_in6 *)&laddr->ifa->address.sin6; + sin6 = &laddr->ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); @@ -1115,7 +1117,7 @@ sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { - SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __FUNCTION__); + SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { @@ -1441,9 +1443,6 @@ sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, } head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, inp->sctp_hashmark)]; - if (head == NULL) { - goto null_return; - } LIST_FOREACH(stcb, head, sctp_tcbhash) { if (stcb->rport != rport) { /* remote port does not match */ @@ -1776,7 +1775,7 @@ sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", - __FUNCTION__); + __func__); continue; } SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", @@ -1870,7 +1869,7 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) { /* For 1-2-1 with port reuse */ struct sctppcbhead *head; - struct sctp_inpcb *tinp; + struct sctp_inpcb *tinp, *ninp; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* only works with port reuse on */ @@ -1880,10 +1879,11 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) return (0); } SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_WLOCK(); head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; /* Kick out all non-listeners to the TCP hash */ - LIST_FOREACH(tinp, head, sctp_hash) { + LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) { if (tinp->sctp_lport != inp->sctp_lport) { continue; } @@ -1911,6 +1911,7 @@ sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) LIST_INSERT_HEAD(head, inp, sctp_hash); SCTP_INP_WUNLOCK(inp); SCTP_INP_RLOCK(inp); + 
SCTP_INP_INFO_WUNLOCK(); return (0); } @@ -2166,11 +2167,6 @@ sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))]; - if (head == NULL) { - /* invalid vtag */ - SCTP_INP_INFO_RUNLOCK(); - return (NULL); - } LIST_FOREACH(stcb, head, sctp_asocs) { SCTP_INP_RLOCK(stcb->sctp_ep); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { @@ -2262,7 +2258,6 @@ sctp_findassociation_addr(struct mbuf *m, int offset, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { - int find_tcp_pool; struct sctp_tcb *stcb; struct sctp_inpcb *inp; @@ -2274,21 +2269,13 @@ sctp_findassociation_addr(struct mbuf *m, int offset, return (stcb); } } - find_tcp_pool = 0; - if ((ch->chunk_type != SCTP_INITIATION) && - (ch->chunk_type != SCTP_INITIATION_ACK) && - (ch->chunk_type != SCTP_COOKIE_ACK) && - (ch->chunk_type != SCTP_COOKIE_ECHO)) { - /* Other chunk types go to the tcp pool. */ - find_tcp_pool = 1; - } if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, - find_tcp_pool, vrf_id); + 1, vrf_id); inp = *inp_p; } else { stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp, - find_tcp_pool, vrf_id); + 1, vrf_id); } SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp); if (stcb == NULL && inp) { @@ -2330,7 +2317,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; - struct sockaddr_storage remote_store; + union sctp_sockstore remote_store; struct sctp_paramhdr parm_buf, *phdr; int ptype; int zero_address = 0; @@ -2349,7 +2336,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset, &parm_buf, sizeof(struct sctp_paramhdr)); if (phdr == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", - __FUNCTION__); + __func__); return NULL; } ptype = (int)((uint32_t) ntohs(phdr->param_type)); @@ -2369,10 +2356,10 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset, &p6_buf.ph, sizeof(*p6)); if (p6 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", - __FUNCTION__); + __func__); return (NULL); } - sin6 = (struct sockaddr_in6 *)&remote_store; + sin6 = &remote_store.sin6; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = sh->src_port; @@ -2396,10 +2383,10 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset, &p4_buf.ph, sizeof(*p4)); if (p4 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", - __FUNCTION__); + __func__); return (NULL); } - sin = (struct sockaddr_in *)&remote_store; + sin = &remote_store.sin; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = sh->src_port; @@ -2422,7 +2409,7 @@ sctp_findassociation_ep_asconf(struct mbuf *m, int offset, } } else { stcb = sctp_findassociation_ep_addr(inp_p, - (struct sockaddr *)&remote_store, netp, + &remote_store.sa, netp, dst, NULL); } return (stcb); @@ -2482,8 +2469,18 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) inp->sctp_associd_counter = 1; inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + inp->max_cwnd = 0; inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); - inp->sctp_ecn_enable = SCTP_BASE_SYSCTL(sctp_ecn_enable); + inp->ecn_supported = (uint8_t) 
SCTP_BASE_SYSCTL(sctp_ecn_enable); + inp->prsctp_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pr_enable); + inp->auth_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_auth_enable); + inp->asconf_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_asconf_enable); + inp->reconfig_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_reconfig_enable); + inp->nrsack_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_nrsack_enable); + inp->pktdrop_supported = (uint8_t) SCTP_BASE_SYSCTL(sctp_pktdrop_enable); + inp->idata_supported = 0; + + inp->fibnum = so->so_fibnum; /* init the small hash table we use to track asocid <-> tcb */ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); if (inp->sctp_asocidhash == NULL) { @@ -2493,14 +2490,7 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) return (ENOBUFS); } #ifdef IPSEC - { - struct inpcbpolicy *pcb_sp = NULL; - - error = ipsec_init_policy(so, &pcb_sp); - /* Arrange to share the policy */ - inp->ip_inp.inp.inp_sp = pcb_sp; - ((struct in6pcb *)(&inp->ip_inp.inp))->in6p_sp = pcb_sp; - } + error = ipsec_init_policy(so, &inp->ip_inp.inp.inp_sp); if (error != 0) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); @@ -2534,6 +2524,9 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); +#ifdef IPSEC + ipsec_delete_pcbpolicy(&inp->ip_inp.inp); +#endif SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (EOPNOTSUPP); } @@ -2554,6 +2547,9 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); +#ifdef IPSEC + ipsec_delete_pcbpolicy(&inp->ip_inp.inp); +#endif SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (ENOBUFS); } @@ -2647,12 +2643,15 @@ sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) */ m->local_hmacs = sctp_default_supported_hmaclist(); m->local_auth_chunks = sctp_alloc_chunklist(); + if (inp->asconf_supported) { + sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks); + sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks); + } m->default_dscp = 0; #ifdef INET6 m->default_flowlabel = 0; #endif m->port = 0; /* encapsulation disabled by default */ - sctp_auth_set_default_chunks(m->local_auth_chunks); LIST_INIT(&m->shared_keys); /* add default NULL key as key id 0 */ null_key = sctp_alloc_sharedkey(); @@ -2786,6 +2785,45 @@ sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, SCTP_INP_WUNLOCK(old_inp); } +/* + * insert an laddr entry with the given ifa for the desired list + */ +static int +sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) +{ + struct sctp_laddr *laddr; + + laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (laddr == NULL) { + /* out of memory? 
*/ + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + SCTP_INCR_LADDR_COUNT(); + bzero(laddr, sizeof(*laddr)); + (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); + laddr->ifa = ifa; + laddr->action = act; + atomic_add_int(&ifa->refcount, 1); + /* insert it */ + LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); + + return (0); +} + +/* + * Remove an laddr entry from the local address list (on an assoc) + */ +static void +sctp_remove_laddr(struct sctp_laddr *laddr) +{ + + /* remove from the list */ + LIST_REMOVE(laddr, sctp_nxt_addr); + sctp_free_ifa(laddr->ifa); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); + SCTP_DECR_LADDR_COUNT(); +} @@ -3117,31 +3155,21 @@ continue_anyway: * too (before adding). */ struct sctp_ifa *ifa; - struct sockaddr_storage store_sa; + union sctp_sockstore store; - memset(&store_sa, 0, sizeof(store_sa)); + memset(&store, 0, sizeof(store)); switch (addr->sa_family) { #ifdef INET case AF_INET: - { - struct sockaddr_in *sin; - - sin = (struct sockaddr_in *)&store_sa; - memcpy(sin, addr, sizeof(struct sockaddr_in)); - sin->sin_port = 0; - break; - } + memcpy(&store.sin, addr, sizeof(struct sockaddr_in)); + store.sin.sin_port = 0; + break; #endif #ifdef INET6 case AF_INET6: - { - struct sockaddr_in6 *sin6; - - sin6 = (struct sockaddr_in6 *)&store_sa; - memcpy(sin6, addr, sizeof(struct sockaddr_in6)); - sin6->sin6_port = 0; - break; - } + memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6)); + store.sin6.sin6_port = 0; + break; #endif default: break; @@ -3159,7 +3187,7 @@ continue_anyway: * pass things in via the sctp_ifap argument * (Panda). */ - ifa = sctp_find_ifa_by_addr((struct sockaddr *)&store_sa, + ifa = sctp_find_ifa_by_addr(&store.sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } if (ifa == NULL) { @@ -3418,7 +3446,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) } else if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && (asoc->asoc.stream_queue_cnt == 0)) { - if (asoc->asoc.locked_on_sending) { + if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) && @@ -3450,22 +3478,11 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) } } else { /* mark into shutdown pending */ - struct sctp_stream_queue_pending *sp; - asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING; sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, asoc->asoc.primary_destination); - if (asoc->asoc.locked_on_sending) { - sp = TAILQ_LAST(&((asoc->asoc.locked_on_sending)->outqueue), - sctp_streamhead); - if (sp == NULL) { - SCTP_PRINTF("Error, sp is NULL, locked on sending is %p strm:%d\n", - (void *)asoc->asoc.locked_on_sending, - asoc->asoc.locked_on_sending->stream_no); - } else { - if ((sp->length == 0) && (sp->msg_is_complete == 0)) - asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT; - } + if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { + asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT; } if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && @@ -3550,7 +3567,8 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { + if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, + SCTP_FROM_SCTP_PCB + 
SCTP_LOC_8) == 0) { cnt++; } } @@ -3637,8 +3655,7 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) * no need to free the net count, since at this point all * assoc's are gone. */ - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq); - SCTP_DECR_READQ_COUNT(); + sctp_free_a_readq(NULL, sq); } /* Now the sctp_pcb things */ /* @@ -3646,13 +3663,9 @@ sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) * macro here since le_next will get freed as part of the * sctp_free_assoc() call. */ - if (so) { #ifdef IPSEC - ipsec_delete_pcbpolicy(ip_pcb); -#endif /* IPSEC */ - - /* Unlocks not needed since the socket is gone now */ - } + ipsec_delete_pcbpolicy(ip_pcb); +#endif if (ip_pcb->inp_options) { (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; @@ -3746,7 +3759,7 @@ sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) */ int sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, - struct sctp_nets **netp, int set_scope, int from) + struct sctp_nets **netp, uint16_t port, int set_scope, int from) { /* * The following is redundant to the same lines in the @@ -3799,13 +3812,9 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, /* assure len is set */ sin->sin_len = sizeof(struct sockaddr_in); if (set_scope) { -#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE - stcb->asoc.scope.ipv4_local_scope = 1; -#else if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { stcb->asoc.scope.ipv4_local_scope = 1; } -#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */ } else { /* Validate the address is in scope */ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && @@ -3928,7 +3937,7 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, stcb->asoc.numnets++; net->ref_count = 1; net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1; - net->port = stcb->asoc.port; + net->port = port; net->dscp = stcb->asoc.default_dscp; #ifdef INET6 net->flowlabel = stcb->asoc.default_flowlabel; @@ -3960,7 +3969,9 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, sin6->sin6_scope_id = 0; } #endif - SCTP_RTALLOC((sctp_route_t *) & net->ro, stcb->asoc.vrf_id); + SCTP_RTALLOC((sctp_route_t *) & net->ro, + stcb->asoc.vrf_id, + stcb->sctp_ep->fibnum); if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { /* Get source address */ @@ -3970,9 +3981,14 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, net, 0, stcb->asoc.vrf_id); - /* Now get the interface MTU */ - if (net->ro._s_addr && net->ro._s_addr->ifn_p) { - net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p); + if (net->ro._s_addr != NULL) { + net->src_addr_selected = 1; + /* Now get the interface MTU */ + if (net->ro._s_addr->ifn_p != NULL) { + net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p); + } + } else { + net->src_addr_selected = 0; } if (net->mtu > 0) { uint32_t rmtu; @@ -3994,6 +4010,8 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, net->mtu = rmtu; } } + } else { + net->src_addr_selected = 0; } if (net->mtu == 0) { switch (newaddr->sa_family) { @@ -4011,14 +4029,16 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, break; } } +#if defined(INET) || defined(INET6) if (net->port) { net->mtu -= (uint32_t) sizeof(struct udphdr); } +#endif if (from == SCTP_ALLOC_ASOC) { stcb->asoc.smallest_mtu = net->mtu; } if (stcb->asoc.smallest_mtu > net->mtu) { - stcb->asoc.smallest_mtu = net->mtu; + sctp_pathmtu_adjustment(stcb, net->mtu); } #ifdef INET6 if (newaddr->sa_family == AF_INET6) { @@ 
-4039,14 +4059,11 @@ sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, */ net->find_pseudo_cumack = 1; net->find_rtx_pseudo_cumack = 1; - net->src_addr_selected = 0; /* Choose an initial flowid. */ net->flowid = stcb->asoc.my_vtag ^ ntohs(stcb->rport) ^ ntohs(stcb->sctp_ep->sctp_lport); -#ifdef INVARIANTS - net->flowidset = 1; -#endif + net->flowtype = M_HASHTYPE_OPAQUE_HASH; if (netp) { *netp = net; } @@ -4167,6 +4184,7 @@ try_again: struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t vrf_id, + uint16_t o_streams, uint16_t port, struct thread *p ) { @@ -4325,7 +4343,7 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, /* setup back pointer's */ stcb->sctp_ep = inp; stcb->sctp_socket = inp->sctp_socket; - if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id))) { + if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id, o_streams))) { /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); @@ -4359,7 +4377,7 @@ sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_INP_INFO_WUNLOCK(); - if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) { + if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) { /* failure.. memory error? */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); @@ -4625,6 +4643,45 @@ sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t } } +void +sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh) +{ + struct sctp_tmit_chunk *chk, *nchk; + struct sctp_queued_to_read *ctl, *nctl; + + TAILQ_FOREACH_SAFE(ctl, rh, next_instrm, nctl) { + TAILQ_REMOVE(rh, ctl, next_instrm); + ctl->on_strm_q = 0; + if (ctl->on_read_q == 0) { + sctp_free_remote_addr(ctl->whoFrom); + if (ctl->data) { + sctp_m_freem(ctl->data); + ctl->data = NULL; + } + } + /* Reassembly free? */ + TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) { + TAILQ_REMOVE(&ctl->reasm, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); + sctp_free_remote_addr(chk->whoTo); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + } + /* + * We don't free the address here since all the net's were + * freed above. 
+ */ + if (ctl->on_read_q == 0) { + sctp_free_a_readq(stcb, ctl); + } + } +} /*- @@ -4925,7 +4982,9 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre outs = &asoc->strmout[i]; /* now clean up any chunks here */ TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { + atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 0); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { if (so) { @@ -4962,8 +5021,7 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre sq->whoFrom = NULL; sq->stcb = NULL; /* Free the ctl entry */ - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq); - SCTP_DECR_READQ_COUNT(); + sctp_free_a_readq(stcb, sq); /* sa_ignore FREED_MEMORY */ } TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) { @@ -5076,20 +5134,6 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; - } - if (chk->holds_key_ref) - sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); - sctp_free_remote_addr(chk->whoTo); - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); - SCTP_DECR_CHK_COUNT(); - /* sa_ignore FREED_MEMORY */ - } - if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; @@ -5105,23 +5149,9 @@ sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfre } asoc->strm_realoutsize = asoc->streamoutcnt = 0; if (asoc->strmin) { - struct sctp_queued_to_read *ctl, *nctl; - for (i = 0; i < asoc->streamincnt; i++) { - TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[i].inqueue, next, nctl) { - TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next); - sctp_free_remote_addr(ctl->whoFrom); - if (ctl->data) { - sctp_m_freem(ctl->data); - ctl->data = NULL; - } - /* - * We don't free the address here since all - * the net's were freed above. - */ - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); - SCTP_DECR_READQ_COUNT(); - } + sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); + sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); asoc->strmin = NULL; @@ -5302,7 +5332,7 @@ sctp_update_ep_vflag(struct sctp_inpcb *inp) LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", - __FUNCTION__); + __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { @@ -5333,6 +5363,7 @@ void sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) { struct sctp_laddr *laddr; + struct sctp_tcb *stcb; int fnd, error = 0; fnd = 0; @@ -5378,6 +5409,9 @@ sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t ac default: break; } + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + sctp_add_local_addr_restricted(stcb, ifa); + } } return; } @@ -5408,7 +5442,7 @@ sctp_select_primary_destination(struct sctp_tcb *stcb) /* - * Delete the address from the endpoint local address list There is nothing + * Delete the address from the endpoint local address list. 
There is nothing * to be done if we are bound to all addresses */ void @@ -5459,8 +5493,7 @@ sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) * to laddr */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - if (net->ro._s_addr && - (net->ro._s_addr->ifa == laddr->ifa)) { + if (net->ro._s_addr == laddr->ifa) { /* Yep, purge src address selected */ sctp_rtentry_t *rt; @@ -5523,46 +5556,6 @@ sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) return; } -/* - * insert an laddr entry with the given ifa for the desired list - */ -int -sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) -{ - struct sctp_laddr *laddr; - - laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); - if (laddr == NULL) { - /* out of memory? */ - SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); - return (EINVAL); - } - SCTP_INCR_LADDR_COUNT(); - bzero(laddr, sizeof(*laddr)); - (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); - laddr->ifa = ifa; - laddr->action = act; - atomic_add_int(&ifa->refcount, 1); - /* insert it */ - LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); - - return (0); -} - -/* - * Remove an laddr entry from the local address list (on an assoc) - */ -void -sctp_remove_laddr(struct sctp_laddr *laddr) -{ - - /* remove from the list */ - LIST_REMOVE(laddr, sctp_nxt_addr); - sctp_free_ifa(laddr->ifa); - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); - SCTP_DECR_LADDR_COUNT(); -} - /* * Remove a local address from the TCB local address restricted list */ @@ -5774,7 +5767,7 @@ sctp_pcb_init() { /* * SCTP initialization for the PCB structures should be called by - * the sctp_init() funciton. + * the sctp_init() function. */ int i; struct timeval tv; @@ -5933,12 +5926,32 @@ sctp_pcb_finish(void) int i; struct sctp_iterator *it, *nit; + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + SCTP_PRINTF("%s: race condition on teardown.\n", __func__); + return; + } + SCTP_BASE_VAR(sctp_pcb_initialized) = 0; /* * In FreeBSD the iterator thread never exits but we do clean up. * The only way FreeBSD reaches here is if we have VRF's but we * still add the ifdef to make it compile on old versions. */ +retry: SCTP_IPI_ITERATOR_WQ_LOCK(); + /* + * sctp_iterator_worker() might be working on an it entry without + * holding the lock. We won't find it on the list either and + * continue and free/destroy it. While holding the lock, spin, to + * avoid the race condition as sctp_iterator_worker() will have to + * wait to re-acquire the lock. + */ + if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) { + SCTP_IPI_ITERATOR_WQ_UNLOCK(); + SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. 
" + "cur_it=%p\n", __func__, sctp_it_ctl.cur_it); + DELAY(10); + goto retry; + } TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; @@ -5956,11 +5969,14 @@ sctp_pcb_finish(void) sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } SCTP_ITERATOR_UNLOCK(); - SCTP_OS_TIMER_STOP(&SCTP_BASE_INFO(addr_wq_timer.timer)); + SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer)); SCTP_WQ_ADDR_LOCK(); LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) { LIST_REMOVE(wi, sctp_nxt_addr); SCTP_DECR_LADDR_COUNT(); + if (wi->action == SCTP_DEL_IP_ADDRESS) { + SCTP_FREE(wi->ifa, SCTP_M_IFA); + } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi); } SCTP_WQ_ADDR_UNLOCK(); @@ -6020,6 +6036,14 @@ sctp_pcb_finish(void) SCTP_WQ_ADDR_DESTROY(); + /* Get rid of other stuff too. */ + if (SCTP_BASE_INFO(sctp_asochash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); + if (SCTP_BASE_INFO(sctp_ephash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); + if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); @@ -6029,13 +6053,6 @@ sctp_pcb_finish(void) SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); - /* Get rid of other stuff to */ - if (SCTP_BASE_INFO(sctp_asochash) != NULL) - SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); - if (SCTP_BASE_INFO(sctp_ephash) != NULL) - SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); - if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) - SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); #if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE); #endif @@ -6046,7 +6063,7 @@ int sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, int offset, int limit, struct sockaddr *src, struct sockaddr *dst, - struct sockaddr *altsa) + struct sockaddr *altsa, uint16_t port) { /* * grub through the INIT pulling addresses and loading them to the @@ -6075,7 +6092,15 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, sctp_key_t *new_key; uint32_t keylen; int got_random = 0, got_hmacs = 0, got_chklist = 0; - uint8_t ecn_allowed; + uint8_t peer_supports_ecn; + uint8_t peer_supports_prsctp; + uint8_t peer_supports_auth; + uint8_t peer_supports_asconf; + uint8_t peer_supports_asconf_ack; + uint8_t peer_supports_reconfig; + uint8_t peer_supports_nrsack; + uint8_t peer_supports_pktdrop; + uint8_t peer_supports_idata; #ifdef INET struct sockaddr_in sin; @@ -6104,8 +6129,14 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, } else { sa = src; } - /* Turn off ECN until we get through all params */ - ecn_allowed = 0; + peer_supports_idata = 0; + peer_supports_ecn = 0; + peer_supports_prsctp = 0; + peer_supports_auth = 0; + peer_supports_asconf = 0; + peer_supports_reconfig = 0; + peer_supports_nrsack = 0; + peer_supports_pktdrop = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* mark all addresses that we have currently on the list */ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; @@ -6123,7 +6154,7 @@ sctp_load_addresses_from_init(struct sctp_tcb 
*stcb, struct mbuf *m, #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { + if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { return (-1); } } @@ -6132,7 +6163,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { + if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { return (-2); } } @@ -6155,12 +6186,6 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, /* the assoc was freed? */ return (-4); } - /* - * peer must explicitly turn this on. This may have been initialized - * to be "on" in order to allow local addr changes while INIT's are - * in flight. - */ - stcb->asoc.peer_supports_asconf = 0; /* now we must go through each of the params. */ phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); while (phdr) { @@ -6223,7 +6248,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, /* the assoc was freed? */ return (-7); } - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { + if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { return (-8); } } else if (stcb_tmp == stcb) { @@ -6243,12 +6268,20 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, */ if (stcb_tmp) { if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + /* * in setup state we * abort this guy */ + snprintf(msg, sizeof(msg), + "%s:%d at %s", __FILE__, __LINE__, __func__); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + msg); sctp_abort_an_association(stcb_tmp->sctp_ep, - stcb_tmp, NULL, SCTP_SO_NOT_LOCKED); + stcb_tmp, op_err, + SCTP_SO_NOT_LOCKED); goto add_it_now; } SCTP_TCB_UNLOCK(stcb_tmp); @@ -6310,7 +6343,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, * we must add the address, no scope * set */ - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { + if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { return (-17); } } else if (stcb_tmp == stcb) { @@ -6332,18 +6365,26 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, * strange, address is in another * assoc? straighten out locks. */ - if (stcb_tmp) + if (stcb_tmp) { if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + struct mbuf *op_err; + char msg[SCTP_DIAG_INFO_LEN]; + /* * in setup state we * abort this guy */ + snprintf(msg, sizeof(msg), + "%s:%d at %s", __FILE__, __LINE__, __func__); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + msg); sctp_abort_an_association(stcb_tmp->sctp_ep, - stcb_tmp, NULL, SCTP_SO_NOT_LOCKED); + stcb_tmp, op_err, + SCTP_SO_NOT_LOCKED); goto add_it_now6; } - SCTP_TCB_UNLOCK(stcb_tmp); - + SCTP_TCB_UNLOCK(stcb_tmp); + } if (stcb->asoc.state == 0) { /* the assoc was freed? 
*/ return (-21); @@ -6354,7 +6395,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, } else #endif if (ptype == SCTP_ECN_CAPABLE) { - ecn_allowed = 1; + peer_supports_ecn = 1; } else if (ptype == SCTP_ULP_ADAPTATION) { if (stcb->asoc.state != SCTP_STATE_OPEN) { struct sctp_adaptation_layer_indication ai, @@ -6378,7 +6419,9 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, #endif - stcb->asoc.peer_supports_asconf = 1; + if (stcb->asoc.asconf_supported == 0) { + return (-100); + } if (plen > sizeof(lstore)) { return (-23); } @@ -6430,7 +6473,7 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, stcb->asoc.peer_supports_nat = 1; } else if (ptype == SCTP_PRSCTP_SUPPORTED) { /* Peer supports pr-sctp */ - stcb->asoc.peer_supports_prsctp = 1; + peer_supports_prsctp = 1; } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; @@ -6442,34 +6485,33 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, if (phdr == NULL) { return (-25); } - stcb->asoc.peer_supports_asconf = 0; - stcb->asoc.peer_supports_prsctp = 0; - stcb->asoc.peer_supports_pktdrop = 0; - stcb->asoc.peer_supports_strreset = 0; - stcb->asoc.peer_supports_nr_sack = 0; - stcb->asoc.peer_supports_auth = 0; pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: + peer_supports_asconf = 1; + break; case SCTP_ASCONF_ACK: - stcb->asoc.peer_supports_asconf = 1; + peer_supports_asconf_ack = 1; break; case SCTP_FORWARD_CUM_TSN: - stcb->asoc.peer_supports_prsctp = 1; + peer_supports_prsctp = 1; break; case SCTP_PACKET_DROPPED: - stcb->asoc.peer_supports_pktdrop = 1; + peer_supports_pktdrop = 1; break; case SCTP_NR_SELECTIVE_ACK: - stcb->asoc.peer_supports_nr_sack = 1; + peer_supports_nrsack = 1; break; case SCTP_STREAM_RESET: - stcb->asoc.peer_supports_strreset = 1; + peer_supports_reconfig = 1; break; case SCTP_AUTHENTICATION: - stcb->asoc.peer_supports_auth = 1; + peer_supports_auth = 1; + break; + case SCTP_IDATA: + peer_supports_idata = 1; break; default: /* one I have not learned yet */ @@ -6498,8 +6540,8 @@ sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { - int num_hmacs; - int i; + uint16_t num_hmacs; + uint16_t i; if (plen > sizeof(hmacs_store)) break; @@ -6606,24 +6648,51 @@ next_param: } } } - if (ecn_allowed == 0) { - stcb->asoc.ecn_allowed = 0; + if ((stcb->asoc.ecn_supported == 1) && + (peer_supports_ecn == 0)) { + stcb->asoc.ecn_supported = 0; } - /* validate authentication required parameters */ - if (got_random && got_hmacs) { - stcb->asoc.peer_supports_auth = 1; - } else { - stcb->asoc.peer_supports_auth = 0; + if ((stcb->asoc.prsctp_supported == 1) && + (peer_supports_prsctp == 0)) { + stcb->asoc.prsctp_supported = 0; + } + if ((stcb->asoc.auth_supported == 1) && + ((peer_supports_auth == 0) || + (got_random == 0) || (got_hmacs == 0))) { + stcb->asoc.auth_supported = 0; + } + if ((stcb->asoc.asconf_supported == 1) && + ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) || + (stcb->asoc.auth_supported == 0) || + (saw_asconf == 0) || (saw_asconf_ack == 0))) { + stcb->asoc.asconf_supported = 0; + } + if ((stcb->asoc.reconfig_supported == 1) && + (peer_supports_reconfig == 0)) { + stcb->asoc.reconfig_supported = 0; } - if 
(!stcb->asoc.peer_supports_auth && got_chklist) { + if ((stcb->asoc.idata_supported == 1) && + (peer_supports_idata == 0)) { + stcb->asoc.idata_supported = 0; + } + if ((stcb->asoc.nrsack_supported == 1) && + (peer_supports_nrsack == 0)) { + stcb->asoc.nrsack_supported = 0; + } + if ((stcb->asoc.pktdrop_supported == 1) && + (peer_supports_pktdrop == 0)) { + stcb->asoc.pktdrop_supported = 0; + } + /* validate authentication required parameters */ + if ((peer_supports_auth == 0) && (got_chklist == 1)) { /* peer does not support auth but sent a chunks list? */ return (-31); } - if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && stcb->asoc.peer_supports_asconf && - !stcb->asoc.peer_supports_auth) { + if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) { /* peer supports asconf but not auth? */ return (-32); - } else if ((stcb->asoc.peer_supports_asconf) && (stcb->asoc.peer_supports_auth) && + } else if ((peer_supports_asconf == 1) && + (peer_supports_auth == 1) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-33); } @@ -6718,10 +6787,6 @@ sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval * SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; - if (head == NULL) { - /* invalid vtag */ - goto skip_vtag_check; - } LIST_FOREACH(stcb, head, sctp_asocs) { /* * We choose not to lock anything here. TCB's can't be @@ -6745,8 +6810,6 @@ sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval * return (0); } } -skip_vtag_check: - chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; /* Now what about timed wait ? */ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { @@ -6803,26 +6866,15 @@ sctp_drain_mbufs(struct sctp_tcb *stcb) SCTP_STAT_INCR(sctps_protocol_drains_done); cumulative_tsn_p1 = asoc->cumulative_tsn + 1; cnt = 0; - /* First look in the re-assembly queue */ - TAILQ_FOREACH_SAFE(chk, &asoc->reasmqueue, sctp_next, nchk) { - if (SCTP_TSN_GT(chk->rec.data.TSN_seq, cumulative_tsn_p1)) { - /* Yep it is above cum-ack */ - cnt++; - SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn); - asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size); - sctp_ucount_decr(asoc->cnt_on_reasm_queue); - SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); - TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); - if (chk->data) { - sctp_m_freem(chk->data); - chk->data = NULL; - } - sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); - } - } /* Ok that was fun, now we will drain all the inbound streams? 
*/ for (strmat = 0; strmat < asoc->streamincnt; strmat++) { - TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].inqueue, next, nctl) { + TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].inqueue, next_instrm, nctl) { +#ifdef INVARIANTS + if (ctl->on_strm_q != SCTP_ON_ORDERED) { + panic("Huh control: %p on_q: %d -- not ordered?", + ctl, ctl->on_strm_q); + } +#endif if (SCTP_TSN_GT(ctl->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; @@ -6830,14 +6882,74 @@ sctp_drain_mbufs(struct sctp_tcb *stcb) asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length); sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); - TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next); + if (ctl->on_read_q) { + TAILQ_REMOVE(&stcb->sctp_ep->read_queue, ctl, next); + ctl->on_read_q = 0; + } + TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next_instrm); + ctl->on_strm_q = 0; if (ctl->data) { sctp_m_freem(ctl->data); ctl->data = NULL; } sctp_free_remote_addr(ctl->whoFrom); - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); - SCTP_DECR_READQ_COUNT(); + /* Now its reasm? */ + TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) { + cnt++; + SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn); + asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size); + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + TAILQ_REMOVE(&ctl->reasm, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + } + sctp_free_a_readq(stcb, ctl); + } + } + TAILQ_FOREACH_SAFE(ctl, &asoc->strmin[strmat].uno_inqueue, next_instrm, nctl) { +#ifdef INVARIANTS + if (ctl->on_strm_q != SCTP_ON_UNORDERED) { + panic("Huh control: %p on_q: %d -- not unordered?", + ctl, ctl->on_strm_q); + } +#endif + if (SCTP_TSN_GT(ctl->sinfo_tsn, cumulative_tsn_p1)) { + /* Yep it is above cum-ack */ + cnt++; + SCTP_CALC_TSN_TO_GAP(gap, ctl->sinfo_tsn, asoc->mapping_array_base_tsn); + asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length); + sctp_ucount_decr(asoc->cnt_on_all_streams); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + if (ctl->on_read_q) { + TAILQ_REMOVE(&stcb->sctp_ep->read_queue, ctl, next); + ctl->on_read_q = 0; + } + TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, ctl, next_instrm); + ctl->on_strm_q = 0; + if (ctl->data) { + sctp_m_freem(ctl->data); + ctl->data = NULL; + } + sctp_free_remote_addr(ctl->whoFrom); + /* Now its reasm? 
*/ + TAILQ_FOREACH_SAFE(chk, &ctl->reasm, sctp_next, nchk) { + cnt++; + SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn); + asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size); + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + TAILQ_REMOVE(&ctl->reasm, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); + } + sctp_free_a_readq(stcb, ctl); } } } @@ -6962,6 +7074,11 @@ sctp_initiate_iterator(inp_func inpf, if (af == NULL) { return (-1); } + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + SCTP_PRINTF("%s: abort on initialize being %d\n", __func__, + SCTP_BASE_VAR(sctp_pcb_initialized)); + return (-1); + } SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator), SCTP_M_ITER); if (it == NULL) { @@ -7000,7 +7117,13 @@ sctp_initiate_iterator(inp_func inpf, } SCTP_IPI_ITERATOR_WQ_LOCK(); - + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + SCTP_IPI_ITERATOR_WQ_UNLOCK(); + SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__, + SCTP_BASE_VAR(sctp_pcb_initialized), it); + SCTP_FREE(it, SCTP_M_ITER); + return (-1); + } TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (sctp_it_ctl.iterator_running == 0) { sctp_wakeup_iterator(); diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h index 8045765c..98204096 100644 --- a/freebsd/sys/netinet/sctp_pcb.h +++ b/freebsd/sys/netinet/sctp_pcb.h @@ -107,7 +107,7 @@ struct sctp_ifa { * that we MUST lock appropriate locks. This * is for V6. */ union sctp_sockstore address; - uint32_t refcount; /* number of folks refering to this */ + uint32_t refcount; /* number of folks referring to this */ uint32_t flags; uint32_t localifa_flags; uint32_t vrf_id; /* vrf_id of this addr (for deleting) */ @@ -360,7 +360,7 @@ struct sctp_pcbtsn_rlog { struct sctp_inpcb { /*- * put an inpcb in front of it all, kind of a waste but we need to - * for compatability with all the other stuff. + * for compatibility with all the other stuff. 
*/ union { struct inpcb inp; @@ -404,9 +404,17 @@ struct sctp_inpcb { uint32_t sctp_frag_point; uint32_t partial_delivery_point; uint32_t sctp_context; + uint32_t max_cwnd; uint8_t local_strreset_support; uint32_t sctp_cmt_on_off; - uint32_t sctp_ecn_enable; + uint8_t ecn_supported; + uint8_t prsctp_supported; + uint8_t auth_supported; + uint8_t idata_supported; + uint8_t asconf_supported; + uint8_t reconfig_supported; + uint8_t nrsack_supported; + uint8_t pktdrop_supported; struct sctp_nonpad_sndrcvinfo def_send; /*- * These three are here for the sosend_dgram @@ -423,6 +431,7 @@ struct sctp_inpcb { struct mtx inp_rdata_mtx; int32_t refcount; uint32_t def_vrf_id; + uint16_t fibnum; uint32_t total_sends; uint32_t total_recvs; uint32_t last_abort_code; @@ -576,7 +585,7 @@ void sctp_inpcb_free(struct sctp_inpcb *, int, int); struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *, - int *, uint32_t, uint32_t, struct thread *); + int *, uint32_t, uint32_t, uint16_t, uint16_t, struct thread *); int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int); @@ -590,13 +599,9 @@ void void sctp_add_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *, uint32_t); -int sctp_insert_laddr(struct sctpladdr *, struct sctp_ifa *, uint32_t); - -void sctp_remove_laddr(struct sctp_laddr *); - void sctp_del_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *); -int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets **, int, int); +int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, struct sctp_nets **, uint16_t, int, int); void sctp_remove_net(struct sctp_tcb *, struct sctp_nets *); @@ -611,7 +616,7 @@ void sctp_del_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); int sctp_load_addresses_from_init(struct sctp_tcb *, struct mbuf *, int, int, - struct sockaddr *, struct sockaddr *, struct sockaddr *); + struct sockaddr *, struct sockaddr *, struct sockaddr *, uint16_t); int sctp_set_primary_addr(struct sctp_tcb *, struct sockaddr *, @@ -625,6 +630,8 @@ int sctp_destination_is_reachable(struct sctp_tcb *, struct sockaddr *); int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp); +void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh); + /*- * Null in last arg inpcb indicate run on ALL ep's. Specific inp in last arg * indicates run on ONLY assoc's of the specified endpoint. 
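
The prototype changes above track the new per-extension negotiation model: the single sctp_ecn_enable knob on the endpoint is replaced by one uint8_t flag per feature (ecn_supported, prsctp_supported, auth_supported, and so on), seeded from the sysctl defaults in sctp_inpcb_alloc() and then cleared during INIT processing for anything the peer did not advertise. A minimal sketch of that reconciliation pattern follows; it assumes <stdint.h> (or the kernel's sys/types.h), and the helper name is hypothetical, not part of this patch:

/*
 * Hypothetical helper: a feature stays enabled only if the local
 * endpoint turned it on (sysctl default) and the peer advertised
 * support for it in its INIT/INIT-ACK parameters.
 */
static void
example_reconcile_feature(uint8_t *local_supported, uint8_t peer_supported)
{
	if ((*local_supported == 1) && (peer_supported == 0)) {
		/* Peer cannot do it, so the association must not use it. */
		*local_supported = 0;
	}
}

sctp_load_addresses_from_init() applies this pattern by hand to each of the peer_supports_* variables it collects while walking the INIT parameters.
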
@@ -646,11 +653,5 @@ void #endif -#ifdef INVARIANTS -void - sctp_validate_no_locks(struct sctp_inpcb *inp); - -#endif - #endif /* _KERNEL */ #endif /* !__sctp_pcb_h__ */ diff --git a/freebsd/sys/netinet/sctp_peeloff.c b/freebsd/sys/netinet/sctp_peeloff.c index e8bb0444..3603e41a 100644 --- a/freebsd/sys/netinet/sctp_peeloff.c +++ b/freebsd/sys/netinet/sctp_peeloff.c @@ -120,9 +120,16 @@ sctp_do_peeloff(struct socket *head, struct socket *so, sctp_assoc_t assoc_id) n_inp->sctp_mobility_features = inp->sctp_mobility_features; n_inp->sctp_frag_point = inp->sctp_frag_point; n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off; - n_inp->sctp_ecn_enable = inp->sctp_ecn_enable; + n_inp->ecn_supported = inp->ecn_supported; + n_inp->prsctp_supported = inp->prsctp_supported; + n_inp->auth_supported = inp->auth_supported; + n_inp->asconf_supported = inp->asconf_supported; + n_inp->reconfig_supported = inp->reconfig_supported; + n_inp->nrsack_supported = inp->nrsack_supported; + n_inp->pktdrop_supported = inp->pktdrop_supported; n_inp->partial_delivery_point = inp->partial_delivery_point; n_inp->sctp_context = inp->sctp_context; + n_inp->max_cwnd = inp->max_cwnd; n_inp->local_strreset_support = inp->local_strreset_support; n_inp->inp_starting_point_for_iterator = NULL; /* copy in the authentication parameters from the original endpoint */ diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h index a8b86c62..280100bb 100644 --- a/freebsd/sys/netinet/sctp_structs.h +++ b/freebsd/sys/netinet/sctp_structs.h @@ -76,6 +76,7 @@ TAILQ_HEAD(sctpnetlisthead, sctp_nets); struct sctp_stream_reset_list { TAILQ_ENTRY(sctp_stream_reset_list) next_resp; + uint32_t seq; uint32_t tsn; uint32_t number_entries; uint16_t list_of_streams[]; @@ -188,9 +189,12 @@ struct iterator_control { struct sctp_net_route { sctp_rtentry_t *ro_rt; - void *ro_lle; - void *ro_ia; - int ro_flags; + struct llentry *ro_lle; + char *ro_prepend; + uint16_t ro_plen; + uint16_t ro_flags; + uint16_t ro_mtu; + uint16_t spare; union sctp_sockstore _l_addr; /* remote peer addr */ struct sctp_ifa *_s_addr; /* our selected src addr */ }; @@ -380,15 +384,13 @@ struct sctp_nets { uint8_t lan_type; uint8_t rto_needed; uint32_t flowid; -#ifdef INVARIANTS - uint8_t flowidset; -#endif + uint8_t flowtype; }; struct sctp_data_chunkrec { uint32_t TSN_seq; /* the TSN of this transmit */ - uint16_t stream_seq; /* the stream sequence number of this transmit */ + uint32_t stream_seq; /* the stream sequence number of this transmit */ uint16_t stream_number; /* the stream number of this guy */ uint32_t payloadtype; uint32_t context; /* from send */ @@ -399,6 +401,7 @@ struct sctp_data_chunkrec { */ uint32_t fast_retran_tsn; /* sending_seq at the time of FR */ struct timeval timetodrop; /* time we drop it from queue */ + uint32_t fsn_num; /* Fragment Sequence Number */ uint8_t doing_fast_retransmit; uint8_t rcv_flags; /* flags pulled from data chunk on inbound for * outbound holds sending flags for PR-SCTP. */ @@ -418,8 +421,8 @@ TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk); #define CHUNK_FLAGS_FRAGMENT_OK 0x0100 struct chk_id { - uint16_t id; - uint16_t can_take_data; + uint8_t id; + uint8_t can_take_data; }; @@ -450,14 +453,9 @@ struct sctp_tmit_chunk { uint8_t window_probe; }; -/* - * The first part of this structure MUST be the entire sinfo structure. Maybe - * I should have made it a sub structure... we can circle back later and do - * that if we want. 
- */ struct sctp_queued_to_read { /* sinfo structure Pluse more */ uint16_t sinfo_stream; /* off the wire */ - uint16_t sinfo_ssn; /* off the wire */ + uint32_t sinfo_ssn; /* off the wire */ uint16_t sinfo_flags; /* SCTP_UNORDERED from wire use SCTP_EOF for * EOR */ uint32_t sinfo_ppid; /* off the wire */ @@ -467,8 +465,11 @@ struct sctp_queued_to_read { /* sinfo structure Pluse more */ uint32_t sinfo_cumtsn; /* Use this in reassembly as last TSN */ sctp_assoc_t sinfo_assoc_id; /* our assoc id */ /* Non sinfo stuff */ + uint32_t msg_id; /* Fragment Index */ uint32_t length; /* length of data */ uint32_t held_length; /* length held in sb */ + uint32_t top_fsn; /* Highest FSN in queue */ + uint32_t fsn_included; /* Highest FSN in *data portion */ struct sctp_nets *whoFrom; /* where it came from */ struct mbuf *data; /* front of the mbuf chain of data with * PKT_HDR */ @@ -477,14 +478,24 @@ struct sctp_queued_to_read { /* sinfo structure Pluse more */ * take it from us */ struct sctp_tcb *stcb; /* assoc, used for window update */ TAILQ_ENTRY(sctp_queued_to_read) next; + TAILQ_ENTRY(sctp_queued_to_read) next_instrm; + struct sctpchunk_listhead reasm; uint16_t port_from; uint16_t spec_flags; /* Flags to hold the notification field */ uint8_t do_not_ref_stcb; uint8_t end_added; uint8_t pdapi_aborted; + uint8_t pdapi_started; uint8_t some_taken; + uint8_t last_frag_seen; + uint8_t first_frag_seen; + uint8_t on_read_q; + uint8_t on_strm_q; }; +#define SCTP_ON_ORDERED 1 +#define SCTP_ON_UNORDERED 2 + /* This data structure will be on the outbound * stream queues. Data will be pulled off from * the front of the mbuf data and chunk-ified @@ -510,6 +521,7 @@ struct sctp_stream_queue_pending { struct sctp_nets *net; TAILQ_ENTRY(sctp_stream_queue_pending) next; TAILQ_ENTRY(sctp_stream_queue_pending) ss_next; + uint32_t fsn; uint32_t length; uint32_t timetolive; uint32_t ppid; @@ -533,14 +545,17 @@ struct sctp_stream_queue_pending { TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in); struct sctp_stream_in { struct sctp_readhead inqueue; + struct sctp_readhead uno_inqueue; + uint32_t last_sequence_delivered; /* used for re-order */ uint16_t stream_no; - uint16_t last_sequence_delivered; /* used for re-order */ uint8_t delivery_started; + uint8_t pd_api_started; }; TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out); TAILQ_HEAD(sctplist_listhead, sctp_stream_queue_pending); + /* Round-robin schedulers */ struct ss_rr { /* next link in wheel */ @@ -567,9 +582,14 @@ struct ss_fb { * This union holds all data necessary for * different stream schedulers. 
*/ -union scheduling_data { - struct sctpwheel_listhead out_wheel; - struct sctplist_listhead out_list; +struct scheduling_data { + struct sctp_stream_out *locked_on_sending; + /* circular looking for output selection */ + struct sctp_stream_out *last_out_stream; + union { + struct sctpwheel_listhead wheel; + struct sctplist_listhead list; + } out; }; /* @@ -582,14 +602,37 @@ union scheduling_parameters { struct ss_fb fb; }; +/* States for outgoing streams */ +#define SCTP_STREAM_CLOSED 0x00 +#define SCTP_STREAM_OPENING 0x01 +#define SCTP_STREAM_OPEN 0x02 +#define SCTP_STREAM_RESET_PENDING 0x03 +#define SCTP_STREAM_RESET_IN_FLIGHT 0x04 + +#define SCTP_MAX_STREAMS_AT_ONCE_RESET 200 + /* This struct is used to track the traffic on outbound streams */ struct sctp_stream_out { struct sctp_streamhead outqueue; union scheduling_parameters ss_params; - uint32_t chunks_on_queues; + uint32_t chunks_on_queues; /* send queue and sent queue */ +#if defined(SCTP_DETAILED_STR_STATS) + uint32_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1]; + uint32_t abandoned_sent[SCTP_PR_SCTP_MAX + 1]; +#else + /* Only the aggregation */ + uint32_t abandoned_unsent[1]; + uint32_t abandoned_sent[1]; +#endif + /* + * For associations using DATA chunks, the lower 16-bit of + * next_mid_ordered are used as the next SSN. + */ + uint32_t next_mid_ordered; + uint32_t next_mid_unordered; uint16_t stream_no; - uint16_t next_sequence_send; /* next one I expect to send out */ uint8_t last_msg_incomplete; + uint8_t state; }; /* used to keep track of the addresses yet to try to add/delete */ @@ -616,12 +659,13 @@ struct sctp_scoping { struct sctp_tsn_log { void *stcb; uint32_t tsn; + uint32_t seq; uint16_t strm; - uint16_t seq; uint16_t sz; uint16_t flgs; uint16_t in_pos; uint16_t in_out; + uint16_t resv; }; #define SCTP_FS_SPEC_LOG_SIZE 200 @@ -697,7 +741,7 @@ struct sctp_ss_functions { int holds_lock); void (*sctp_ss_clear) (struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values, int holds_lock); - void (*sctp_ss_init_stream) (struct sctp_stream_out *strq, struct sctp_stream_out *with_strq); + void (*sctp_ss_init_stream) (struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq); void (*sctp_ss_add_to_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); int (*sctp_ss_is_empty) (struct sctp_tcb *stcb, struct sctp_association *asoc); @@ -713,6 +757,7 @@ struct sctp_ss_functions { struct sctp_stream_out *strq, uint16_t * value); int (*sctp_ss_set_value) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, uint16_t value); + int (*sctp_ss_is_user_msgs_incomplete) (struct sctp_tcb *stcb, struct sctp_association *asoc); }; /* used to save ASCONF chunks for retransmission */ @@ -792,19 +837,8 @@ struct sctp_association { struct sctpchunk_listhead sent_queue; struct sctpchunk_listhead send_queue; - /* re-assembly queue for fragmented chunks on the inbound path */ - struct sctpchunk_listhead reasmqueue; - /* Scheduling queues */ - union scheduling_data ss_data; - - /* - * This pointer will be set to NULL most of the time. But when we - * have a fragmented message, where we could not get out all of the - * message at the last send then this will point to the stream to go - * get data from. 
- */ - struct sctp_stream_out *locked_on_sending; + struct scheduling_data ss_data; /* If an iterator is looking at me, this is it */ struct sctp_iterator *stcb_starting_point_for_iterator; @@ -837,8 +871,6 @@ struct sctp_association { /* last place I got a control from */ struct sctp_nets *last_control_chunk_from; - /* circular looking for output selection */ - struct sctp_stream_out *last_out_stream; /* * wait to the point the cum-ack passes req->send_reset_at_tsn for @@ -862,7 +894,6 @@ struct sctp_association { uint32_t stream_scheduling_module; uint32_t vrf_id; - uint32_t cookie_preserve_req; /* ASCONF next seq I am sending out, inits at init-tsn */ uint32_t asconf_seq_out; @@ -936,7 +967,7 @@ struct sctp_association { uint32_t sat_t3_recovery_tsn; uint32_t tsn_last_delivered; /* - * For the pd-api we should re-write this a bit more efficent. We + * For the pd-api we should re-write this a bit more efficient. We * could have multiple sctp_queued_to_read's that we are building at * once. Now we only do this when we get ready to deliver to the * socket buffer. Note that we depend on the fact that the struct is @@ -1142,7 +1173,7 @@ struct sctp_association { uint8_t hb_random_idx; uint8_t default_dscp; uint8_t asconf_del_pending; /* asconf delete last addr pending */ - + uint8_t trigger_reset; /* * This value, plus all other ack'd but above cum-ack is added * together to cross check against the bit that we have yet to @@ -1150,34 +1181,24 @@ struct sctp_association { * sum is updated as well. */ - /* Flag to tell if ECN is allowed */ - uint8_t ecn_allowed; + /* Flags whether an extension is supported or not */ + uint8_t ecn_supported; + uint8_t prsctp_supported; + uint8_t auth_supported; + uint8_t asconf_supported; + uint8_t reconfig_supported; + uint8_t nrsack_supported; + uint8_t pktdrop_supported; + uint8_t idata_supported; /* Did the peer make the stream config (add out) request */ uint8_t peer_req_out; - /* flag to indicate if peer can do asconf */ - uint8_t peer_supports_asconf; - /* EY - flag to indicate if peer can do nr_sack */ - uint8_t peer_supports_nr_sack; - /* pr-sctp support flag */ - uint8_t peer_supports_prsctp; - /* peer authentication support flag */ - uint8_t peer_supports_auth; - /* stream resets are supported by the peer */ - uint8_t peer_supports_strreset; uint8_t local_strreset_support; - uint8_t peer_supports_nat; - /* - * packet drop's are supported by the peer, we don't really care - * about this but we bookkeep it anyway. 
- */ - uint8_t peer_supports_pktdrop; struct sctp_scoping scope; /* flags to handle send alternate net tracking */ - uint8_t used_alt_onsack; uint8_t used_alt_asconfack; uint8_t fast_retran_loss_recovery; uint8_t sat_t3_loss_recovery; @@ -1198,12 +1219,11 @@ struct sctp_association { uint8_t sctp_cmt_on_off; uint8_t iam_blocking; uint8_t cookie_how[8]; - /* EY 05/05/08 - NR_SACK variable */ - uint8_t sctp_nr_sack_on_off; /* JRS 5/21/07 - CMT PF variable */ uint8_t sctp_cmt_pf; uint8_t use_precise_time; uint64_t sctp_features; + uint32_t max_cwnd; uint16_t port; /* remote UDP encapsulation port */ /* * The mapping array is used to track out of order sequences above @@ -1222,6 +1242,8 @@ struct sctp_association { uint32_t timoshutdownack; struct timeval start_time; struct timeval discontinuity_time; + uint64_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1]; + uint64_t abandoned_sent[SCTP_PR_SCTP_MAX + 1]; }; #endif diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c index d0da7a6f..8715c69b 100644 --- a/freebsd/sys/netinet/sctp_sysctl.c +++ b/freebsd/sys/netinet/sctp_sysctl.c @@ -43,6 +43,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include + +FEATURE(sctp, "Stream Control Transmission Protocol"); /* * sysctl tunable variables @@ -56,7 +59,12 @@ sctp_init_sysctls() SCTP_BASE_SYSCTL(sctp_auto_asconf) = SCTPCTL_AUTOASCONF_DEFAULT; SCTP_BASE_SYSCTL(sctp_multiple_asconfs) = SCTPCTL_MULTIPLEASCONFS_DEFAULT; SCTP_BASE_SYSCTL(sctp_ecn_enable) = SCTPCTL_ECN_ENABLE_DEFAULT; - SCTP_BASE_SYSCTL(sctp_strict_sacks) = SCTPCTL_STRICT_SACKS_DEFAULT; + SCTP_BASE_SYSCTL(sctp_pr_enable) = SCTPCTL_PR_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_auth_enable) = SCTPCTL_AUTH_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_asconf_enable) = SCTPCTL_ASCONF_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_reconfig_enable) = SCTPCTL_RECONFIG_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_nrsack_enable) = SCTPCTL_NRSACK_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_pktdrop_enable) = SCTPCTL_PKTDROP_ENABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_peer_chunk_oh) = SCTPCTL_PEER_CHKOH_DEFAULT; SCTP_BASE_SYSCTL(sctp_max_burst_default) = SCTPCTL_MAXBURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_fr_max_burst_default) = SCTPCTL_FRMAXBURST_DEFAULT; @@ -86,25 +94,18 @@ sctp_init_sysctls() SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default) = SCTPCTL_INCOMING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT; - /* EY */ - SCTP_BASE_SYSCTL(sctp_nr_sack_on_off) = SCTPCTL_NR_SACK_ON_OFF_DEFAULT; SCTP_BASE_SYSCTL(sctp_cmt_use_dac) = SCTPCTL_CMT_USE_DAC_DEFAULT; SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) = SCTPCTL_CWND_MAXBURST_DEFAULT; - SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) = SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT; - SCTP_BASE_SYSCTL(sctp_auth_disable) = SCTPCTL_AUTH_DISABLE_DEFAULT; SCTP_BASE_SYSCTL(sctp_nat_friendly) = SCTPCTL_NAT_FRIENDLY_DEFAULT; SCTP_BASE_SYSCTL(sctp_L2_abc_variable) = SCTPCTL_ABC_L_VAR_DEFAULT; SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) = SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT; SCTP_BASE_SYSCTL(sctp_do_drain) = SCTPCTL_DO_SCTP_DRAIN_DEFAULT; SCTP_BASE_SYSCTL(sctp_hb_maxburst) = SCTPCTL_HB_MAX_BURST_DEFAULT; SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit) = SCTPCTL_ABORT_AT_LIMIT_DEFAULT; - SCTP_BASE_SYSCTL(sctp_strict_data_order) = SCTPCTL_STRICT_DATA_ORDER_DEFAULT; SCTP_BASE_SYSCTL(sctp_min_residual) = SCTPCTL_MIN_RESIDUAL_DEFAULT; SCTP_BASE_SYSCTL(sctp_max_retran_chunk) = SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT; 
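
One small addition at the top of sctp_sysctl.c deserves a note: FEATURE(sctp, "Stream Control Transmission Protocol") publishes a kern.features.sctp sysctl, so userland can now probe for SCTP support instead of speculatively creating a socket. A short userland sketch, assuming a standard FreeBSD libc (this program is illustrative, not part of the patch):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/*
	 * feature_present(3) reads the kern.features.sctp node that the
	 * FEATURE() macro registers in the kernel.
	 */
	if (feature_present("sctp"))
		printf("kernel provides SCTP\n");
	else
		printf("SCTP is not compiled into this kernel\n");
	return (0);
}
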
SCTP_BASE_SYSCTL(sctp_logging_level) = SCTPCTL_LOGGING_LEVEL_DEFAULT; - /* JRS - Variable for default congestion control module */ SCTP_BASE_SYSCTL(sctp_default_cc_module) = SCTPCTL_DEFAULT_CC_MODULE_DEFAULT; - /* RS - Variable for default stream scheduling module */ SCTP_BASE_SYSCTL(sctp_default_ss_module) = SCTPCTL_DEFAULT_SS_MODULE_DEFAULT; SCTP_BASE_SYSCTL(sctp_default_frag_interleave) = SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT; SCTP_BASE_SYSCTL(sctp_mobility_base) = SCTPCTL_MOBILITY_BASE_DEFAULT; @@ -136,7 +137,7 @@ sctp_init_sysctls() /* It returns an upper limit. No filtering is done here */ static unsigned int -number_of_addresses(struct sctp_inpcb *inp) +sctp_sysctl_number_of_addresses(struct sctp_inpcb *inp) { unsigned int cnt; struct sctp_vrf *vrf; @@ -186,7 +187,7 @@ number_of_addresses(struct sctp_inpcb *inp) } static int -copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req) +sctp_sysctl_copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req) { struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; @@ -251,7 +252,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s if (ipv4_addr_legal) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) continue; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, @@ -270,7 +271,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) continue; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, @@ -280,15 +281,6 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; - if (sin6->sin6_scope_id == 0) { - /* - * bad link - * local - * address - */ - if (sa6_recoverscope(sin6) != 0) - continue; - } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) continue; @@ -352,7 +344,7 @@ copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct s * sysctl functions */ static int -sctp_assoclist(SYSCTL_HANDLER_ARGS) +sctp_sysctl_handle_assoclist(SYSCTL_HANDLER_ARGS) { unsigned int number_of_endpoints; unsigned int number_of_local_addresses; @@ -374,14 +366,14 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) number_of_remote_addresses = 0; SCTP_INP_INFO_RLOCK(); - if (req->oldptr == USER_ADDR_NULL) { + if (req->oldptr == NULL) { LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { SCTP_INP_RLOCK(inp); number_of_endpoints++; - number_of_local_addresses += number_of_addresses(inp); + number_of_local_addresses += sctp_sysctl_number_of_addresses(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { number_of_associations++; - number_of_local_addresses += number_of_addresses(inp); + number_of_local_addresses += sctp_sysctl_number_of_addresses(inp); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { number_of_remote_addresses++; } @@ -398,7 +390,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) req->oldidx = (n + n / 8); return (0); } - if (req->newptr != USER_ADDR_NULL) { + if (req->newptr != NULL) { SCTP_INP_INFO_RUNLOCK(); SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_SYSCTL, EPERM); return (EPERM); @@ -412,11 +404,12 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) xinpcb.last = 0; xinpcb.local_port = 
ntohs(inp->sctp_lport); xinpcb.flags = inp->sctp_flags; - xinpcb.features = (uint32_t) inp->sctp_features; + xinpcb.features = inp->sctp_features; xinpcb.total_sends = inp->total_sends; xinpcb.total_recvs = inp->total_recvs; xinpcb.total_nospaces = inp->total_nospaces; xinpcb.fragmentation_point = inp->sctp_frag_point; + xinpcb.socket = inp->sctp_socket; so = inp->sctp_socket; if ((so == NULL) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { @@ -424,7 +417,11 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) xinpcb.maxqlen = 0; } else { xinpcb.qlen = so->so_qlen; + xinpcb.qlen_old = so->so_qlen > USHRT_MAX ? + USHRT_MAX : (uint16_t) so->so_qlen; xinpcb.maxqlen = so->so_qlimit; + xinpcb.maxqlen_old = so->so_qlimit > USHRT_MAX ? + USHRT_MAX : (uint16_t) so->so_qlimit; } SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); @@ -436,7 +433,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); - error = copy_out_local_addresses(inp, NULL, req); + error = sctp_sysctl_copy_out_local_addresses(inp, NULL, req); if (error) { SCTP_INP_DECR_REF(inp); return (error); @@ -451,7 +448,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) if (stcb->asoc.primary_destination != NULL) xstcb.primary_addr = stcb->asoc.primary_destination->ro._l_addr; xstcb.heartbeat_interval = stcb->asoc.heart_beat_delay; - xstcb.state = SCTP_GET_STATE(&stcb->asoc); /* FIXME */ + xstcb.state = (uint32_t) sctp_map_assoc_state(stcb->asoc.state); /* 7.0 does not support these */ xstcb.assoc_id = sctp_get_associd(stcb); xstcb.peers_rwnd = stcb->asoc.peers_rwnd; @@ -487,7 +484,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) } SCTP_INP_INFO_RLOCK(); SCTP_INP_RLOCK(inp); - error = copy_out_local_addresses(inp, stcb, req); + error = sctp_sysctl_copy_out_local_addresses(inp, stcb, req); if (error) { SCTP_INP_DECR_REF(inp); atomic_subtract_int(&stcb->asoc.refcnt, 1); @@ -509,6 +506,7 @@ sctp_assoclist(SYSCTL_HANDLER_ARGS) xraddr.mtu = net->mtu; xraddr.rtt = net->rtt / 1000; xraddr.heartbeat_interval = net->heart_beat_delay; + xraddr.ssthresh = net->ssthresh; xraddr.start_time.tv_sec = (uint32_t) net->start_time.tv_sec; xraddr.start_time.tv_usec = (uint32_t) net->start_time.tv_usec; SCTP_INP_RUNLOCK(inp); @@ -555,153 +553,120 @@ skip: return (error); } - -#define RANGECHK(var, min, max) \ - if ((var) < (min)) { (var) = (min); } \ - else if ((var) > (max)) { (var) = (max); } - static int -sysctl_sctp_udp_tunneling_check(SYSCTL_HANDLER_ARGS) +sctp_sysctl_handle_udp_tunneling(SYSCTL_HANDLER_ARGS) { int error; - uint32_t old_sctp_udp_tunneling_port; + uint32_t old, new; SCTP_INP_INFO_RLOCK(); - old_sctp_udp_tunneling_port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port); + old = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port); SCTP_INP_INFO_RUNLOCK(); - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); - if (error == 0) { - RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), SCTPCTL_UDP_TUNNELING_PORT_MIN, SCTPCTL_UDP_TUNNELING_PORT_MAX); - if (old_sctp_udp_tunneling_port == SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) { - error = 0; - goto out; - } - SCTP_INP_INFO_WLOCK(); - if (old_sctp_udp_tunneling_port) { - sctp_over_udp_stop(); - } - if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) { - if (sctp_over_udp_start()) { - SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = 0; + new = old; + error = sysctl_handle_int(oidp, &new, 0, req); + if ((error == 0) && + (req->newptr != NULL)) { +#if (SCTPCTL_UDP_TUNNELING_PORT_MIN == 0) + if (new > SCTPCTL_UDP_TUNNELING_PORT_MAX) { +#else + if ((new < SCTPCTL_UDP_TUNNELING_PORT_MIN) || + (new > 
SCTPCTL_UDP_TUNNELING_PORT_MAX)) { +#endif + error = EINVAL; + } else { + SCTP_INP_INFO_WLOCK(); + SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = new; + if (old != 0) { + sctp_over_udp_stop(); } + if (new != 0) { + error = sctp_over_udp_start(); + } + SCTP_INP_INFO_WUNLOCK(); } - SCTP_INP_INFO_WUNLOCK(); } -out: return (error); } static int -sysctl_sctp_check(SYSCTL_HANDLER_ARGS) +sctp_sysctl_handle_auth(SYSCTL_HANDLER_ARGS) { int error; - -#ifdef VIMAGE - error = vnet_sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + uint32_t new; + + new = SCTP_BASE_SYSCTL(sctp_auth_enable); + error = sysctl_handle_int(oidp, &new, 0, req); + if ((error == 0) && + (req->newptr != NULL)) { +#if (SCTPCTL_AUTH_ENABLE_MIN == 0) + if ((new > SCTPCTL_AUTH_ENABLE_MAX) || + ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) { #else - error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if ((new < SCTPCTL_AUTH_ENABLE_MIN) || + (new > SCTPCTL_AUTH_ENABLE_MAX) || + ((new == 0) && (SCTP_BASE_SYSCTL(sctp_asconf_enable) == 1))) { #endif - if (error == 0) { - RANGECHK(SCTP_BASE_SYSCTL(sctp_sendspace), SCTPCTL_MAXDGRAM_MIN, SCTPCTL_MAXDGRAM_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_recvspace), SCTPCTL_RECVSPACE_MIN, SCTPCTL_RECVSPACE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_auto_asconf), SCTPCTL_AUTOASCONF_MIN, SCTPCTL_AUTOASCONF_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_enable), SCTPCTL_ECN_ENABLE_MIN, SCTPCTL_ECN_ENABLE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_sacks), SCTPCTL_STRICT_SACKS_MIN, SCTPCTL_STRICT_SACKS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), SCTPCTL_PEER_CHKOH_MIN, SCTPCTL_PEER_CHKOH_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_max_burst_default), SCTPCTL_MAXBURST_MIN, SCTPCTL_MAXBURST_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_fr_max_burst_default), SCTPCTL_FRMAXBURST_MIN, SCTPCTL_FRMAXBURST_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), SCTPCTL_MAXCHUNKS_MIN, SCTPCTL_MAXCHUNKS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_hashtblsize), SCTPCTL_TCBHASHSIZE_MIN, SCTPCTL_TCBHASHSIZE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_pcbtblsize), SCTPCTL_PCBHASHSIZE_MIN, SCTPCTL_PCBHASHSIZE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_min_split_point), SCTPCTL_MIN_SPLIT_POINT_MIN, SCTPCTL_MIN_SPLIT_POINT_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_chunkscale), SCTPCTL_CHUNKSCALE_MIN, SCTPCTL_CHUNKSCALE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), SCTPCTL_DELAYED_SACK_TIME_MIN, SCTPCTL_DELAYED_SACK_TIME_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_sack_freq_default), SCTPCTL_SACK_FREQ_MIN, SCTPCTL_SACK_FREQ_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), SCTPCTL_SYS_RESOURCE_MIN, SCTPCTL_SYS_RESOURCE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), SCTPCTL_ASOC_RESOURCE_MIN, SCTPCTL_ASOC_RESOURCE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), SCTPCTL_HEARTBEAT_INTERVAL_MIN, SCTPCTL_HEARTBEAT_INTERVAL_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), SCTPCTL_PMTU_RAISE_TIME_MIN, SCTPCTL_PMTU_RAISE_TIME_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), SCTPCTL_SHUTDOWN_GUARD_TIME_MIN, SCTPCTL_SHUTDOWN_GUARD_TIME_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), SCTPCTL_SECRET_LIFETIME_MIN, SCTPCTL_SECRET_LIFETIME_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_max_default), SCTPCTL_RTO_MAX_MIN, SCTPCTL_RTO_MAX_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_min_default), SCTPCTL_RTO_MIN_MIN, SCTPCTL_RTO_MIN_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_initial_default), 
SCTPCTL_RTO_INITIAL_MIN, SCTPCTL_RTO_INITIAL_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rto_max_default), SCTPCTL_INIT_RTO_MAX_MIN, SCTPCTL_INIT_RTO_MAX_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), SCTPCTL_VALID_COOKIE_LIFE_MIN, SCTPCTL_VALID_COOKIE_LIFE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), SCTPCTL_INIT_RTX_MAX_MIN, SCTPCTL_INIT_RTX_MAX_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), SCTPCTL_ASSOC_RTX_MAX_MIN, SCTPCTL_ASSOC_RTX_MAX_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_path_pf_threshold), SCTPCTL_PATH_PF_THRESHOLD_MIN, SCTPCTL_PATH_PF_THRESHOLD_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), SCTPCTL_INCOMING_STREAMS_MIN, SCTPCTL_INCOMING_STREAMS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX); - /* EY */ - RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), SCTPCTL_NR_SACK_ON_OFF_MIN, SCTPCTL_NR_SACK_ON_OFF_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_use_dac), SCTPCTL_CMT_USE_DAC_MIN, SCTPCTL_CMT_USE_DAC_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), SCTPCTL_CWND_MAXBURST_MIN, SCTPCTL_CWND_MAXBURST_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), SCTPCTL_ASCONF_AUTH_NOCHK_MIN, SCTPCTL_ASCONF_AUTH_NOCHK_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_auth_disable), SCTPCTL_AUTH_DISABLE_MIN, SCTPCTL_AUTH_DISABLE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_nat_friendly), SCTPCTL_NAT_FRIENDLY_MIN, SCTPCTL_NAT_FRIENDLY_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_L2_abc_variable), SCTPCTL_ABC_L_VAR_MIN, SCTPCTL_ABC_L_VAR_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), SCTPCTL_MAX_CHAINED_MBUFS_MIN, SCTPCTL_MAX_CHAINED_MBUFS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_do_drain), SCTPCTL_DO_SCTP_DRAIN_MIN, SCTPCTL_DO_SCTP_DRAIN_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_hb_maxburst), SCTPCTL_HB_MAX_BURST_MIN, SCTPCTL_HB_MAX_BURST_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), SCTPCTL_ABORT_AT_LIMIT_MIN, SCTPCTL_ABORT_AT_LIMIT_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_data_order), SCTPCTL_STRICT_DATA_ORDER_MIN, SCTPCTL_STRICT_DATA_ORDER_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_min_residual), SCTPCTL_MIN_RESIDUAL_MIN, SCTPCTL_MIN_RESIDUAL_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_max_retran_chunk), SCTPCTL_MAX_RETRAN_CHUNK_MIN, SCTPCTL_MAX_RETRAN_CHUNK_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_logging_level), SCTPCTL_LOGGING_LEVEL_MIN, SCTPCTL_LOGGING_LEVEL_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_default_cc_module), SCTPCTL_DEFAULT_CC_MODULE_MIN, SCTPCTL_DEFAULT_CC_MODULE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_default_ss_module), SCTPCTL_DEFAULT_SS_MODULE_MIN, SCTPCTL_DEFAULT_SS_MODULE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_default_frag_interleave), SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN, SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_vtag_time_wait), SCTPCTL_TIME_WAIT_MIN, SCTPCTL_TIME_WAIT_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_buffer_splitting), SCTPCTL_BUFFER_SPLITTING_MIN, SCTPCTL_BUFFER_SPLITTING_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_initial_cwnd), SCTPCTL_INITIAL_CWND_MIN, SCTPCTL_INITIAL_CWND_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_bw), 
SCTPCTL_RTTVAR_BW_MIN, SCTPCTL_RTTVAR_BW_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_rtt), SCTPCTL_RTTVAR_RTT_MIN, SCTPCTL_RTTVAR_RTT_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_rttvar_eqret), SCTPCTL_RTTVAR_EQRET_MIN, SCTPCTL_RTTVAR_EQRET_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_steady_step), SCTPCTL_RTTVAR_STEADYS_MIN, SCTPCTL_RTTVAR_STEADYS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_use_dccc_ecn), SCTPCTL_RTTVAR_DCCCECN_MIN, SCTPCTL_RTTVAR_DCCCECN_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_base), SCTPCTL_MOBILITY_BASE_MIN, SCTPCTL_MOBILITY_BASE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), SCTPCTL_MOBILITY_FASTHANDOFF_MIN, SCTPCTL_MOBILITY_FASTHANDOFF_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN, SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), SCTPCTL_NAT_FRIENDLY_INITS_MIN, SCTPCTL_NAT_FRIENDLY_INITS_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_blackhole), SCTPCTL_BLACKHOLE_MIN, SCTPCTL_BLACKHOLE_MAX); - RANGECHK(SCTP_BASE_SYSCTL(sctp_diag_info_code), SCTPCTL_DIAG_INFO_CODE_MIN, SCTPCTL_DIAG_INFO_CODE_MAX); + error = EINVAL; + } else { + SCTP_BASE_SYSCTL(sctp_auth_enable) = new; + } + } + return (error); +} -#ifdef SCTP_DEBUG - RANGECHK(SCTP_BASE_SYSCTL(sctp_debug_on), SCTPCTL_DEBUG_MIN, SCTPCTL_DEBUG_MAX); -#endif -#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) - RANGECHK(SCTP_BASE_SYSCTL(sctp_output_unlocked), SCTPCTL_OUTPUT_UNLOCKED_MIN, SCTPCTL_OUTPUT_UNLOCKED_MAX); +static int +sctp_sysctl_handle_asconf(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = SCTP_BASE_SYSCTL(sctp_asconf_enable); + error = sysctl_handle_int(oidp, &new, 0, req); + if ((error == 0) && + (req->newptr != NULL)) { +#if (SCTPCTL_ASCONF_ENABLE_MIN == 0) + if ((new > SCTPCTL_ASCONF_ENABLE_MAX) || + ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) { +#else + if ((new < SCTPCTL_ASCONF_ENABLE_MIN) || + (new > SCTPCTL_ASCONF_ENABLE_MAX) || + ((new == 1) && (SCTP_BASE_SYSCTL(sctp_auth_enable) == 0))) { #endif + error = EINVAL; + } else { + SCTP_BASE_SYSCTL(sctp_asconf_enable) = new; + } } return (error); } -#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) static int -sysctl_stat_get(SYSCTL_HANDLER_ARGS) +sctp_sysctl_handle_stats(SYSCTL_HANDLER_ARGS) { - int cpu, error; - struct sctpstat sb, sb_temp, *sarry, *cpin = NULL; + int error; - if ((req->newptr) && (req->newlen == sizeof(struct sctpstat))) { - /* - * User wants us to clear or at least reset the counters to - * the specified values. 
- */ - cpin = &sb_temp; - memset(&sb_temp, 0, sizeof(sb_temp)); - error = SYSCTL_IN(req, &sb_temp, sizeof(sb_temp)); - if (error != 0) - return (error); - } else if (req->newptr) { - /* Must be a stat structure */ +#if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) + struct sctpstat *sarry; + struct sctpstat sb; + int cpu; + +#endif + struct sctpstat sb_temp; + + if ((req->newptr != NULL) && + (req->newlen != sizeof(struct sctpstat))) { return (EINVAL); } + memset(&sb_temp, 0, sizeof(struct sctpstat)); + + if (req->newptr != NULL) { + error = SYSCTL_IN(req, &sb_temp, sizeof(struct sctpstat)); + if (error != 0) { + return (error); + } + } +#if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) memset(&sb, 0, sizeof(sb)); for (cpu = 0; cpu < mp_maxid; cpu++) { sarry = &SCTP_BASE_STATS[cpu]; @@ -830,19 +795,35 @@ sysctl_stat_get(SYSCTL_HANDLER_ARGS) sb.sctps_send_burst_avoid += sarry->sctps_send_burst_avoid; sb.sctps_send_cwnd_avoid += sarry->sctps_send_cwnd_avoid; sb.sctps_fwdtsn_map_over += sarry->sctps_fwdtsn_map_over; - if (cpin) { - memcpy(sarry, cpin, sizeof(struct sctpstat)); + if (req->newptr != NULL) { + memcpy(sarry, &sb_temp, sizeof(struct sctpstat)); } } - error = SYSCTL_OUT(req, &sb, sizeof(sb)); + error = SYSCTL_OUT(req, &sb, sizeof(struct sctpstat)); +#else + error = SYSCTL_OUT(req, &SCTP_BASE_STATS, sizeof(struct sctpstat)); + if (error != 0) { + return (error); + } + if (req->newptr != NULL) { + memcpy(&SCTP_BASE_STATS, &sb_temp, sizeof(struct sctpstat)); + } +#endif return (error); } -#endif - #if defined(SCTP_LOCAL_TRACE_BUF) static int -sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS) +sctp_sysctl_handle_trace_log(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = SYSCTL_OUT(req, &SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log)); + return (error); +} + +static int +sctp_sysctl_handle_trace_log_clear(SYSCTL_HANDLER_ARGS) { int error = 0; @@ -852,314 +833,117 @@ sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS) #endif +#define SCTP_UINT_SYSCTL(mib_name, var_name, prefix) \ + static int \ + sctp_sysctl_handle_##mib_name(SYSCTL_HANDLER_ARGS) \ + { \ + int error; \ + uint32_t new; \ + \ + new = SCTP_BASE_SYSCTL(var_name); \ + error = sysctl_handle_int(oidp, &new, 0, req); \ + if ((error == 0) && (req->newptr != NULL)) { \ + if ((new < prefix##_MIN) || \ + (new > prefix##_MAX)) { \ + error = EINVAL; \ + } else { \ + SCTP_BASE_SYSCTL(var_name) = new; \ + } \ + } \ + return (error); \ + } \ + SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mib_name, \ + CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, NULL, 0, \ + sctp_sysctl_handle_##mib_name, "UI", prefix##_DESC); /* * sysctl definitions */ -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sendspace, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_sendspace), 0, sysctl_sctp_check, "IU", - SCTPCTL_MAXDGRAM_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, recvspace, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_recvspace), 0, sysctl_sctp_check, "IU", - SCTPCTL_RECVSPACE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, auto_asconf, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_auto_asconf), 0, sysctl_sctp_check, "IU", - SCTPCTL_AUTOASCONF_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, ecn_enable, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_ecn_enable), 0, sysctl_sctp_check, "IU", - SCTPCTL_ECN_ENABLE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, strict_sacks, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_strict_sacks), 0, sysctl_sctp_check, "IU", - SCTPCTL_STRICT_SACKS_DESC); - - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, 
peer_chkoh, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), 0, sysctl_sctp_check, "IU", - SCTPCTL_PEER_CHKOH_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, maxburst, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_max_burst_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_MAXBURST_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, fr_maxburst, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_fr_max_burst_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_FRMAXBURST_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, maxchunks, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), 0, sysctl_sctp_check, "IU", - SCTPCTL_MAXCHUNKS_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, tcbhashsize, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_hashtblsize), 0, sysctl_sctp_check, "IU", - SCTPCTL_TCBHASHSIZE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, pcbhashsize, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_pcbtblsize), 0, sysctl_sctp_check, "IU", - SCTPCTL_PCBHASHSIZE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, min_split_point, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_min_split_point), 0, sysctl_sctp_check, "IU", - SCTPCTL_MIN_SPLIT_POINT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, chunkscale, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_chunkscale), 0, sysctl_sctp_check, "IU", - SCTPCTL_CHUNKSCALE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, delayed_sack_time, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_DELAYED_SACK_TIME_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sack_freq, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_sack_freq_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_SACK_FREQ_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, sys_resource, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), 0, sysctl_sctp_check, "IU", - SCTPCTL_SYS_RESOURCE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, asoc_resource, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), 0, sysctl_sctp_check, "IU", - SCTPCTL_ASOC_RESOURCE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, heartbeat_interval, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_HEARTBEAT_INTERVAL_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, pmtu_raise_time, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_PMTU_RAISE_TIME_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, shutdown_guard_time, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_SHUTDOWN_GUARD_TIME_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, secret_lifetime, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_SECRET_LIFETIME_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_max, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rto_max_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTO_MAX_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_min, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rto_min_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTO_MIN_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rto_initial, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rto_initial_default), 0, sysctl_sctp_check, 
"IU", - SCTPCTL_RTO_INITIAL_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, init_rto_max, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_init_rto_max_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_INIT_RTO_MAX_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, valid_cookie_life, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_VALID_COOKIE_LIFE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, init_rtx_max, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_INIT_RTX_MAX_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, assoc_rtx_max, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_ASSOC_RTX_MAX_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, path_rtx_max, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_PATH_RTX_MAX_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, path_pf_threshold, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_path_pf_threshold), 0, sysctl_sctp_check, "IU", - SCTPCTL_PATH_PF_THRESHOLD_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU", - SCTPCTL_ADD_MORE_ON_OUTPUT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, incoming_streams, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_INCOMING_STREAMS_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU", - SCTPCTL_OUTGOING_STREAMS_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cmt_on_off, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_cmt_on_off), 0, sysctl_sctp_check, "IU", - SCTPCTL_CMT_ON_OFF_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nr_sack_on_off, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), 0, sysctl_sctp_check, "IU", - SCTPCTL_NR_SACK_ON_OFF_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cmt_use_dac, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_cmt_use_dac), 0, sysctl_sctp_check, "IU", - SCTPCTL_CMT_USE_DAC_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, cwnd_maxburst, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), 0, sysctl_sctp_check, "IU", - SCTPCTL_CWND_MAXBURST_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, asconf_auth_nochk, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), 0, sysctl_sctp_check, "IU", - SCTPCTL_ASCONF_AUTH_NOCHK_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, auth_disable, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_auth_disable), 0, sysctl_sctp_check, "IU", - SCTPCTL_AUTH_DISABLE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nat_friendly, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_nat_friendly), 0, sysctl_sctp_check, "IU", - SCTPCTL_NAT_FRIENDLY_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, abc_l_var, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_L2_abc_variable), 0, sysctl_sctp_check, "IU", - SCTPCTL_ABC_L_VAR_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, max_chained_mbufs, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), 0, sysctl_sctp_check, "IU", - SCTPCTL_MAX_CHAINED_MBUFS_DESC); - 
-SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, do_sctp_drain, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_do_drain), 0, sysctl_sctp_check, "IU", - SCTPCTL_DO_SCTP_DRAIN_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, hb_max_burst, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_hb_maxburst), 0, sysctl_sctp_check, "IU", - SCTPCTL_HB_MAX_BURST_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, abort_at_limit, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), 0, sysctl_sctp_check, "IU", - SCTPCTL_ABORT_AT_LIMIT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, strict_data_order, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_strict_data_order), 0, sysctl_sctp_check, "IU", - SCTPCTL_STRICT_DATA_ORDER_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, min_residual, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_min_residual), 0, sysctl_sctp_check, "IU", - SCTPCTL_MIN_RESIDUAL_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, max_retran_chunk, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_max_retran_chunk), 0, sysctl_sctp_check, "IU", - SCTPCTL_MAX_RETRAN_CHUNK_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_logging_level), 0, sysctl_sctp_check, "IU", - SCTPCTL_LOGGING_LEVEL_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_cc_module, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_default_cc_module), 0, sysctl_sctp_check, "IU", - SCTPCTL_DEFAULT_CC_MODULE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_ss_module, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_default_ss_module), 0, sysctl_sctp_check, "IU", - SCTPCTL_DEFAULT_SS_MODULE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, default_frag_interleave, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_default_frag_interleave), 0, sysctl_sctp_check, "IU", - SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, mobility_base, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_mobility_base), 0, sysctl_sctp_check, "IU", - SCTPCTL_MOBILITY_BASE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, mobility_fasthandoff, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), 0, sysctl_sctp_check, "IU", - SCTPCTL_MOBILITY_FASTHANDOFF_DESC); - +SCTP_UINT_SYSCTL(sendspace, sctp_sendspace, SCTPCTL_MAXDGRAM) +SCTP_UINT_SYSCTL(recvspace, sctp_recvspace, SCTPCTL_RECVSPACE) +SCTP_UINT_SYSCTL(auto_asconf, sctp_auto_asconf, SCTPCTL_AUTOASCONF) +SCTP_UINT_SYSCTL(ecn_enable, sctp_ecn_enable, SCTPCTL_ECN_ENABLE) +SCTP_UINT_SYSCTL(pr_enable, sctp_pr_enable, SCTPCTL_PR_ENABLE) +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auth_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + NULL, 0, sctp_sysctl_handle_auth, "IU", SCTPCTL_AUTH_ENABLE_DESC); +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asconf_enable, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + NULL, 0, sctp_sysctl_handle_asconf, "IU", SCTPCTL_ASCONF_ENABLE_DESC); +SCTP_UINT_SYSCTL(reconfig_enable, sctp_reconfig_enable, SCTPCTL_RECONFIG_ENABLE) +SCTP_UINT_SYSCTL(nrsack_enable, sctp_nrsack_enable, SCTPCTL_NRSACK_ENABLE) +SCTP_UINT_SYSCTL(pktdrop_enable, sctp_pktdrop_enable, SCTPCTL_PKTDROP_ENABLE) +SCTP_UINT_SYSCTL(peer_chkoh, sctp_peer_chunk_oh, SCTPCTL_PEER_CHKOH) +SCTP_UINT_SYSCTL(maxburst, sctp_max_burst_default, SCTPCTL_MAXBURST) +SCTP_UINT_SYSCTL(fr_maxburst, sctp_fr_max_burst_default, SCTPCTL_FRMAXBURST) +SCTP_UINT_SYSCTL(maxchunks, sctp_max_chunks_on_queue, SCTPCTL_MAXCHUNKS) 
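The SCTP_UINT_SYSCTL invocations above and below replace the long run of SYSCTL_VNET_PROC registrations that all funneled into the removed sysctl_sctp_check handler. The behavioral change is worth spelling out: the old RANGECHK macro silently clamped an out-of-range write into bounds, while each generated sctp_sysctl_handle_* handler validates a local copy first and rejects bad input with EINVAL, leaving the tunable untouched. The userland sketch below imitates the token-pasting pattern of the macro; every demo_* name is a hypothetical stand-in, not part of the patch.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical bounds, standing in for a SCTPCTL_*_MIN/_MAX pair from
 * sctp_sysctl.h; the real macro reaches them via prefix##_MIN pasting.
 */
#define DEMO_MAXCHUNKS_MIN	1
#define DEMO_MAXCHUNKS_MAX	65535

static uint32_t demo_max_chunks_on_queue = 512;

/* Old policy: silently force the value into range, as RANGECHK did. */
static uint32_t
demo_clamp(uint32_t val, uint32_t min, uint32_t max)
{
	return (val < min ? min : (val > max ? max : val));
}

/*
 * Cut-down analogue of SCTP_UINT_SYSCTL: token-paste a handler per MIB
 * name that validates a local copy and only then commits it, matching
 * the new reject-with-EINVAL behavior.
 */
#define DEMO_UINT_SYSCTL(mib_name, var_name, prefix)		\
	static int						\
	demo_handle_##mib_name(uint32_t new)			\
	{							\
		if ((new < prefix##_MIN) ||			\
		    (new > prefix##_MAX))			\
			return (EINVAL);			\
		var_name = new;					\
		return (0);					\
	}

DEMO_UINT_SYSCTL(maxchunks, demo_max_chunks_on_queue, DEMO_MAXCHUNKS)

int
main(void)
{
	printf("clamped write: %u\n", demo_clamp(0, DEMO_MAXCHUNKS_MIN,
	    DEMO_MAXCHUNKS_MAX));		/* old: 0 becomes 1 */
	printf("validated write: %d\n", demo_handle_maxchunks(0));
						/* new: EINVAL, var untouched */
	printf("value still: %u\n", demo_max_chunks_on_queue);
	return (0);
}

The prefix##_MIN comparison also explains the #if (..._MIN == 0) guards in the hand-written handlers earlier in this file: when the minimum is zero, the unsigned lower-bound test would be always false (and compilers warn about it), so that branch of the check is compiled out.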
+SCTP_UINT_SYSCTL(tcbhashsize, sctp_hashtblsize, SCTPCTL_TCBHASHSIZE) +SCTP_UINT_SYSCTL(pcbhashsize, sctp_pcbtblsize, SCTPCTL_PCBHASHSIZE) +SCTP_UINT_SYSCTL(min_split_point, sctp_min_split_point, SCTPCTL_MIN_SPLIT_POINT) +SCTP_UINT_SYSCTL(chunkscale, sctp_chunkscale, SCTPCTL_CHUNKSCALE) +SCTP_UINT_SYSCTL(delayed_sack_time, sctp_delayed_sack_time_default, SCTPCTL_DELAYED_SACK_TIME) +SCTP_UINT_SYSCTL(sack_freq, sctp_sack_freq_default, SCTPCTL_SACK_FREQ) +SCTP_UINT_SYSCTL(sys_resource, sctp_system_free_resc_limit, SCTPCTL_SYS_RESOURCE) +SCTP_UINT_SYSCTL(asoc_resource, sctp_asoc_free_resc_limit, SCTPCTL_ASOC_RESOURCE) +SCTP_UINT_SYSCTL(heartbeat_interval, sctp_heartbeat_interval_default, SCTPCTL_HEARTBEAT_INTERVAL) +SCTP_UINT_SYSCTL(pmtu_raise_time, sctp_pmtu_raise_time_default, SCTPCTL_PMTU_RAISE_TIME) +SCTP_UINT_SYSCTL(shutdown_guard_time, sctp_shutdown_guard_time_default, SCTPCTL_SHUTDOWN_GUARD_TIME) +SCTP_UINT_SYSCTL(secret_lifetime, sctp_secret_lifetime_default, SCTPCTL_SECRET_LIFETIME) +SCTP_UINT_SYSCTL(rto_max, sctp_rto_max_default, SCTPCTL_RTO_MAX) +SCTP_UINT_SYSCTL(rto_min, sctp_rto_min_default, SCTPCTL_RTO_MIN) +SCTP_UINT_SYSCTL(rto_initial, sctp_rto_initial_default, SCTPCTL_RTO_INITIAL) +SCTP_UINT_SYSCTL(init_rto_max, sctp_init_rto_max_default, SCTPCTL_INIT_RTO_MAX) +SCTP_UINT_SYSCTL(valid_cookie_life, sctp_valid_cookie_life_default, SCTPCTL_VALID_COOKIE_LIFE) +SCTP_UINT_SYSCTL(init_rtx_max, sctp_init_rtx_max_default, SCTPCTL_INIT_RTX_MAX) +SCTP_UINT_SYSCTL(assoc_rtx_max, sctp_assoc_rtx_max_default, SCTPCTL_ASSOC_RTX_MAX) +SCTP_UINT_SYSCTL(path_rtx_max, sctp_path_rtx_max_default, SCTPCTL_PATH_RTX_MAX) +SCTP_UINT_SYSCTL(path_pf_threshold, sctp_path_pf_threshold, SCTPCTL_PATH_PF_THRESHOLD) +SCTP_UINT_SYSCTL(add_more_on_output, sctp_add_more_threshold, SCTPCTL_ADD_MORE_ON_OUTPUT) +SCTP_UINT_SYSCTL(incoming_streams, sctp_nr_incoming_streams_default, SCTPCTL_INCOMING_STREAMS) +SCTP_UINT_SYSCTL(outgoing_streams, sctp_nr_outgoing_streams_default, SCTPCTL_OUTGOING_STREAMS) +SCTP_UINT_SYSCTL(cmt_on_off, sctp_cmt_on_off, SCTPCTL_CMT_ON_OFF) +SCTP_UINT_SYSCTL(cmt_use_dac, sctp_cmt_use_dac, SCTPCTL_CMT_USE_DAC) +SCTP_UINT_SYSCTL(cwnd_maxburst, sctp_use_cwnd_based_maxburst, SCTPCTL_CWND_MAXBURST) +SCTP_UINT_SYSCTL(nat_friendly, sctp_nat_friendly, SCTPCTL_NAT_FRIENDLY) +SCTP_UINT_SYSCTL(abc_l_var, sctp_L2_abc_variable, SCTPCTL_ABC_L_VAR) +SCTP_UINT_SYSCTL(max_chained_mbufs, sctp_mbuf_threshold_count, SCTPCTL_MAX_CHAINED_MBUFS) +SCTP_UINT_SYSCTL(do_sctp_drain, sctp_do_drain, SCTPCTL_DO_SCTP_DRAIN) +SCTP_UINT_SYSCTL(hb_max_burst, sctp_hb_maxburst, SCTPCTL_HB_MAX_BURST) +SCTP_UINT_SYSCTL(abort_at_limit, sctp_abort_if_one_2_one_hits_limit, SCTPCTL_ABORT_AT_LIMIT) +SCTP_UINT_SYSCTL(min_residual, sctp_min_residual, SCTPCTL_MIN_RESIDUAL) +SCTP_UINT_SYSCTL(max_retran_chunk, sctp_max_retran_chunk, SCTPCTL_MAX_RETRAN_CHUNK) +SCTP_UINT_SYSCTL(log_level, sctp_logging_level, SCTPCTL_LOGGING_LEVEL) +SCTP_UINT_SYSCTL(default_cc_module, sctp_default_cc_module, SCTPCTL_DEFAULT_CC_MODULE) +SCTP_UINT_SYSCTL(default_ss_module, sctp_default_ss_module, SCTPCTL_DEFAULT_SS_MODULE) +SCTP_UINT_SYSCTL(default_frag_interleave, sctp_default_frag_interleave, SCTPCTL_DEFAULT_FRAG_INTERLEAVE) +SCTP_UINT_SYSCTL(mobility_base, sctp_mobility_base, SCTPCTL_MOBILITY_BASE) +SCTP_UINT_SYSCTL(mobility_fasthandoff, sctp_mobility_fasthandoff, SCTPCTL_MOBILITY_FASTHANDOFF) #if defined(SCTP_LOCAL_TRACE_BUF) -SYSCTL_VNET_STRUCT(_net_inet_sctp, OID_AUTO, log, CTLFLAG_RD, - &SCTP_BASE_SYSCTL(sctp_log), sctp_log, - "SCTP logging (struct 
sctp_log)"); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_log), 0, sysctl_sctp_cleartrace, "IU", - "Clear SCTP Logging buffer"); +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, log, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RD, + NULL, 0, sctp_sysctl_handle_trace_log, "S,sctplog", "SCTP logging (struct sctp_log)"); +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + NULL, 0, sctp_sysctl_handle_trace_log_clear, "IU", "Clear SCTP Logging buffer"); #endif - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), 0, sysctl_sctp_udp_tunneling_check, "IU", - SCTPCTL_UDP_TUNNELING_PORT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, enable_sack_immediately, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), 0, sysctl_sctp_check, "IU", - SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, nat_friendly_init, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), 0, sysctl_sctp_check, "IU", - SCTPCTL_NAT_FRIENDLY_INITS_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, vtag_time_wait, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_vtag_time_wait), 0, sysctl_sctp_check, "IU", - SCTPCTL_TIME_WAIT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, buffer_splitting, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_buffer_splitting), 0, sysctl_sctp_check, "IU", - SCTPCTL_BUFFER_SPLITTING_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, initial_cwnd, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_initial_cwnd), 0, sysctl_sctp_check, "IU", - SCTPCTL_INITIAL_CWND_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_bw, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rttvar_bw), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTTVAR_BW_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_rtt, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rttvar_rtt), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTTVAR_RTT_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_eqret, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_rttvar_eqret), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTTVAR_EQRET_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, rttvar_steady_step, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_steady_step), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTTVAR_STEADYS_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, use_dcccecn, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_use_dccc_ecn), 0, sysctl_sctp_check, "IU", - SCTPCTL_RTTVAR_DCCCECN_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, blackhole, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_blackhole), 0, sysctl_sctp_check, "IU", - SCTPCTL_BLACKHOLE_DESC); - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, diag_info_code, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_diag_info_code), 0, sysctl_sctp_check, "IU", - SCTPCTL_DIAG_INFO_CODE_DESC); - +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, + NULL, 0, sctp_sysctl_handle_udp_tunneling, "IU", SCTPCTL_UDP_TUNNELING_PORT_DESC); +SCTP_UINT_SYSCTL(enable_sack_immediately, sctp_enable_sack_immediately, SCTPCTL_SACK_IMMEDIATELY_ENABLE) +SCTP_UINT_SYSCTL(nat_friendly_init, sctp_inits_include_nat_friendly, SCTPCTL_NAT_FRIENDLY_INITS) +SCTP_UINT_SYSCTL(vtag_time_wait, sctp_vtag_time_wait, SCTPCTL_TIME_WAIT) 
+SCTP_UINT_SYSCTL(buffer_splitting, sctp_buffer_splitting, SCTPCTL_BUFFER_SPLITTING) +SCTP_UINT_SYSCTL(initial_cwnd, sctp_initial_cwnd, SCTPCTL_INITIAL_CWND) +SCTP_UINT_SYSCTL(rttvar_bw, sctp_rttvar_bw, SCTPCTL_RTTVAR_BW) +SCTP_UINT_SYSCTL(rttvar_rtt, sctp_rttvar_rtt, SCTPCTL_RTTVAR_RTT) +SCTP_UINT_SYSCTL(rttvar_eqret, sctp_rttvar_eqret, SCTPCTL_RTTVAR_EQRET) +SCTP_UINT_SYSCTL(rttvar_steady_step, sctp_steady_step, SCTPCTL_RTTVAR_STEADYS) +SCTP_UINT_SYSCTL(use_dcccecn, sctp_use_dccc_ecn, SCTPCTL_RTTVAR_DCCCECN) +SCTP_UINT_SYSCTL(blackhole, sctp_blackhole, SCTPCTL_BLACKHOLE) +SCTP_UINT_SYSCTL(diag_info_code, sctp_diag_info_code, SCTPCTL_DIAG_INFO_CODE) #ifdef SCTP_DEBUG -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, debug, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_debug_on), 0, sysctl_sctp_check, "IU", - SCTPCTL_DEBUG_DESC); +SCTP_UINT_SYSCTL(debug, sctp_debug_on, SCTPCTL_DEBUG) #endif - - #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, output_unlocked, CTLTYPE_UINT | CTLFLAG_RW, - &SCTP_BASE_SYSCTL(sctp_output_unlocked), 0, sysctl_sctp_check, "IU", - SCTPCTL_OUTPUT_UNLOCKED_DESC); +SCTP_UINT_SYSCTL(output_unlocked, sctp_output_unlocked, SCTPCTL_OUTPUT_UNLOCKED) #endif - -#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, stats, - CTLTYPE_STRUCT | CTLFLAG_RW, - 0, 0, sysctl_stat_get, "S,sctpstat", - "SCTP statistics (struct sctp_stat)"); -#else -SYSCTL_VNET_STRUCT(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_RW, - &SCTP_BASE_STATS_SYSCTL, sctpstat, - "SCTP statistics (struct sctp_stat)"); -#endif - -SYSCTL_VNET_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLTYPE_OPAQUE | CTLFLAG_RD, - 0, 0, sctp_assoclist, - "S,xassoc", "List of active SCTP associations"); +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RW, + NULL, 0, sctp_sysctl_handle_stats, "S,sctpstat", "SCTP statistics (struct sctp_stat)"); +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLFLAG_VNET | CTLTYPE_OPAQUE | CTLFLAG_RD, + NULL, 0, sctp_sysctl_handle_assoclist, "S,xassoc", "List of active SCTP associations"); diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h index 432d36a4..959bd1e4 100644 --- a/freebsd/sys/netinet/sctp_sysctl.h +++ b/freebsd/sys/netinet/sctp_sysctl.h @@ -45,8 +45,13 @@ struct sctp_sysctl { uint32_t sctp_auto_asconf; uint32_t sctp_multiple_asconfs; uint32_t sctp_ecn_enable; + uint32_t sctp_pr_enable; + uint32_t sctp_auth_enable; + uint32_t sctp_asconf_enable; + uint32_t sctp_reconfig_enable; + uint32_t sctp_nrsack_enable; + uint32_t sctp_pktdrop_enable; uint32_t sctp_fr_max_burst_default; - uint32_t sctp_strict_sacks; uint32_t sctp_peer_chunk_oh; uint32_t sctp_max_burst_default; uint32_t sctp_max_chunks_on_queue; @@ -76,18 +81,13 @@ struct sctp_sysctl { uint32_t sctp_nr_outgoing_streams_default; uint32_t sctp_cmt_on_off; uint32_t sctp_cmt_use_dac; - /* EY 5/5/08 - nr_sack flag variable */ - uint32_t sctp_nr_sack_on_off; uint32_t sctp_use_cwnd_based_maxburst; - uint32_t sctp_asconf_auth_nochk; - uint32_t sctp_auth_disable; uint32_t sctp_nat_friendly; uint32_t sctp_L2_abc_variable; uint32_t sctp_mbuf_threshold_count; uint32_t sctp_do_drain; uint32_t sctp_hb_maxburst; uint32_t sctp_abort_if_one_2_one_hits_limit; - uint32_t sctp_strict_data_order; uint32_t sctp_min_residual; uint32_t sctp_max_retran_chunk; uint32_t sctp_logging_level; @@ -141,7 +141,7 @@ struct sctp_sysctl { #define SCTPCTL_AUTOASCONF_DESC "Enable SCTP 
Auto-ASCONF" #define SCTPCTL_AUTOASCONF_MIN 0 #define SCTPCTL_AUTOASCONF_MAX 1 -#define SCTPCTL_AUTOASCONF_DEFAULT SCTP_DEFAULT_AUTO_ASCONF +#define SCTPCTL_AUTOASCONF_DEFAULT 1 /* autoasconf: Enable SCTP Auto-ASCONF */ #define SCTPCTL_MULTIPLEASCONFS_DESC "Enable SCTP Muliple-ASCONFs" @@ -155,11 +155,41 @@ struct sctp_sysctl { #define SCTPCTL_ECN_ENABLE_MAX 1 #define SCTPCTL_ECN_ENABLE_DEFAULT 1 -/* strict_sacks: Enable SCTP Strict SACK checking */ -#define SCTPCTL_STRICT_SACKS_DESC "Enable SCTP Strict SACK checking" -#define SCTPCTL_STRICT_SACKS_MIN 0 -#define SCTPCTL_STRICT_SACKS_MAX 1 -#define SCTPCTL_STRICT_SACKS_DEFAULT 1 +/* pr_enable: Enable PR-SCTP */ +#define SCTPCTL_PR_ENABLE_DESC "Enable PR-SCTP" +#define SCTPCTL_PR_ENABLE_MIN 0 +#define SCTPCTL_PR_ENABLE_MAX 1 +#define SCTPCTL_PR_ENABLE_DEFAULT 1 + +/* auth_enable: Enable SCTP AUTH function */ +#define SCTPCTL_AUTH_ENABLE_DESC "Enable SCTP AUTH function" +#define SCTPCTL_AUTH_ENABLE_MIN 0 +#define SCTPCTL_AUTH_ENABLE_MAX 1 +#define SCTPCTL_AUTH_ENABLE_DEFAULT 1 + +/* asconf_enable: Enable SCTP ASCONF */ +#define SCTPCTL_ASCONF_ENABLE_DESC "Enable SCTP ASCONF" +#define SCTPCTL_ASCONF_ENABLE_MIN 0 +#define SCTPCTL_ASCONF_ENABLE_MAX 1 +#define SCTPCTL_ASCONF_ENABLE_DEFAULT 1 + +/* reconfig_enable: Enable SCTP RE-CONFIG */ +#define SCTPCTL_RECONFIG_ENABLE_DESC "Enable SCTP RE-CONFIG" +#define SCTPCTL_RECONFIG_ENABLE_MIN 0 +#define SCTPCTL_RECONFIG_ENABLE_MAX 1 +#define SCTPCTL_RECONFIG_ENABLE_DEFAULT 1 + +/* nrsack_enable: Enable NR_SACK */ +#define SCTPCTL_NRSACK_ENABLE_DESC "Enable SCTP NR-SACK" +#define SCTPCTL_NRSACK_ENABLE_MIN 0 +#define SCTPCTL_NRSACK_ENABLE_MAX 1 +#define SCTPCTL_NRSACK_ENABLE_DEFAULT 0 + +/* pktdrop_enable: Enable SCTP Packet Drop Reports */ +#define SCTPCTL_PKTDROP_ENABLE_DESC "Enable SCTP PKTDROP" +#define SCTPCTL_PKTDROP_ENABLE_MIN 0 +#define SCTPCTL_PKTDROP_ENABLE_MAX 1 +#define SCTPCTL_PKTDROP_ENABLE_DEFAULT 0 /* loopback_nocsum: Enable NO Csum on packets sent on loopback */ #define SCTPCTL_LOOPBACK_NOCSUM_DESC "Enable NO Csum on packets sent on loopback" @@ -253,10 +283,10 @@ struct sctp_sysctl { #define SCTPCTL_PMTU_RAISE_TIME_DEFAULT SCTP_DEF_PMTU_RAISE_SEC /* shutdown_guard_time: Default shutdown guard timer in seconds */ -#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Default shutdown guard timer in seconds" +#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Shutdown guard timer in seconds (0 means 5 times RTO.Max)" #define SCTPCTL_SHUTDOWN_GUARD_TIME_MIN 0 #define SCTPCTL_SHUTDOWN_GUARD_TIME_MAX 0xFFFFFFFF -#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT SCTP_DEF_MAX_SHUTDOWN_SEC +#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT 0 /* secret_lifetime: Default secret lifetime in seconds */ #define SCTPCTL_SECRET_LIFETIME_DESC "Default secret lifetime in seconds" @@ -342,12 +372,6 @@ struct sctp_sysctl { #define SCTPCTL_CMT_ON_OFF_MAX SCTP_CMT_MAX #define SCTPCTL_CMT_ON_OFF_DEFAULT SCTP_CMT_OFF -/* EY - nr_sack_on_off: NR_SACK on/off flag */ -#define SCTPCTL_NR_SACK_ON_OFF_DESC "NR_SACK on/off flag" -#define SCTPCTL_NR_SACK_ON_OFF_MIN 0 -#define SCTPCTL_NR_SACK_ON_OFF_MAX 1 -#define SCTPCTL_NR_SACK_ON_OFF_DEFAULT 0 - /* cmt_use_dac: CMT DAC on/off flag */ #define SCTPCTL_CMT_USE_DAC_DESC "CMT DAC on/off flag" #define SCTPCTL_CMT_USE_DAC_MIN 0 @@ -360,18 +384,6 @@ struct sctp_sysctl { #define SCTPCTL_CWND_MAXBURST_MAX 1 #define SCTPCTL_CWND_MAXBURST_DEFAULT 1 -/* asconf_auth_nochk: Disable SCTP ASCONF AUTH requirement */ -#define SCTPCTL_ASCONF_AUTH_NOCHK_DESC "Disable SCTP ASCONF AUTH requirement" -#define 
SCTPCTL_ASCONF_AUTH_NOCHK_MIN 0 -#define SCTPCTL_ASCONF_AUTH_NOCHK_MAX 1 -#define SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT 0 - -/* auth_disable: Disable SCTP AUTH function */ -#define SCTPCTL_AUTH_DISABLE_DESC "Disable SCTP AUTH function" -#define SCTPCTL_AUTH_DISABLE_MIN 0 -#define SCTPCTL_AUTH_DISABLE_MAX 1 -#define SCTPCTL_AUTH_DISABLE_DEFAULT 0 - /* nat_friendly: SCTP NAT friendly operation */ #define SCTPCTL_NAT_FRIENDLY_DESC "SCTP NAT friendly operation" #define SCTPCTL_NAT_FRIENDLY_MIN 0 @@ -408,12 +420,6 @@ struct sctp_sysctl { #define SCTPCTL_ABORT_AT_LIMIT_MAX 1 #define SCTPCTL_ABORT_AT_LIMIT_DEFAULT 0 -/* strict_data_order: Enforce strict data ordering, abort if control inside data */ -#define SCTPCTL_STRICT_DATA_ORDER_DESC "Enforce strict data ordering, abort if control inside data" -#define SCTPCTL_STRICT_DATA_ORDER_MIN 0 -#define SCTPCTL_STRICT_DATA_ORDER_MAX 1 -#define SCTPCTL_STRICT_DATA_ORDER_DEFAULT 0 - /* min_residual: min residual in a data fragment leftover */ #define SCTPCTL_MIN_RESIDUAL_DESC "Minimum residual data chunk in second part of split" #define SCTPCTL_MIN_RESIDUAL_MIN 20 @@ -454,13 +460,13 @@ struct sctp_sysctl { #define SCTPCTL_MOBILITY_BASE_DESC "Enable SCTP base mobility" #define SCTPCTL_MOBILITY_BASE_MIN 0 #define SCTPCTL_MOBILITY_BASE_MAX 1 -#define SCTPCTL_MOBILITY_BASE_DEFAULT SCTP_DEFAULT_MOBILITY_BASE +#define SCTPCTL_MOBILITY_BASE_DEFAULT 0 /* mobility_fasthandoff: Enable SCTP fast handoff support */ #define SCTPCTL_MOBILITY_FASTHANDOFF_DESC "Enable SCTP fast handoff" #define SCTPCTL_MOBILITY_FASTHANDOFF_MIN 0 #define SCTPCTL_MOBILITY_FASTHANDOFF_MAX 1 -#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT SCTP_DEFAULT_MOBILITY_FASTHANDOFF +#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT 0 /* Enable SCTP/UDP tunneling port */ #define SCTPCTL_UDP_TUNNELING_PORT_DESC "Set the SCTP/UDP tunneling port" @@ -472,7 +478,7 @@ struct sctp_sysctl { #define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC "Enable sending of the SACK-IMMEDIATELY-bit." #define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN 0 #define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX 1 -#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN +#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX /* Enable sending of the NAT-FRIENDLY message */ #define SCTPCTL_NAT_FRIENDLY_INITS_DESC "Enable sending of the nat-friendly SCTP option on INITs." @@ -525,7 +531,7 @@ struct sctp_sysctl { #define SCTPCTL_RTTVAR_DCCCECN_MAX 1 #define SCTPCTL_RTTVAR_DCCCECN_DEFAULT 1 /* 0 means disable feature */ -#define SCTPCTL_BLACKHOLE_DESC "Enable SCTP blackholing" +#define SCTPCTL_BLACKHOLE_DESC "Enable SCTP blackholing. See blackhole(4) for more details." 
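The updated blackhole description above is one quarter of the convention this header follows throughout: every tunable carries a DESC/MIN/MAX/DEFAULT quadruple (the blackhole MIN, MAX, and DEFAULT lines continue directly below), and SCTP_UINT_SYSCTL pastes the members together from its prefix argument, so adding a knob is largely declarative. As a purely hypothetical illustration, not part of this patch, a new boolean feature would only need:

/*
 * Hypothetical example only -- not part of this patch. The quadruple a
 * new tunable would add to sctp_sysctl.h so that a single
 * SCTP_UINT_SYSCTL(example_knob, sctp_example_knob,
 * SCTPCTL_EXAMPLE_KNOB) line in sctp_sysctl.c can generate its
 * handler, bounds checks, and registration.
 */
#define SCTPCTL_EXAMPLE_KNOB_DESC	"Enable the (hypothetical) example feature"
#define SCTPCTL_EXAMPLE_KNOB_MIN	0
#define SCTPCTL_EXAMPLE_KNOB_MAX	1
#define SCTPCTL_EXAMPLE_KNOB_DEFAULT	SCTPCTL_EXAMPLE_KNOB_MIN

A matching uint32_t member in struct sctp_sysctl and a line seeding it from the _DEFAULT constant in the sysctl-init path (sctp_init_sysctls() in the FreeBSD tree, if memory serves) complete the hookup.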
#define SCTPCTL_BLACKHOLE_MIN 0 #define SCTPCTL_BLACKHOLE_MAX 2 #define SCTPCTL_BLACKHOLE_DEFAULT SCTPCTL_BLACKHOLE_MIN diff --git a/freebsd/sys/netinet/sctp_timer.c b/freebsd/sys/netinet/sctp_timer.c index 7d010c7b..c851317b 100644 --- a/freebsd/sys/netinet/sctp_timer.c +++ b/freebsd/sys/netinet/sctp_timer.c @@ -51,7 +51,9 @@ __FBSDID("$FreeBSD$"); #include #include #include +#if defined(INET) || defined(INET6) #include +#endif void @@ -85,7 +87,7 @@ sctp_audit_retranmission_queue(struct sctp_association *asoc) asoc->sent_queue_cnt); } -int +static int sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, uint16_t threshold) { @@ -110,8 +112,10 @@ sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb, net->dest_state |= SCTP_ADDR_PF; net->last_active = sctp_get_tick_count(); sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3); - sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + inp, stcb, net, + SCTP_FROM_SCTP_TIMER + SCTP_LOC_1); + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } } } @@ -151,9 +155,9 @@ sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb, /* Abort notification sends a ULP notify */ struct mbuf *op_err; - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, - "Association error couter exceeded"); - inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_1; + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Association error counter exceeded"); + inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_2; sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); return (1); } @@ -337,7 +341,7 @@ sctp_find_alternate_net(struct sctp_tcb *stcb, return (NULL); } } - do { + for (;;) { alt = TAILQ_NEXT(mnet, sctp_next); if (alt == NULL) { once++; @@ -356,7 +360,6 @@ sctp_find_alternate_net(struct sctp_tcb *stcb, } alt->src_addr_selected = 0; } - /* sa_ignore NO_NULL_CHK */ if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) && (alt->ro.ro_rt != NULL) && (!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) { @@ -364,14 +367,14 @@ sctp_find_alternate_net(struct sctp_tcb *stcb, break; } mnet = alt; - } while (alt != NULL); + } if (alt == NULL) { /* Case where NO insv network exists (dormant state) */ /* we rotate destinations */ once = 0; mnet = net; - do { + for (;;) { if (mnet == NULL) { return (TAILQ_FIRST(&stcb->asoc.nets)); } @@ -382,15 +385,17 @@ sctp_find_alternate_net(struct sctp_tcb *stcb, break; } alt = TAILQ_FIRST(&stcb->asoc.nets); + if (alt == NULL) { + break; + } } - /* sa_ignore NO_NULL_CHK */ if ((!(alt->dest_state & SCTP_ADDR_UNCONFIRMED)) && (alt != net)) { /* Found an alternate address */ break; } mnet = alt; - } while (alt != NULL); + } } if (alt == NULL) { return (net); @@ -405,7 +410,11 @@ sctp_backoff_on_timeout(struct sctp_tcb *stcb, int num_marked, int num_abandoned) { if (net->RTO == 0) { - net->RTO = stcb->asoc.minrto; + if (net->RTO_measured) { + net->RTO = stcb->asoc.minrto; + } else { + net->RTO = stcb->asoc.initial_rto; + } } net->RTO <<= 1; if (net->RTO > stcb->asoc.maxrto) { @@ -435,6 +444,11 @@ sctp_recover_sent_list(struct sctp_tcb *stcb) asoc->strmout[chk->rec.data.stream_number].chunks_on_queues--; } } + if ((asoc->strmout[chk->rec.data.stream_number].chunks_on_queues == 0) && + (asoc->strmout[chk->rec.data.stream_number].state == 
SCTP_STREAM_RESET_PENDING) && + TAILQ_EMPTY(&asoc->strmout[chk->rec.data.stream_number].outqueue)) { + asoc->trigger_reset = 1; + } TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); if (PR_SCTP_ENABLED(chk->flags)) { if (asoc->pr_sctp_cnt != 0) @@ -445,7 +459,7 @@ sctp_recover_sent_list(struct sctp_tcb *stcb) sctp_free_bufspace(stcb, asoc, chk, 1); sctp_m_freem(chk->data); chk->data = NULL; - if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(chk->flags)) { + if (asoc->prsctp_supported && PR_SCTP_BUF_ENABLED(chk->flags)) { asoc->sent_queue_cnt_removeable--; } } @@ -600,7 +614,7 @@ start_again: continue; } } - if (stcb->asoc.peer_supports_prsctp && PR_SCTP_TTL_ENABLED(chk->flags)) { + if (stcb->asoc.prsctp_supported && PR_SCTP_TTL_ENABLED(chk->flags)) { /* Is it expired? */ if (timevalcmp(&now, &chk->rec.data.timetodrop, >)) { /* Yes so drop it */ @@ -614,7 +628,7 @@ start_again: continue; } } - if (stcb->asoc.peer_supports_prsctp && PR_SCTP_RTX_ENABLED(chk->flags)) { + if (stcb->asoc.prsctp_supported && PR_SCTP_RTX_ENABLED(chk->flags)) { /* Has it been retransmitted tv_sec times? */ if (chk->snd_count > chk->rec.data.timetodrop.tv_sec) { if (chk->data) { @@ -650,7 +664,7 @@ start_again: sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND_TO, chk->whoTo->flight_size, chk->book_size, - (uintptr_t) chk->whoTo, + (uint32_t) (uintptr_t) chk->whoTo, chk->rec.data.TSN_seq); } sctp_flight_size_decrease(chk); @@ -778,7 +792,7 @@ start_again: sctp_misc_ints(SCTP_FLIGHT_LOG_UP, chk->whoTo->flight_size, chk->book_size, - (uintptr_t) chk->whoTo, + (uint32_t) (uintptr_t) chk->whoTo, chk->rec.data.TSN_seq); } sctp_flight_size_increase(chk); @@ -957,7 +971,7 @@ sctp_t3rxt_timer(struct sctp_inpcb *inp, sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); return (0); } - if (stcb->asoc.peer_supports_prsctp) { + if (stcb->asoc.prsctp_supported) { struct sctp_tmit_chunk *lchk; lchk = sctp_try_advance_peer_ack_point(stcb, &stcb->asoc); @@ -1043,9 +1057,9 @@ sctp_cookie_timer(struct sctp_inpcb *inp, /* FOOBAR! 
*/ struct mbuf *op_err; - op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Cookie timer expired, but no cookie"); - inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_4; + inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_3; sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); } else { #ifdef INVARIANTS @@ -1064,8 +1078,8 @@ sctp_cookie_timer(struct sctp_inpcb *inp, return (1); } /* - * cleared theshold management now lets backoff the address & select - * an alternate + * Cleared threshold management, now lets backoff the address and + * select an alternate */ stcb->asoc.dropped_special_cnt = 0; sctp_backoff_on_timeout(stcb, cookie->whoTo, 1, 0, 0); @@ -1110,8 +1124,8 @@ sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb, return (1); } /* - * cleared theshold management now lets backoff the address & select - * an alternate + * Cleared threshold management, now lets backoff the address and + * select an alternate */ sctp_backoff_on_timeout(stcb, strrst->whoTo, 1, 0, 0); alt = sctp_find_alternate_net(stcb, strrst->whoTo, 0); @@ -1270,7 +1284,7 @@ sctp_shutdown_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb, { struct sctp_nets *alt; - /* first threshold managment */ + /* first threshold management */ if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) { /* Assoc is over */ return (1); @@ -1293,7 +1307,7 @@ sctp_shutdownack_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb, { struct sctp_nets *alt; - /* first threshold managment */ + /* first threshold management */ if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) { /* Assoc is over */ return (1); @@ -1482,11 +1496,15 @@ sctp_pathmtu_timer(struct sctp_inpcb *inp, } if (net->ro._s_addr) { mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._s_addr.sa, net->ro.ro_rt); +#if defined(INET) || defined(INET6) if (net->port) { mtu -= sizeof(struct udphdr); } +#endif if (mtu > next_mtu) { net->mtu = next_mtu; + } else { + net->mtu = mtu; } } } diff --git a/freebsd/sys/netinet/sctp_timer.h b/freebsd/sys/netinet/sctp_timer.h index fd9df804..6d409cdc 100644 --- a/freebsd/sys/netinet/sctp_timer.h +++ b/freebsd/sys/netinet/sctp_timer.h @@ -45,10 +45,6 @@ struct sctp_nets * sctp_find_alternate_net(struct sctp_tcb *, struct sctp_nets *, int mode); -int -sctp_threshold_management(struct sctp_inpcb *, struct sctp_tcb *, - struct sctp_nets *, uint16_t); - int sctp_t3rxt_timer(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *); diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h index 968fc980..e65b7b5e 100644 --- a/freebsd/sys/netinet/sctp_uio.h +++ b/freebsd/sys/netinet/sctp_uio.h @@ -134,20 +134,27 @@ struct sctp_extrcvinfo { uint16_t sinfo_flags; uint32_t sinfo_ppid; uint32_t sinfo_context; - uint32_t sinfo_timetolive; + uint32_t sinfo_timetolive; /* should have been sinfo_pr_value */ uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; - uint16_t sreinfo_next_flags; - uint16_t sreinfo_next_stream; - uint32_t sreinfo_next_aid; - uint32_t sreinfo_next_length; - uint32_t sreinfo_next_ppid; + uint16_t serinfo_next_flags; + uint16_t serinfo_next_stream; + uint32_t serinfo_next_aid; + uint32_t serinfo_next_length; + uint32_t serinfo_next_ppid; uint16_t sinfo_keynumber; uint16_t sinfo_keynumber_valid; uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT]; }; +#define sinfo_pr_value sinfo_timetolive +#define sreinfo_next_flags 
serinfo_next_flags +#define sreinfo_next_stream serinfo_next_stream +#define sreinfo_next_aid serinfo_next_aid +#define sreinfo_next_length serinfo_next_length +#define sreinfo_next_ppid serinfo_next_ppid + struct sctp_sndinfo { uint16_t snd_sid; uint16_t snd_flags; @@ -249,18 +256,24 @@ struct sctp_snd_all_completes { SCTP_SACK_IMMEDIATELY)) != 0) /* for the endpoint */ -/* The lower byte is an enumeration of PR-SCTP policies */ +/* The lower four bits is an enumeration of PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000/* Reliable transfer */ #define SCTP_PR_SCTP_TTL 0x0001/* Time based PR-SCTP */ -#define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */ +#define SCTP_PR_SCTP_PRIO 0x0002/* Buffer based PR-SCTP */ +#define SCTP_PR_SCTP_BUF SCTP_PR_SCTP_PRIO /* For backwards compatibility */ #define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */ +#define SCTP_PR_SCTP_MAX SCTP_PR_SCTP_RTX +#define SCTP_PR_SCTP_ALL 0x000f/* Used for aggregated stats */ #define PR_SCTP_POLICY(x) ((x) & 0x0f) -#define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != SCTP_PR_SCTP_NONE) +#define PR_SCTP_ENABLED(x) ((PR_SCTP_POLICY(x) != SCTP_PR_SCTP_NONE) && \ + (PR_SCTP_POLICY(x) != SCTP_PR_SCTP_ALL)) #define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL) #define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF) #define PR_SCTP_RTX_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_RTX) -#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_RTX) +#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_MAX) +#define PR_SCTP_VALID_POLICY(x) (PR_SCTP_POLICY(x) <= SCTP_PR_SCTP_MAX) + /* Stat's */ struct sctp_pcbinfo { uint32_t ep_count; @@ -306,12 +319,13 @@ struct sctp_assoc_change { #define SCTP_CANT_STR_ASSOC 0x0005 /* sac_info values */ -#define SCTP_ASSOC_SUPPORTS_PR 0x01 -#define SCTP_ASSOC_SUPPORTS_AUTH 0x02 -#define SCTP_ASSOC_SUPPORTS_ASCONF 0x03 -#define SCTP_ASSOC_SUPPORTS_MULTIBUF 0x04 -#define SCTP_ASSOC_SUPPORTS_RE_CONFIG 0x05 -#define SCTP_ASSOC_SUPPORTS_MAX 0x05 +#define SCTP_ASSOC_SUPPORTS_PR 0x01 +#define SCTP_ASSOC_SUPPORTS_AUTH 0x02 +#define SCTP_ASSOC_SUPPORTS_ASCONF 0x03 +#define SCTP_ASSOC_SUPPORTS_MULTIBUF 0x04 +#define SCTP_ASSOC_SUPPORTS_RE_CONFIG 0x05 +#define SCTP_ASSOC_SUPPORTS_INTERLEAVING 0x06 +#define SCTP_ASSOC_SUPPORTS_MAX 0x06 /* * Address event */ @@ -323,7 +337,6 @@ struct sctp_paddr_change { uint32_t spc_state; uint32_t spc_error; sctp_assoc_t spc_assoc_id; - uint8_t spc_padding[4]; }; /* paddr state values */ @@ -346,7 +359,7 @@ struct sctp_remote_error { uint32_t sre_length; uint16_t sre_error; sctp_assoc_t sre_assoc_id; - uint8_t sre_data[4]; + uint8_t sre_data[]; }; /* data send failure event (deprecated) */ @@ -578,6 +591,7 @@ struct sctp_paddrthlds { sctp_assoc_t spt_assoc_id; uint16_t spt_pathmaxrxt; uint16_t spt_pathpfthld; + uint16_t spt_pathcpthld; }; struct sctp_paddrinfo { @@ -720,6 +734,14 @@ struct sctp_udpencaps { uint16_t sue_port; }; +struct sctp_prstatus { + sctp_assoc_t sprstat_assoc_id; + uint16_t sprstat_sid; + uint16_t sprstat_policy; + uint64_t sprstat_abandoned_unsent; + uint64_t sprstat_abandoned_sent; +}; + struct sctp_cwnd_args { struct sctp_nets *net; /* network to *//* FIXME: LP64 issue */ uint32_t cwnd_new_value;/* cwnd in k */ @@ -1145,15 +1167,22 @@ union sctp_sockstore { struct xsctp_inpcb { uint32_t last; uint32_t flags; - uint32_t features; + uint64_t features; uint32_t total_sends; uint32_t total_recvs; uint32_t total_nospaces; uint32_t fragmentation_point; uint16_t 
local_port; - uint16_t qlen; - uint16_t maxqlen; - uint32_t extra_padding[32]; /* future */ + uint16_t qlen_old; + uint16_t maxqlen_old; + void *socket; + uint32_t qlen; + uint32_t maxqlen; +#if defined(__LP64__) + uint32_t extra_padding[27]; /* future */ +#else + uint32_t extra_padding[28]; /* future */ +#endif }; struct xsctp_tcb { @@ -1211,7 +1240,8 @@ struct xsctp_raddr { struct sctp_timeval start_time; /* sctpAssocLocalRemEntry 8 */ uint32_t rtt; uint32_t heartbeat_interval; - uint32_t extra_padding[31]; /* future */ + uint32_t ssthresh; + uint32_t extra_padding[30]; /* future */ }; #define SCTP_MAX_LOGGING_SIZE 30000 diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c index b19a7499..1cbb7076 100644 --- a/freebsd/sys/netinet/sctp_usrreq.c +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include #include #ifdef INET6 +#include #endif #include #include @@ -55,8 +56,8 @@ __FBSDID("$FreeBSD$"); -extern struct sctp_cc_functions sctp_cc_functions[]; -extern struct sctp_ss_functions sctp_ss_functions[]; +extern const struct sctp_cc_functions sctp_cc_functions[]; +extern const struct sctp_ss_functions sctp_ss_functions[]; void sctp_init(void) @@ -90,13 +91,15 @@ sctp_init(void) #endif } -void -sctp_finish(void) +#ifdef VIMAGE +static void +sctp_finish(void *unused __unused) { sctp_pcb_finish(); } - +VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL); +#endif void sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz) @@ -126,148 +129,55 @@ sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint16_t nxtsz) if (chk->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); - } - if (chk->sent != SCTP_DATAGRAM_RESEND) { + chk->sent = SCTP_DATAGRAM_RESEND; sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + chk->rec.data.doing_fast_retransmit = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU, + chk->whoTo->flight_size, + chk->book_size, + (uint32_t) (uintptr_t) chk->whoTo, + chk->rec.data.TSN_seq); + } + /* Clear any time so NO RTT is being done */ + chk->do_rtt = 0; } - chk->sent = SCTP_DATAGRAM_RESEND; - chk->rec.data.doing_fast_retransmit = 0; - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { - sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU, - chk->whoTo->flight_size, - chk->book_size, - (uintptr_t) chk->whoTo, - chk->rec.data.TSN_seq); - } - /* Clear any time so NO RTT is being done */ - chk->do_rtt = 0; } } } #ifdef INET -static void -sctp_notify_mbuf(struct sctp_inpcb *inp, - struct sctp_tcb *stcb, - struct sctp_nets *net, - struct ip *ip, - struct sctphdr *sh) -{ - struct icmp *icmph; - int totsz, tmr_stopped = 0; - uint16_t nxtsz; - - /* protection */ - if ((inp == NULL) || (stcb == NULL) || (net == NULL) || - (ip == NULL) || (sh == NULL)) { - if (stcb != NULL) { - SCTP_TCB_UNLOCK(stcb); - } - return; - } - /* First job is to verify the vtag matches what I would send */ - if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) { - SCTP_TCB_UNLOCK(stcb); - return; - } - icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) - - sizeof(struct ip))); - if (icmph->icmp_type != ICMP_UNREACH) { - /* We only care about unreachable */ - SCTP_TCB_UNLOCK(stcb); - return; - } - if (icmph->icmp_code != ICMP_UNREACH_NEEDFRAG) { - /* not a unreachable message due to frag. 
*/ - SCTP_TCB_UNLOCK(stcb); - return; - } - totsz = ip->ip_len; - - nxtsz = ntohs(icmph->icmp_nextmtu); - if (nxtsz == 0) { - /* - * old type router that does not tell us what the next size - * mtu is. Rats we will have to guess (in a educated fashion - * of course) - */ - nxtsz = sctp_get_prev_mtu(totsz); - } - /* Stop any PMTU timer */ - if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { - tmr_stopped = 1; - sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1); - } - /* Adjust destination size limit */ - if (net->mtu > nxtsz) { - net->mtu = nxtsz; - if (net->port) { - net->mtu -= sizeof(struct udphdr); - } - } - /* now what about the ep? */ - if (stcb->asoc.smallest_mtu > nxtsz) { - sctp_pathmtu_adjustment(stcb, nxtsz); - } - if (tmr_stopped) - sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); - - SCTP_TCB_UNLOCK(stcb); -} - -#endif - void sctp_notify(struct sctp_inpcb *inp, - struct ip *ip, - struct sctphdr *sh, - struct sockaddr *to, struct sctp_tcb *stcb, - struct sctp_nets *net) + struct sctp_nets *net, + uint8_t icmp_type, + uint8_t icmp_code, + uint16_t ip_len, + uint16_t next_mtu) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif - struct icmp *icmph; + int timer_stopped; - /* protection */ - if ((inp == NULL) || (stcb == NULL) || (net == NULL) || - (sh == NULL) || (to == NULL)) { - if (stcb) - SCTP_TCB_UNLOCK(stcb); - return; - } - /* First job is to verify the vtag matches what I would send */ - if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) { - SCTP_TCB_UNLOCK(stcb); - return; - } - icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) - - sizeof(struct ip))); - if (icmph->icmp_type != ICMP_UNREACH) { + if (icmp_type != ICMP_UNREACH) { /* We only care about unreachable */ SCTP_TCB_UNLOCK(stcb); return; } - if ((icmph->icmp_code == ICMP_UNREACH_NET) || - (icmph->icmp_code == ICMP_UNREACH_HOST) || - (icmph->icmp_code == ICMP_UNREACH_NET_UNKNOWN) || - (icmph->icmp_code == ICMP_UNREACH_HOST_UNKNOWN) || - (icmph->icmp_code == ICMP_UNREACH_ISOLATED) || - (icmph->icmp_code == ICMP_UNREACH_NET_PROHIB) || - (icmph->icmp_code == ICMP_UNREACH_HOST_PROHIB) || - (icmph->icmp_code == ICMP_UNREACH_FILTER_PROHIB)) { - - /* - * Hmm reachablity problems we must examine closely. If its - * not reachable, we may have lost a network. Or if there is - * NO protocol at the other end named SCTP. well we consider - * it a OOTB abort. - */ + if ((icmp_code == ICMP_UNREACH_NET) || + (icmp_code == ICMP_UNREACH_HOST) || + (icmp_code == ICMP_UNREACH_NET_UNKNOWN) || + (icmp_code == ICMP_UNREACH_HOST_UNKNOWN) || + (icmp_code == ICMP_UNREACH_ISOLATED) || + (icmp_code == ICMP_UNREACH_NET_PROHIB) || + (icmp_code == ICMP_UNREACH_HOST_PROHIB) || + (icmp_code == ICMP_UNREACH_FILTER_PROHIB)) { + /* Mark the net unreachable. */ if (net->dest_state & SCTP_ADDR_REACHABLE) { - /* Ok that destination is NOT reachable */ + /* OK, that destination is NOT reachable. */ net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, @@ -275,15 +185,9 @@ sctp_notify(struct sctp_inpcb *inp, (void *)net, SCTP_SO_NOT_LOCKED); } SCTP_TCB_UNLOCK(stcb); - } else if ((icmph->icmp_code == ICMP_UNREACH_PROTOCOL) || - (icmph->icmp_code == ICMP_UNREACH_PORT)) { - /* - * Here the peer is either playing tricks on us, including - * an address that belongs to someone who does not support - * SCTP OR was a userland implementation that shutdown and - * now is dead. 
In either case treat it like a OOTB abort - * with no TCB - */ + } else if ((icmp_code == ICMP_UNREACH_PROTOCOL) || + (icmp_code == ICMP_UNREACH_PORT)) { + /* Treat it like an ABORT. */ sctp_abort_notification(stcb, 1, 0, NULL, SCTP_SO_NOT_LOCKED); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); @@ -293,72 +197,141 @@ sctp_notify(struct sctp_inpcb *inp, SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); /* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */ #endif /* no need to unlock here, since the TCB is gone */ + } else if (icmp_code == ICMP_UNREACH_NEEDFRAG) { + /* Find the next (smaller) MTU */ + if (next_mtu == 0) { + /* + * Old type router that does not tell us what the + * next MTU is. Rats we will have to guess (in a + * educated fashion of course). + */ + next_mtu = sctp_get_prev_mtu(ip_len); + } + /* Stop the PMTU timer. */ + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + timer_stopped = 1; + sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1); + } else { + timer_stopped = 0; + } + /* Update the path MTU. */ + if (net->mtu > next_mtu) { + net->mtu = next_mtu; + if (net->port) { + net->mtu -= sizeof(struct udphdr); + } + } + /* Update the association MTU */ + if (stcb->asoc.smallest_mtu > next_mtu) { + sctp_pathmtu_adjustment(stcb, next_mtu); + } + /* Finally, start the PMTU timer if it was running before. */ + if (timer_stopped) { + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); + } + SCTP_TCB_UNLOCK(stcb); } else { SCTP_TCB_UNLOCK(stcb); } } -#ifdef INET void -sctp_ctlinput(cmd, sa, vip) - int cmd; - struct sockaddr *sa; - void *vip; +sctp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { - struct ip *ip = vip; + struct ip *outer_ip; + struct ip *inner_ip; struct sctphdr *sh; - uint32_t vrf_id; + struct icmp *icmp; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + struct sctp_init_chunk *ch; + struct sockaddr_in src, dst; - /* FIX, for non-bsd is this right? 
*/ - vrf_id = SCTP_DEFAULT_VRFID; if (sa->sa_family != AF_INET || ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) { return; } if (PRC_IS_REDIRECT(cmd)) { - ip = 0; + vip = NULL; } else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) { return; } - if (ip) { - struct sctp_inpcb *inp = NULL; - struct sctp_tcb *stcb = NULL; - struct sctp_nets *net = NULL; - struct sockaddr_in to, from; - - sh = (struct sctphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - bzero(&to, sizeof(to)); - bzero(&from, sizeof(from)); - from.sin_family = to.sin_family = AF_INET; - from.sin_len = to.sin_len = sizeof(to); - from.sin_port = sh->src_port; - from.sin_addr = ip->ip_src; - to.sin_port = sh->dest_port; - to.sin_addr = ip->ip_dst; - + if (vip != NULL) { + inner_ip = (struct ip *)vip; + icmp = (struct icmp *)((caddr_t)inner_ip - + (sizeof(struct icmp) - sizeof(struct ip))); + outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); + sh = (struct sctphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2)); + memset(&src, 0, sizeof(struct sockaddr_in)); + src.sin_family = AF_INET; + src.sin_len = sizeof(struct sockaddr_in); + src.sin_port = sh->src_port; + src.sin_addr = inner_ip->ip_src; + memset(&dst, 0, sizeof(struct sockaddr_in)); + dst.sin_family = AF_INET; + dst.sin_len = sizeof(struct sockaddr_in); + dst.sin_port = sh->dest_port; + dst.sin_addr = inner_ip->ip_dst; /* - * 'to' holds the dest of the packet that failed to be sent. - * 'from' holds our local endpoint address. Thus we reverse - * the to and the from in the lookup. + * 'dst' holds the dest of the packet that failed to be + * sent. 'src' holds our local endpoint address. Thus we + * reverse the dst and the src in the lookup. */ - stcb = sctp_findassociation_addr_sa((struct sockaddr *)&to, - (struct sockaddr *)&from, - &inp, &net, 1, vrf_id); - if (stcb != NULL && inp && (inp->sctp_socket != NULL)) { - if (cmd != PRC_MSGSIZE) { - sctp_notify(inp, ip, sh, - (struct sockaddr *)&to, stcb, - net); + inp = NULL; + net = NULL; + stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, + (struct sockaddr *)&src, + &inp, &net, 1, + SCTP_DEFAULT_VRFID); + if ((stcb != NULL) && + (net != NULL) && + (inp != NULL)) { + /* Check the verification tag */ + if (ntohl(sh->v_tag) != 0) { + /* + * This must be the verification tag used + * for sending out packets. We don't + * consider packets reflecting the + * verification tag. + */ + if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) { + SCTP_TCB_UNLOCK(stcb); + return; + } } else { - /* handle possible ICMP size messages */ - sctp_notify_mbuf(inp, stcb, net, ip, sh); + if (ntohs(outer_ip->ip_len) >= + sizeof(struct ip) + + 8 + (inner_ip->ip_hl << 2) + 20) { + /* + * In this case we can check if we + * got an INIT chunk and if the + * initiate tag matches. 
+ */ + ch = (struct sctp_init_chunk *)(sh + 1); + if ((ch->ch.chunk_type != SCTP_INITIATION) || + (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) { + SCTP_TCB_UNLOCK(stcb); + return; + } + } else { + SCTP_TCB_UNLOCK(stcb); + return; + } } + sctp_notify(inp, stcb, net, + icmp->icmp_type, + icmp->icmp_code, + ntohs(inner_ip->ip_len), + ntohs(icmp->icmp_nextmtu)); } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce ref-count */ @@ -489,13 +462,8 @@ sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUS int error; uint32_t vrf_id = SCTP_DEFAULT_VRFID; -#ifdef IPSEC - uint32_t flags; - -#endif - inp = (struct sctp_inpcb *)so->so_pcb; - if (inp != 0) { + if (inp != NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } @@ -515,33 +483,6 @@ sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUS ip_inp = &inp->ip_inp.inp; ip_inp->inp_vflag |= INP_IPV4; ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); -#ifdef IPSEC - error = ipsec_init_policy(so, &ip_inp->inp_sp); -#ifdef SCTP_LOG_CLOSING - sctp_log_closing(inp, NULL, 17); -#endif - if (error != 0) { -try_again: - flags = inp->sctp_flags; - if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && - (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { -#ifdef SCTP_LOG_CLOSING - sctp_log_closing(inp, NULL, 15); -#endif - SCTP_INP_WUNLOCK(inp); - sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, - SCTP_CALLED_AFTER_CMPSET_OFCLOSE); - } else { - flags = inp->sctp_flags; - if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { - goto try_again; - } else { - SCTP_INP_WUNLOCK(inp); - } - } - return (error); - } -#endif /* IPSEC */ SCTP_INP_WUNLOCK(inp); return (0); } @@ -759,7 +700,7 @@ sctp_disconnect(struct socket *so) /* Left with Data unread */ struct mbuf *err; - err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA); + err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (err) { /* * Fill in the user @@ -780,7 +721,8 @@ sctp_disconnect(struct socket *so) (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3); /* No unlock tcb assoc is gone */ return (0); } @@ -788,7 +730,7 @@ sctp_disconnect(struct socket *so) TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { /* there is nothing queued to send, so done */ - if (asoc->locked_on_sending) { + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && @@ -837,18 +779,8 @@ sctp_disconnect(struct socket *so) asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); - if (asoc->locked_on_sending) { - /* Locked to send out the data */ - struct sctp_stream_queue_pending *sp; - - sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); - if (sp == NULL) { - SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", - asoc->locked_on_sending->stream_no); - } else { - if ((sp->length == 0) && (sp->msg_is_complete == 0)) - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - } + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { + asoc->state |= 
SCTP_STATE_PARTIAL_MSG_LEFT; } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && @@ -865,7 +797,8 @@ sctp_disconnect(struct socket *so) SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_INP_RUNLOCK(inp); - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5); return (0); } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); @@ -957,14 +890,15 @@ sctp_shutdown(struct socket *so) SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); return (EOPNOTSUPP); - } - /* - * Ok if we reach here its the TCP model and it is either a SHUT_WR - * or SHUT_RDWR. This means we put the shutdown flag against it. - */ - { + } else { + /* + * Ok, if we reach here its the TCP model and it is either a + * SHUT_WR or SHUT_RDWR. This means we put the shutdown flag + * against it. + */ struct sctp_tcb *stcb; struct sctp_association *asoc; + struct sctp_nets *netp; if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { @@ -976,7 +910,7 @@ sctp_shutdown(struct socket *so) stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { /* - * Ok we hit the case that the shutdown call was + * Ok, we hit the case that the shutdown call was * made after an abort or something. Nothing to do * now. */ @@ -985,66 +919,50 @@ sctp_shutdown(struct socket *so) } SCTP_TCB_LOCK(stcb); asoc = &stcb->asoc; - if (TAILQ_EMPTY(&asoc->send_queue) && + if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + return (0); + } + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_ECHOED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN)) { + /* + * If we are not in or before ESTABLISHED, there is + * no protocol action required. + */ + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + return (0); + } + if (stcb->asoc.alternate) { + netp = stcb->asoc.alternate; + } else { + netp = stcb->asoc.primary_destination; + } + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && + TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->stream_queue_cnt == 0)) { - if (asoc->locked_on_sending) { + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... 
*/ - if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { - /* only send SHUTDOWN the first time through */ - struct sctp_nets *netp; - - if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || - (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { - SCTP_STAT_DECR_GAUGE32(sctps_currestab); - } - SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); - SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); - sctp_stop_timers_for_shutdown(stcb); - if (stcb->asoc.alternate) { - netp = stcb->asoc.alternate; - } else { - netp = stcb->asoc.primary_destination; - } - sctp_send_shutdown(stcb, netp); - sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, - stcb->sctp_ep, stcb, netp); - sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, - stcb->sctp_ep, stcb, netp); - sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED); - } + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown(stcb, netp); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, netp); } else { /* - * we still got (or just got) data to send, so set - * SHUTDOWN_PENDING + * We still got (or just got) data to send, so set + * SHUTDOWN_PENDING. */ - struct sctp_nets *netp; - - if (stcb->asoc.alternate) { - netp = stcb->asoc.alternate; - } else { - netp = stcb->asoc.primary_destination; - } - - asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; - sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, - netp); - - if (asoc->locked_on_sending) { - /* Locked to send out the data */ - struct sctp_stream_queue_pending *sp; - - sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); - if (sp == NULL) { - SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", - asoc->locked_on_sending->stream_no); - } else { - if ((sp->length == 0) && (sp->msg_is_complete == 0)) { - asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; - } - } + SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { + SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && @@ -1056,16 +974,20 @@ sctp_shutdown(struct socket *so) stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6; sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); - goto skip_unlock; - } else { - sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); + SCTP_INP_RUNLOCK(inp); + return (0); } } + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, netp); + /* + * XXX: Why do this in the case where we have still data + * queued? 
+ */ + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + return (0); } -skip_unlock: - SCTP_INP_RUNLOCK(inp); - return (0); } /* @@ -1190,7 +1112,7 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, if (ipv4_addr_legal) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* * we skip @@ -1235,7 +1157,7 @@ sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * we skip @@ -1375,10 +1297,14 @@ sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id) switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: +#ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); +#else + cnt += sizeof(struct sockaddr_in); +#endif break; #endif #ifdef INET6 @@ -1398,10 +1324,14 @@ sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id) switch (laddr->ifa->address.sa.sa_family) { #ifdef INET case AF_INET: +#ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) cnt += sizeof(struct sockaddr_in6); else cnt += sizeof(struct sockaddr_in); +#else + cnt += sizeof(struct sockaddr_in); +#endif break; #endif #ifdef INET6 @@ -1437,7 +1367,7 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, int creat_lock_on = 0; struct sctp_tcb *stcb = NULL; struct sockaddr *sa; - int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; + unsigned int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr; uint32_t vrf_id; int bad_addresses = 0; sctp_assoc_t *a_id; @@ -1473,10 +1403,10 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, error = EFAULT; goto out_now; } - totaddrp = (int *)optval; + totaddrp = (unsigned int *)optval; totaddr = *totaddrp; sa = (struct sockaddr *)(totaddrp + 1); - stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (optsize - sizeof(int)), &bad_addresses); + stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (unsigned int)(optsize - sizeof(int)), &bad_addresses); if ((stcb != NULL) || bad_addresses) { /* Already have or am bring up an association */ SCTP_ASOC_CREATE_UNLOCK(inp); @@ -1525,6 +1455,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, /* We are GOOD to go */ stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id, + inp->sctp_ep.pre_open_stream_count, + inp->sctp_ep.port, (struct thread *)p ); if (stcb == NULL) { @@ -1557,7 +1489,8 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); /* Fill in the return id */ if (error) { - (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6); + (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); goto out_now; } a_id = (sctp_assoc_t *) optval; @@ -1575,11 +1508,6 @@ sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval, sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } SCTP_TCB_UNLOCK(stcb); - if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { - stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; - /* Set the connected flag so we can queue data */ - soisconnecting(so); - } 
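
[Editorial note on the address-buffer hunks above: sctp_fill_up_addresses_vrf() and sctp_count_max_addresses_vrf() now branch on INET6 so that an AF_INET entry is sized as a sockaddr_in6 whenever the socket asked for v4-mapped reporting. A minimal standalone sketch of that sizing rule follows; wants_mapped_v4 is a hypothetical stand-in for sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4), and this is illustrative, not the kernel code.]

#include <stddef.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Sketch only: how many bytes one address entry contributes to the
 * user buffer, mirroring the #ifdef INET6 branches added in the diff.
 */
static size_t
addr_entry_size(int family, int wants_mapped_v4)
{
	switch (family) {
	case AF_INET:
		/* Reported as ::ffff:a.b.c.d, so it needs in6-sized room. */
		return (wants_mapped_v4 ? sizeof(struct sockaddr_in6) :
		    sizeof(struct sockaddr_in));
	case AF_INET6:
		return (sizeof(struct sockaddr_in6));
	default:
		return (0);
	}
}

[Without the extra #ifdef INET6 guard, an INET-only build would have referenced sockaddr_in6 in dead code; the duplicated plain-sizeof branch keeps such builds compiling.]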
out_now: if (creat_lock_on) { SCTP_ASOC_CREATE_UNLOCK(inp); @@ -1752,6 +1680,37 @@ flags_out: *optsize = sizeof(uint32_t); break; } + case SCTP_INTERLEAVING_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.idata_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + if (inp->idata_supported) { + av->assoc_value = 1; + } else { + av->assoc_value = 0; + } + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } case SCTP_CMT_ON_OFF: { struct sctp_assoc_value *av; @@ -1905,8 +1864,15 @@ flags_out: uint32_t *value, cnt; SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); - cnt = 0; SCTP_INP_RLOCK(inp); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* Can't do this for a 1-1 socket */ + error = EINVAL; + SCTP_INP_RUNLOCK(inp); + break; + } + cnt = 0; LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { cnt++; } @@ -1918,15 +1884,28 @@ flags_out: case SCTP_GET_ASSOC_ID_LIST: { struct sctp_assoc_ids *ids; - unsigned int at, limit; + uint32_t at; + size_t limit; SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize); + SCTP_INP_RLOCK(inp); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* Can't do this for a 1-1 socket */ + error = EINVAL; + SCTP_INP_RUNLOCK(inp); + break; + } at = 0; limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t); - SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { if (at < limit) { ids->gaids_assoc_id[at++] = sctp_get_associd(stcb); + if (at == 0) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } } else { error = EINVAL; SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); @@ -2219,23 +2198,27 @@ flags_out: size = 0; /* Count the sizes */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { - size += sizeof(struct sockaddr_in6); - } else { - switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: + case AF_INET: +#ifdef INET6 + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { + size += sizeof(struct sockaddr_in6); + } else { size += sizeof(struct sockaddr_in); - break; + } +#else + size += sizeof(struct sockaddr_in); +#endif + break; #endif #ifdef INET6 - case AF_INET6: - size += sizeof(struct sockaddr_in6); - break; + case AF_INET6: + size += sizeof(struct sockaddr_in6); + break; #endif - default: - break; - } + default: + break; } } SCTP_TCB_UNLOCK(stcb); @@ -2267,43 +2250,47 @@ flags_out: sas = (struct sockaddr_storage *)&saddr->addr[0]; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { - cpsz = sizeof(struct sockaddr_in6); - } else { - switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { + switch (net->ro._l_addr.sa.sa_family) { #ifdef INET - case AF_INET: - cpsz = sizeof(struct sockaddr_in); - break; -#endif + case AF_INET: #ifdef INET6 - case 
AF_INET6: + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { cpsz = sizeof(struct sockaddr_in6); - break; -#endif - default: - cpsz = 0; - break; + } else { + cpsz = sizeof(struct sockaddr_in); } - } - if (cpsz == 0) { +#else + cpsz = sizeof(struct sockaddr_in); +#endif break; - } - if (left < cpsz) { +#endif +#ifdef INET6 + case AF_INET6: + cpsz = sizeof(struct sockaddr_in6); + break; +#endif + default: + cpsz = 0; + break; + } + if (cpsz == 0) { + break; + } + if (left < cpsz) { /* not enough room. */ break; } #if defined(INET) && defined(INET6) if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) && - (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET)) { + (net->ro._l_addr.sa.sa_family == AF_INET)) { /* Must map the address */ - in6_sin_2_v4mapsin6((struct sockaddr_in *)&net->ro._l_addr, + in6_sin_2_v4mapsin6(&net->ro._l_addr.sin, (struct sockaddr_in6 *)sas); } else { -#endif memcpy(sas, &net->ro._l_addr, cpsz); -#if defined(INET) && defined(INET6) } +#else + memcpy(sas, &net->ro._l_addr, cpsz); #endif ((struct sockaddr_in *)sas)->sin_port = stcb->rport; @@ -2340,13 +2327,35 @@ flags_out: { struct sctp_paddrparams *paddrp; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address); +#if defined(INET) && defined(INET6) + if (paddrp->spp_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&paddrp->spp_address; + } + } else { + addr = (struct sockaddr *)&paddrp->spp_address; + } +#else + addr = (struct sockaddr *)&paddrp->spp_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -2355,22 +2364,20 @@ flags_out: * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
*/ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddrp->spp_address, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&paddrp->spp_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); @@ -2379,10 +2386,10 @@ flags_out: } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); @@ -2398,21 +2405,27 @@ flags_out: break; } } - if (stcb) { + if (stcb != NULL) { /* Applies to the specific association */ paddrp->spp_flags = 0; - if (net) { - int ovh; - - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MED_OVERHEAD; - } else { - ovh = SCTP_MED_V4_OVERHEAD; - } - + if (net != NULL) { paddrp->spp_hbinterval = net->heart_beat_delay; paddrp->spp_pathmaxrxt = net->failure_threshold; - paddrp->spp_pathmtu = net->mtu - ovh; + paddrp->spp_pathmtu = net->mtu; + switch (net->ro._l_addr.sa.sa_family) { +#ifdef INET + case AF_INET: + paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD; + break; +#endif +#ifdef INET6 + case AF_INET6: + paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD; + break; +#endif + default: + break; + } /* get flags for HB */ if (net->dest_state & SCTP_ADDR_NOHB) { paddrp->spp_flags |= SPP_HB_DISABLE; @@ -2421,9 +2434,9 @@ flags_out: } /* get flags for PMTU */ if (net->dest_state & SCTP_ADDR_NO_PMTUD) { - paddrp->spp_flags |= SPP_PMTUD_ENABLE; - } else { paddrp->spp_flags |= SPP_PMTUD_DISABLE; + } else { + paddrp->spp_flags |= SPP_PMTUD_ENABLE; } if (net->dscp & 0x01) { paddrp->spp_dscp = net->dscp & 0xfc; @@ -2442,7 +2455,7 @@ flags_out: * value */ paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure; - paddrp->spp_pathmtu = sctp_get_frag_point(stcb, &stcb->asoc); + paddrp->spp_pathmtu = 0; if (stcb->asoc.default_dscp & 0x01) { paddrp->spp_dscp = stcb->asoc.default_dscp & 0xfc; paddrp->spp_flags |= SPP_DSCP; @@ -2517,13 +2530,35 @@ flags_out: { struct sctp_paddrinfo *paddri; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize); SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&paddri->spinfo_address); +#if defined(INET) && defined(INET6) + if (paddri->spinfo_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&paddri->spinfo_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&paddri->spinfo_address; + } + } else { + addr = (struct sockaddr *)&paddri->spinfo_address; + } +#else + addr = (struct sockaddr *)&paddri->spinfo_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We 
increment here since @@ -2532,14 +2567,15 @@ flags_out: * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddri->spinfo_address, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if ((stcb) && (net)) { + if ((stcb != NULL) && (net != NULL)) { if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* It's unconfirmed */ paddri->spinfo_state = SCTP_UNCONFIRMED; @@ -2555,10 +2591,24 @@ flags_out: paddri->spinfo_rto = net->RTO; paddri->spinfo_assoc_id = sctp_get_associd(stcb); paddri->spinfo_mtu = net->mtu; + switch (addr->sa_family) { +#if defined(INET) + case AF_INET: + paddri->spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; + break; +#endif +#if defined(INET6) + case AF_INET6: + paddri->spinfo_mtu -= SCTP_MIN_OVERHEAD; + break; +#endif + default: + break; + } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_paddrinfo); } else { - if (stcb) { + if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); @@ -2588,12 +2638,7 @@ flags_out: error = EINVAL; break; } - /* - * I think passing the state is fine since - * sctp_constants.h will be available to the user - * land. - */ - sstat->sstat_state = stcb->asoc.state; + sstat->sstat_state = sctp_map_assoc_state(stcb->asoc.state); sstat->sstat_assoc_id = sctp_get_associd(stcb); sstat->sstat_rwnd = stcb->asoc.peers_rwnd; sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt; @@ -2631,6 +2676,20 @@ flags_out: sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT; sstat->sstat_primary.spinfo_rto = net->RTO; sstat->sstat_primary.spinfo_mtu = net->mtu; + switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) { +#if defined(INET) + case AF_INET: + sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD; + break; +#endif +#if defined(INET6) + case AF_INET6: + sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD; + break; +#endif + default: + break; + } sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_status); @@ -2775,16 +2834,32 @@ flags_out: SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id); if (stcb) { - /* simply copy out the sockaddr_storage... 
*/ - size_t len; + union sctp_sockstore *addr; - len = *optsize; - if (len > stcb->asoc.primary_destination->ro._l_addr.sa.sa_len) - len = stcb->asoc.primary_destination->ro._l_addr.sa.sa_len; - - memcpy(&ssp->ssp_addr, - &stcb->asoc.primary_destination->ro._l_addr, - len); + addr = &stcb->asoc.primary_destination->ro._l_addr; + switch (addr->sa.sa_family) { +#ifdef INET + case AF_INET: +#ifdef INET6 + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { + in6_sin_2_v4mapsin6(&addr->sin, + (struct sockaddr_in6 *)&ssp->ssp_addr); + } else { + memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); + } +#else + memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in)); +#endif + break; +#endif +#ifdef INET6 + case AF_INET6: + memcpy(&ssp->ssp_addr, &addr->sin6, sizeof(struct sockaddr_in6)); + break; +#endif + default: + break; + } SCTP_TCB_UNLOCK(stcb); *optsize = sizeof(struct sctp_setprim); } else { @@ -3124,13 +3199,35 @@ flags_out: { struct sctp_paddrthlds *thlds; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, *optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&thlds->spt_address); +#if defined(INET) && defined(INET6) + if (thlds->spt_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&thlds->spt_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&thlds->spt_address; + } + } else { + addr = (struct sockaddr *)&thlds->spt_address; + } +#else + addr = (struct sockaddr *)&thlds->spt_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -3139,22 +3236,20 @@ flags_out: * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
*/ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&thlds->spt_address, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&thlds->spt_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); @@ -3163,10 +3258,10 @@ flags_out: } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); @@ -3182,13 +3277,15 @@ flags_out: break; } } - if (stcb) { - if (net) { + if (stcb != NULL) { + if (net != NULL) { thlds->spt_pathmaxrxt = net->failure_threshold; thlds->spt_pathpfthld = net->pf_threshold; + thlds->spt_pathcpthld = 0xffff; } else { thlds->spt_pathmaxrxt = stcb->asoc.def_net_failure; thlds->spt_pathpfthld = stcb->asoc.def_net_pf_threshold; + thlds->spt_pathcpthld = 0xffff; } thlds->spt_assoc_id = sctp_get_associd(stcb); SCTP_TCB_UNLOCK(stcb); @@ -3200,6 +3297,7 @@ flags_out: SCTP_INP_RLOCK(inp); thlds->spt_pathmaxrxt = inp->sctp_ep.def_net_failure; thlds->spt_pathpfthld = inp->sctp_ep.def_net_pf_threshold; + thlds->spt_pathcpthld = 0xffff; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3215,12 +3313,35 @@ flags_out: { struct sctp_udpencaps *encaps; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, *optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); +#if defined(INET) && defined(INET6) + if (encaps->sue_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&encaps->sue_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&encaps->sue_address; + } + } else { + addr = (struct sockaddr *)&encaps->sue_address; + } +#else + addr = (struct sockaddr *)&encaps->sue_address; +#endif if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&encaps->sue_address); + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -3231,21 +3352,18 @@ flags_out: */ net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&encaps->sue_address, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&encaps->sue_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); 
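
[Editorial note: the sockopt handlers in these hunks (spp_address, spinfo_address, spt_address, sue_address) all gain the same preamble: a v4-mapped IPv6 address from the caller is rewritten into a plain sockaddr_in, via in6_sin6_2_sin() into the on-stack sin_store, before sctp_findnet() runs, so lookups compare like with like. A hedged standalone sketch of that normalization, not the kernel's helper, just the same idea:]

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Sketch: if the caller passed ::ffff:a.b.c.d, return an AF_INET view
 * of it built in 'scratch'; otherwise return the address unchanged.
 */
static const struct sockaddr *
normalize_v4mapped(const struct sockaddr_storage *ss,
    struct sockaddr_in *scratch)
{
	if (ss->ss_family == AF_INET6) {
		const struct sockaddr_in6 *sin6;

		sin6 = (const struct sockaddr_in6 *)ss;
		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			memset(scratch, 0, sizeof(*scratch));
			scratch->sin_family = AF_INET;
			scratch->sin_len = sizeof(*scratch); /* BSD-only field */
			scratch->sin_port = sin6->sin6_port;
			/* The low 32 bits of a mapped address are the v4 address. */
			memcpy(&scratch->sin_addr,
			    &sin6->sin6_addr.s6_addr[12], 4);
			return ((const struct sockaddr *)scratch);
		}
	}
	return ((const struct sockaddr *)ss);
}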
SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); @@ -3254,10 +3372,10 @@ flags_out: } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { error = EINVAL; SCTP_TCB_UNLOCK(stcb); @@ -3273,7 +3391,7 @@ flags_out: break; } } - if (stcb) { + if (stcb != NULL) { if (net) { encaps->sue_port = net->port; } else { @@ -3297,7 +3415,7 @@ flags_out: } break; } - case SCTP_ENABLE_STREAM_RESET: + case SCTP_ECN_SUPPORTED: { struct sctp_assoc_value *av; @@ -3305,14 +3423,14 @@ flags_out: SCTP_FIND_STCB(inp, stcb, av->assoc_id); if (stcb) { - av->assoc_value = (uint32_t) stcb->asoc.local_strreset_support; + av->assoc_value = stcb->asoc.ecn_supported; SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (av->assoc_id == SCTP_FUTURE_ASSOC)) { SCTP_INP_RLOCK(inp); - av->assoc_value = (uint32_t) inp->local_strreset_support; + av->assoc_value = inp->ecn_supported; SCTP_INP_RUNLOCK(inp); } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -3324,98 +3442,381 @@ flags_out: } break; } - default: - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); - error = ENOPROTOOPT; - break; - } /* end switch (sopt->sopt_name) */ - if (error) { - *optsize = 0; - } - return (error); -} - -static int -sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, - void *p) -{ - int error, set_opt; - uint32_t *mopt; - struct sctp_tcb *stcb = NULL; - struct sctp_inpcb *inp = NULL; - uint32_t vrf_id; + case SCTP_PR_SUPPORTED: + { + struct sctp_assoc_value *av; - if (optval == NULL) { - SCTP_PRINTF("optval is NULL\n"); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - return (EINVAL); - } - inp = (struct sctp_inpcb *)so->so_pcb; - if (inp == NULL) { - SCTP_PRINTF("inp is NULL?\n"); - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - return (EINVAL); - } - vrf_id = inp->def_vrf_id; + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); - error = 0; - switch (optname) { - case SCTP_NODELAY: - case SCTP_AUTOCLOSE: - case SCTP_AUTO_ASCONF: - case SCTP_EXPLICIT_EOR: - case SCTP_DISABLE_FRAGMENTS: - case SCTP_USE_EXT_RCVINFO: - case SCTP_I_WANT_MAPPED_V4_ADDR: - /* copy in the option value */ - SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); - set_opt = 0; - if (error) - break; - switch (optname) { - case SCTP_DISABLE_FRAGMENTS: - set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT; - break; - case SCTP_AUTO_ASCONF: - /* - * NOTE: we don't really support this flag - */ - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { - /* only valid for bound all sockets */ - if ((SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) && - (*mopt != 0)) { - /* forbidden by admin */ - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPERM); - return (EPERM); - } - set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF; + if (stcb) { + av->assoc_value = stcb->asoc.prsctp_supported; + SCTP_TCB_UNLOCK(stcb); } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - return (EINVAL); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->prsctp_supported; + SCTP_INP_RUNLOCK(inp); + 
} else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } } - break; - case SCTP_EXPLICIT_EOR: - set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR; - break; - case SCTP_USE_EXT_RCVINFO: - set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO; - break; - case SCTP_I_WANT_MAPPED_V4_ADDR: - if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4; - } else { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - return (EINVAL); + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); } break; - case SCTP_NODELAY: - set_opt = SCTP_PCB_FLAGS_NODELAY; - break; - case SCTP_AUTOCLOSE: - if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || - (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); - return (EINVAL); + } + case SCTP_AUTH_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.auth_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->auth_supported; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_ASCONF_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.asconf_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->asconf_supported; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_RECONFIG_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.reconfig_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->reconfig_supported; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_NRSACK_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.nrsack_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->nrsack_supported; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, 
SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_PKTDROP_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.pktdrop_supported; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->pktdrop_supported; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_ENABLE_STREAM_RESET: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = (uint32_t) stcb->asoc.local_strreset_support; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = (uint32_t) inp->local_strreset_support; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + case SCTP_PR_STREAM_STATUS: + { + struct sctp_prstatus *sprstat; + uint16_t sid; + uint16_t policy; + + SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); + SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); + + sid = sprstat->sprstat_sid; + policy = sprstat->sprstat_policy; +#if defined(SCTP_DETAILED_STR_STATS) + if ((stcb != NULL) && + (sid < stcb->asoc.streamoutcnt) && + (policy != SCTP_PR_SCTP_NONE) && + ((policy <= SCTP_PR_SCTP_MAX) || + (policy == SCTP_PR_SCTP_ALL))) { + if (policy == SCTP_PR_SCTP_ALL) { + sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; + sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; + } else { + sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[policy]; + sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[policy]; + } +#else + if ((stcb != NULL) && + (sid < stcb->asoc.streamoutcnt) && + (policy == SCTP_PR_SCTP_ALL)) { + sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0]; + sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0]; +#endif + SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_prstatus); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + break; + } + case SCTP_PR_ASSOC_STATUS: + { + struct sctp_prstatus *sprstat; + uint16_t policy; + + SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize); + SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id); + + policy = sprstat->sprstat_policy; + if ((stcb != NULL) && + (policy != SCTP_PR_SCTP_NONE) && + ((policy <= SCTP_PR_SCTP_MAX) || + (policy == SCTP_PR_SCTP_ALL))) { + if (policy == SCTP_PR_SCTP_ALL) { + sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[0]; + sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[0]; + } else { + sprstat->sprstat_abandoned_unsent = 
stcb->asoc.abandoned_unsent[policy]; + sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[policy]; + } + SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(struct sctp_prstatus); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + break; + } + case SCTP_MAX_CWND: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.max_cwnd; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->max_cwnd; + SCTP_INP_RUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + if (error == 0) { + *optsize = sizeof(struct sctp_assoc_value); + } + break; + } + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); + error = ENOPROTOOPT; + break; + } /* end switch (sopt->sopt_name) */ + if (error) { + *optsize = 0; + } + return (error); +} + +static int +sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, + void *p) +{ + int error, set_opt; + uint32_t *mopt; + struct sctp_tcb *stcb = NULL; + struct sctp_inpcb *inp = NULL; + uint32_t vrf_id; + + if (optval == NULL) { + SCTP_PRINTF("optval is NULL\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_PRINTF("inp is NULL?\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + vrf_id = inp->def_vrf_id; + + error = 0; + switch (optname) { + case SCTP_NODELAY: + case SCTP_AUTOCLOSE: + case SCTP_AUTO_ASCONF: + case SCTP_EXPLICIT_EOR: + case SCTP_DISABLE_FRAGMENTS: + case SCTP_USE_EXT_RCVINFO: + case SCTP_I_WANT_MAPPED_V4_ADDR: + /* copy in the option value */ + SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); + set_opt = 0; + if (error) + break; + switch (optname) { + case SCTP_DISABLE_FRAGMENTS: + set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT; + break; + case SCTP_AUTO_ASCONF: + /* + * NOTE: we don't really support this flag + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* only valid for bound all sockets */ + if ((SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) && + (*mopt != 0)) { + /* forbidden by admin */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPERM); + return (EPERM); + } + set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + break; + case SCTP_EXPLICIT_EOR: + set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR; + break; + case SCTP_USE_EXT_RCVINFO: + set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO; + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + break; + case SCTP_NODELAY: + set_opt = SCTP_PCB_FLAGS_NODELAY; + break; + case SCTP_AUTOCLOSE: + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); } set_opt = SCTP_PCB_FLAGS_AUTOCLOSE; /* @@ -3487,6 +3888,47 @@ sctp_setopt(struct socket *so, int optname, void 
*optval, size_t optsize, } break; } + case SCTP_INTERLEAVING_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->idata_supported = 0; + } else { + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) && + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS))) { + inp->idata_supported = 1; + } else { + /* + * Must have Frag + * interleave and + * stream interleave + * on + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } case SCTP_CMT_ON_OFF: if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) { struct sctp_assoc_value *av; @@ -3684,7 +4126,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); - } else { /* * Can't set stream value without @@ -3959,12 +4400,13 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, uint32_t i; SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); - if (optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) { + if ((optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) || + (shmac->shmac_number_of_idents > 0xffff)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; } - hmaclist = sctp_alloc_hmaclist(shmac->shmac_number_of_idents); + hmaclist = sctp_alloc_hmaclist((uint16_t) shmac->shmac_number_of_idents); if (hmaclist == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); error = ENOMEM; @@ -4172,7 +4614,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = ENOENT; break; } - if (stcb->asoc.peer_supports_strreset == 0) { + if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. 
*/ @@ -4181,18 +4623,30 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } - if (stcb->asoc.stream_reset_outstanding) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); - error = EALREADY; + if (sizeof(struct sctp_reset_streams) + + strrst->srs_number_streams * sizeof(uint16_t) > optsize) { + error = EINVAL; SCTP_TCB_UNLOCK(stcb); break; } if (strrst->srs_flags & SCTP_STREAM_RESET_INCOMING) { send_in = 1; + if (stcb->asoc.stream_reset_outstanding) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); + error = EALREADY; + SCTP_TCB_UNLOCK(stcb); + break; + } } if (strrst->srs_flags & SCTP_STREAM_RESET_OUTGOING) { send_out = 1; } + if ((strrst->srs_number_streams > SCTP_MAX_STREAMS_AT_ONCE_RESET) && send_in) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_TCB_UNLOCK(stcb); + break; + } if ((send_in == 0) && (send_out == 0)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; @@ -4217,11 +4671,46 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } - error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams, - strrst->srs_stream_list, - send_out, send_in, 0, 0, 0, 0, 0); - - sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); + if (send_out) { + int cnt; + uint16_t strm; + + if (strrst->srs_number_streams) { + for (i = 0, cnt = 0; i < strrst->srs_number_streams; i++) { + strm = strrst->srs_stream_list[i]; + if (stcb->asoc.strmout[strm].state == SCTP_STREAM_OPEN) { + stcb->asoc.strmout[strm].state = SCTP_STREAM_RESET_PENDING; + cnt++; + } + } + } else { + /* Its all */ + for (i = 0, cnt = 0; i < stcb->asoc.streamoutcnt; i++) { + if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) { + stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; + cnt++; + } + } + } + } + if (send_in) { + error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams, + strrst->srs_stream_list, + send_in, 0, 0, 0, 0, 0); + } else { + error = sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_LOCKED); + } + if (error == 0) { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); + } else { + /* + * For outgoing streams don't report any + * problems in sending the request to the + * application. XXX: Double check resetting + * incoming streams. + */ + error = 0; + } SCTP_TCB_UNLOCK(stcb); break; } @@ -4239,7 +4728,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = ENOENT; break; } - if (stcb->asoc.peer_supports_strreset == 0) { + if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. 
*/ @@ -4291,7 +4780,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, goto skip_stuff; } } - error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0); + error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); skip_stuff: SCTP_TCB_UNLOCK(stcb); @@ -4299,6 +4788,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } case SCTP_RESET_ASSOC: { + int i; uint32_t *value; SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize); @@ -4308,7 +4798,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, error = ENOENT; break; } - if (stcb->asoc.peer_supports_strreset == 0) { + if (stcb->asoc.reconfig_supported == 0) { /* * Peer does not support the chunk type. */ @@ -4323,7 +4813,25 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_TCB_UNLOCK(stcb); break; } - error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, 0, 0, 0, 0); + /* + * Is there any data pending in the send or sent + * queues? + */ + if (!TAILQ_EMPTY(&stcb->asoc.send_queue) || + !TAILQ_EMPTY(&stcb->asoc.sent_queue)) { + busy_out: + error = EBUSY; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + SCTP_TCB_UNLOCK(stcb); + break; + } + /* Do any streams have data queued? */ + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { + goto busy_out; + } + } + error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 1, 0, 0, 0, 0); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); SCTP_TCB_UNLOCK(stcb); break; @@ -4347,7 +4855,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, case SCTP_CONNECT_X_COMPLETE: { struct sockaddr *sa; - struct sctp_nets *net; /* FIXME MT: check correct? */ SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize); @@ -4358,7 +4865,6 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); - net = sctp_findnet(stcb, sa); } SCTP_INP_RUNLOCK(inp); } else { @@ -4370,7 +4876,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * TCB.. aka NULL. 
*/ SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, sa, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } @@ -4386,7 +4892,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9); + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); } else { /* @@ -4690,12 +5196,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_paddrparams *paddrp; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize); SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address); + +#if defined(INET) && defined(INET6) + if (paddrp->spp_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&paddrp->spp_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&paddrp->spp_address; + } + } else { + addr = (struct sockaddr *)&paddrp->spp_address; + } +#else + addr = (struct sockaddr *)&paddrp->spp_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -4704,25 +5233,22 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * the locked tcb (last argument) is NOT a * TCB.. aka NULL. */ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, - (struct sockaddr *)&paddrp->spp_address, + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&paddrp->spp_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; @@ -4731,10 +5257,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); @@ -4763,28 +5289,15 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (EINVAL); } - if (stcb) { + if (stcb != NULL) { /************************TCB SPECIFIC SET ******************/ - /* - * do we change the timer for HB, we run - * only one? - */ - int ovh = 0; - - if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { - ovh = SCTP_MED_OVERHEAD; - } else { - ovh = SCTP_MED_V4_OVERHEAD; - } - - /* network sets ? 
*/ - if (net) { + if (net != NULL) { /************************NET SPECIFIC SET ******************/ if (paddrp->spp_flags & SPP_HB_DISABLE) { if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) && !(net->dest_state & SCTP_ADDR_NOHB)) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9); } net->dest_state |= SCTP_ADDR_NOHB; } @@ -4808,10 +5321,24 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu + ovh; + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { +#ifdef INET + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; +#endif +#ifdef INET6 + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; +#endif + default: + break; + } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } @@ -4832,7 +5359,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_12); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } @@ -4863,7 +5392,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, #endif } else { /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ - if (paddrp->spp_pathmaxrxt) { + if (paddrp->spp_pathmaxrxt != 0) { stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->dest_state & SCTP_ADDR_PF) { @@ -4875,7 +5404,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, (net->error_count > net->pf_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_13); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } @@ -4894,14 +5425,14 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } } if (paddrp->spp_flags & SPP_HB_ENABLE) { - if (paddrp->spp_hbinterval) { + if (paddrp->spp_hbinterval != 0) { stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { stcb->asoc.heart_beat_delay = 0; } /* Turn back on the timer */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { - if (paddrp->spp_hbinterval) { + if (paddrp->spp_hbinterval != 0) { net->heart_beat_delay = paddrp->spp_hbinterval; } else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) { net->heart_beat_delay = 0; @@ -4910,7 +5441,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, net->dest_state &= ~SCTP_ADDR_NOHB; } sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_14); 
sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); } sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); @@ -4920,7 +5451,9 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, if (!(net->dest_state & SCTP_ADDR_NOHB)) { net->dest_state |= SCTP_ADDR_NOHB; if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + inp, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_15); } } } @@ -4930,10 +5463,24 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, - SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16); } net->dest_state |= SCTP_ADDR_NO_PMTUD; - net->mtu = paddrp->spp_pathmtu + ovh; + net->mtu = paddrp->spp_pathmtu; + switch (net->ro._l_addr.sa.sa_family) { +#ifdef INET + case AF_INET: + net->mtu += SCTP_MIN_V4_OVERHEAD; + break; +#endif +#ifdef INET6 + case AF_INET6: + net->mtu += SCTP_MIN_OVERHEAD; + break; +#endif + default: + break; + } if (net->mtu < stcb->asoc.smallest_mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } @@ -4982,12 +5529,12 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * set it with the options on the * socket */ - if (paddrp->spp_pathmaxrxt) { + if (paddrp->spp_pathmaxrxt != 0) { inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; } if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; - else if (paddrp->spp_hbinterval) { + else if (paddrp->spp_hbinterval != 0) { if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); @@ -5153,13 +5700,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_setprim *spa; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&spa->ssp_addr); +#if defined(INET) && defined(INET6) + if (spa->ssp_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&spa->ssp_addr; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&spa->ssp_addr; + } + } else { + addr = (struct sockaddr *)&spa->ssp_addr; + } +#else + addr = (struct sockaddr *)&spa->ssp_addr; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -5168,33 +5737,40 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
*/ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, - (struct sockaddr *)&spa->ssp_addr, + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if ((stcb) && (net)) { - if ((net != stcb->asoc.primary_destination) && - (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) { - /* Ok we need to set it */ - if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { - if ((stcb->asoc.alternate) && - (!(net->dest_state & SCTP_ADDR_PF)) && - (net->dest_state & SCTP_ADDR_REACHABLE)) { - sctp_free_remote_addr(stcb->asoc.alternate); - stcb->asoc.alternate = NULL; + if ((stcb != NULL) && (net != NULL)) { + if (net != stcb->asoc.primary_destination) { + if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { + /* Ok we need to set it */ + if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { + if ((stcb->asoc.alternate) && + (!(net->dest_state & SCTP_ADDR_PF)) && + (net->dest_state & SCTP_ADDR_REACHABLE)) { + sctp_free_remote_addr(stcb->asoc.alternate); + stcb->asoc.alternate = NULL; + } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; } } } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } - if (stcb) { + if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } break; @@ -5216,14 +5792,36 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, case SCTP_SET_PEER_PRIMARY_ADDR: { struct sctp_setpeerprim *sspp; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize); SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id); if (stcb != NULL) { struct sctp_ifa *ifa; - ifa = sctp_find_ifa_by_addr((struct sockaddr *)&sspp->sspp_addr, - stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); +#if defined(INET) && defined(INET6) + if (sspp->sspp_addr.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&sspp->sspp_addr; + } + } else { + addr = (struct sockaddr *)&sspp->sspp_addr; + } +#else + addr = (struct sockaddr *)&sspp->sspp_addr; +#endif + ifa = sctp_find_ifa_by_addr(addr, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; @@ -5240,7 +5838,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", - __FUNCTION__); + __func__); + continue; + } + if ((sctp_is_addr_restricted(stcb, laddr->ifa)) && + (!sctp_is_addr_pending(stcb, laddr->ifa))) { continue; } if (laddr->ifa == ifa) { @@ -5254,13 +5856,13 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, goto out_of_it; } } else { - switch (sspp->sspp_addr.ss_family) { + switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sspp->sspp_addr; + sin = (struct sockaddr_in *)addr; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ 
-5275,7 +5877,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr; + sin6 = (struct sockaddr_in6 *)addr; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); @@ -5291,11 +5893,11 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, goto out_of_it; } } - if (sctp_set_primary_ip_address_sa(stcb, - (struct sockaddr *)&sspp->sspp_addr) != 0) { + if (sctp_set_primary_ip_address_sa(stcb, addr) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; } + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED); out_of_it: SCTP_TCB_UNLOCK(stcb); } else { @@ -5602,7 +6204,7 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize); SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id); - if (PR_SCTP_INVALID_POLICY(info->pr_policy)) { + if (info->pr_policy > SCTP_PR_SCTP_MAX) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); error = EINVAL; break; @@ -5643,12 +6245,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_paddrthlds *thlds; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, optsize); SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id); - net = NULL; - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&thlds->spt_address); + +#if defined(INET) && defined(INET6) + if (thlds->spt_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&thlds->spt_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&thlds->spt_address; + } + } else { + addr = (struct sockaddr *)&thlds->spt_address; + } +#else + addr = (struct sockaddr *)&thlds->spt_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -5657,25 +6282,22 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, * the locked tcb (last argument) is NOT a * TCB.. aka NULL. 
*/ + net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, - (struct sockaddr *)&thlds->spt_address, + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&thlds->spt_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; @@ -5684,10 +6306,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); @@ -5703,68 +6325,78 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, break; } } - if (stcb) { - if (net) { + if (thlds->spt_pathcpthld != 0xffff) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } + if (stcb != NULL) { + if (net != NULL) { + net->failure_threshold = thlds->spt_pathmaxrxt; + net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { - if ((net->failure_threshold > thlds->spt_pathmaxrxt) || - (net->failure_threshold <= thlds->spt_pathpfthld)) { + if ((net->error_count > net->failure_threshold) || + (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { - if ((net->failure_threshold > thlds->spt_pathpfthld) && - (net->failure_threshold <= thlds->spt_pathmaxrxt)) { + if ((net->error_count > net->pf_threshold) && + (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_17); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { - if (net->failure_threshold > thlds->spt_pathmaxrxt) { + if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { - if (net->failure_threshold <= thlds->spt_pathmaxrxt) { + if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } - } - net->failure_threshold = thlds->spt_pathmaxrxt; - net->pf_threshold = thlds->spt_pathpfthld; + } } else { TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + net->failure_threshold = thlds->spt_pathmaxrxt; + net->pf_threshold = thlds->spt_pathpfthld; if (net->dest_state & SCTP_ADDR_PF) { - if ((net->failure_threshold > thlds->spt_pathmaxrxt) || - (net->failure_threshold <= thlds->spt_pathpfthld)) { + if ((net->error_count > net->failure_threshold) || + (net->error_count <= net->pf_threshold)) { net->dest_state &= ~SCTP_ADDR_PF; } } else { - if 
((net->failure_threshold > thlds->spt_pathpfthld) && - (net->failure_threshold <= thlds->spt_pathmaxrxt)) { + if ((net->error_count > net->pf_threshold) && + (net->error_count <= net->failure_threshold)) { net->dest_state |= SCTP_ADDR_PF; sctp_send_hb(stcb, net, SCTP_SO_LOCKED); - sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_TIMER + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_18); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); } } if (net->dest_state & SCTP_ADDR_REACHABLE) { - if (net->failure_threshold > thlds->spt_pathmaxrxt) { + if (net->error_count > net->failure_threshold) { net->dest_state &= ~SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED); } } else { - if (net->failure_threshold <= thlds->spt_pathmaxrxt) { + if (net->error_count <= net->failure_threshold) { net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED); } } - net->failure_threshold = thlds->spt_pathmaxrxt; - net->pf_threshold = thlds->spt_pathpfthld; } stcb->asoc.def_net_failure = thlds->spt_pathmaxrxt; stcb->asoc.def_net_pf_threshold = thlds->spt_pathpfthld; } + SCTP_TCB_UNLOCK(stcb); } else { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || @@ -5784,11 +6416,35 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, { struct sctp_udpencaps *encaps; struct sctp_nets *net; + struct sockaddr *addr; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin_store; + +#endif SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, optsize); SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id); - if (stcb) { - net = sctp_findnet(stcb, (struct sockaddr *)&encaps->sue_address); + +#if defined(INET) && defined(INET6) + if (encaps->sue_address.ss_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&encaps->sue_address; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin_store, sin6); + addr = (struct sockaddr *)&sin_store; + } else { + addr = (struct sockaddr *)&encaps->sue_address; + } + } else { + addr = (struct sockaddr *)&encaps->sue_address; + } +#else + addr = (struct sockaddr *)&encaps->sue_address; +#endif + if (stcb != NULL) { + net = sctp_findnet(stcb, addr); } else { /* * We increment here since @@ -5799,22 +6455,19 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, */ net = NULL; SCTP_INP_INCR_REF(inp); - stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&encaps->sue_address, &net, NULL, NULL); + stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_DECR_REF(inp); } } - if (stcb && (net == NULL)) { - struct sockaddr *sa; - - sa = (struct sockaddr *)&encaps->sue_address; + if ((stcb != NULL) && (net == NULL)) { #ifdef INET - if (sa->sa_family == AF_INET) { + if (addr->sa_family == AF_INET) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)sa; - if (sin->sin_addr.s_addr) { + sin = (struct sockaddr_in *)addr; + if (sin->sin_addr.s_addr != INADDR_ANY) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); error = EINVAL; @@ -5823,10 +6476,10 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } else #endif #ifdef INET6 - if (sa->sa_family == AF_INET6) { + if (addr->sa_family == AF_INET6) { struct sockaddr_in6 
*sin6; - sin6 = (struct sockaddr_in6 *)sa; + sin6 = (struct sockaddr_in6 *)addr; if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); SCTP_TCB_UNLOCK(stcb); @@ -5842,8 +6495,8 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, break; } } - if (stcb) { - if (net) { + if (stcb != NULL) { + if (net != NULL) { net->port = encaps->sue_port; } else { stcb->asoc.port = encaps->sue_port; @@ -5863,6 +6516,273 @@ sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, } break; } + case SCTP_ECN_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->ecn_supported = 0; + } else { + inp->ecn_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_PR_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->prsctp_supported = 0; + } else { + inp->prsctp_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_AUTH_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + if ((av->assoc_value == 0) && + (inp->asconf_supported == 1)) { + /* + * AUTH is required for + * ASCONF + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->auth_supported = 0; + } else { + inp->auth_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_ASCONF_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + if ((av->assoc_value != 0) && + (inp->auth_supported == 0)) { + /* + * AUTH is required for + * 
ASCONF + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } else { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->asconf_supported = 0; + sctp_auth_delete_chunk(SCTP_ASCONF, + inp->sctp_ep.local_auth_chunks); + sctp_auth_delete_chunk(SCTP_ASCONF_ACK, + inp->sctp_ep.local_auth_chunks); + } else { + inp->asconf_supported = 1; + sctp_auth_add_chunk(SCTP_ASCONF, + inp->sctp_ep.local_auth_chunks); + sctp_auth_add_chunk(SCTP_ASCONF_ACK, + inp->sctp_ep.local_auth_chunks); + } + SCTP_INP_WUNLOCK(inp); + } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_RECONFIG_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->reconfig_supported = 0; + } else { + inp->reconfig_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_NRSACK_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->nrsack_supported = 0; + } else { + inp->nrsack_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_PKTDROP_SUPPORTED: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + if (av->assoc_value == 0) { + inp->pktdrop_supported = 0; + } else { + inp->pktdrop_supported = 1; + } + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } + case SCTP_MAX_CWND: + { + struct sctp_assoc_value *av; + struct sctp_nets *net; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + stcb->asoc.max_cwnd = av->assoc_value; + if (stcb->asoc.max_cwnd > 0) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if ((net->cwnd > stcb->asoc.max_cwnd) && + (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) { + net->cwnd = stcb->asoc.max_cwnd; + if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) { + net->cwnd = net->mtu - sizeof(struct sctphdr); + } + } + } + } + SCTP_TCB_UNLOCK(stcb); + } else { + if ((inp->sctp_flags & 
SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + (av->assoc_id == SCTP_FUTURE_ASSOC)) { + SCTP_INP_WLOCK(inp); + inp->max_cwnd = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + } + break; + } default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); error = ENOPROTOOPT; @@ -5878,7 +6798,20 @@ sctp_ctloutput(struct socket *so, struct sockopt *sopt) size_t optsize = 0; void *p; int error = 0; + struct sctp_inpcb *inp; + if ((sopt->sopt_level == SOL_SOCKET) && + (sopt->sopt_name == SO_SETFIB)) { + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); + return (EINVAL); + } + SCTP_INP_WLOCK(inp); + inp->fibnum = so->so_fibnum; + SCTP_INP_WUNLOCK(inp); + return (0); + } if (sopt->sopt_level != IPPROTO_SCTP) { /* wrong proto level... send back up to IP */ #ifdef INET6 @@ -6052,7 +6985,9 @@ sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) } vrf_id = inp->def_vrf_id; /* We are GOOD to go */ - stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, p); + stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, + inp->sctp_ep.pre_open_stream_count, + inp->sctp_ep.port, p); if (stcb == NULL) { /* Gak! no memory */ goto out_now; @@ -6182,7 +7117,7 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) SCTP_INP_DECR_REF(tinp); return (EADDRINUSE); } else if (tinp) { - SCTP_INP_DECR_REF(inp); + SCTP_INP_DECR_REF(tinp); } } } @@ -6194,8 +7129,8 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) #endif SOCK_LOCK(so); error = solisten_proto_check(so); + SOCK_UNLOCK(so); if (error) { - SOCK_UNLOCK(so); SCTP_INP_RUNLOCK(inp); return (error); } @@ -6208,28 +7143,27 @@ sctp_listen(struct socket *so, int backlog, struct thread *p) * move the guy that was listener to the TCP Pool. */ if (sctp_swap_inpcb_for_listen(inp)) { - goto in_use; + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); + return (EADDRINUSE); } } if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { /* We are already connected AND the TCP model */ -in_use: SCTP_INP_RUNLOCK(inp); - SOCK_UNLOCK(so); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); return (EADDRINUSE); } SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* We must do a bind. */ - SOCK_UNLOCK(so); if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) { /* bind error, probably perm */ return (error); } - SOCK_LOCK(so); } + SOCK_LOCK(so); /* It appears for 7.0 and on, we must always call this. 
*/ solisten_proto(so, backlog); if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { @@ -6357,7 +7291,8 @@ sctp_accept(struct socket *so, struct sockaddr **addr) } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_LOCK(stcb); - sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); + sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19); } return (0); } @@ -6442,7 +7377,7 @@ sctp_ingetaddr(struct socket *so, struct sockaddr **addr) if (laddr->ifa->address.sa.sa_family == AF_INET) { struct sockaddr_in *sin_a; - sin_a = (struct sockaddr_in *)&laddr->ifa->address.sa; + sin_a = &laddr->ifa->address.sin; sin->sin_addr = sin_a->sin_addr; fnd = 1; break; diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h index d88a2376..a4d2b998 100644 --- a/freebsd/sys/netinet/sctp_var.h +++ b/freebsd/sys/netinet/sctp_var.h @@ -72,7 +72,7 @@ extern struct pr_usrreqs sctp_usrreqs; ((stcb->asoc.sctp_features & feature) == 0)) || \ ((stcb == NULL) && (inp != NULL) && \ ((inp->sctp_features & feature) == 0)) || \ - ((stcb == NULL) && (inp == NULL))) + ((stcb == NULL) && (inp == NULL))) /* managing mobility_feature in inpcb (by micchie) */ #define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature) @@ -86,7 +86,7 @@ extern struct pr_usrreqs sctp_usrreqs; #define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0)) -#define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0) +#define sctp_sbspace_sub(a,b) (((a) > (b)) ? ((a) - (b)) : 0) /* * I tried to cache the readq entries at one point. But the reality @@ -97,16 +97,24 @@ extern struct pr_usrreqs sctp_usrreqs; * an mbuf cache as well so it is not really worth doing, at least * right now :-D */ - +#ifdef INVARIANTS #define sctp_free_a_readq(_stcb, _readq) { \ + if ((_readq)->on_strm_q) \ + panic("On strm q stcb:%p readq:%p", (_stcb), (_readq)); \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \ SCTP_DECR_READQ_COUNT(); \ } +#else +#define sctp_free_a_readq(_stcb, _readq) { \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \ + SCTP_DECR_READQ_COUNT(); \ +} +#endif #define sctp_alloc_a_readq(_stcb, _readq) { \ (_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \ if ((_readq)) { \ - SCTP_INCR_READQ_COUNT(); \ + SCTP_INCR_READQ_COUNT(); \ } \ } @@ -121,11 +129,11 @@ extern struct pr_usrreqs sctp_usrreqs; #define sctp_alloc_a_strmoq(_stcb, _strmoq) { \ (_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \ - if ((_strmoq)) { \ + if ((_strmoq)) { \ memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \ SCTP_INCR_STRMOQ_COUNT(); \ (_strmoq)->holds_key_ref = 0; \ - } \ + } \ } #define sctp_free_a_chunk(_stcb, _chk, _so_locked) { \ @@ -133,22 +141,22 @@ extern struct pr_usrreqs sctp_usrreqs; sctp_auth_key_release((_stcb), (_chk)->auth_keyid, _so_locked); \ (_chk)->holds_key_ref = 0; \ } \ - if (_stcb) { \ - SCTP_TCB_LOCK_ASSERT((_stcb)); \ - if ((_chk)->whoTo) { \ - sctp_free_remote_addr((_chk)->whoTo); \ - (_chk)->whoTo = NULL; \ - } \ - if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \ - (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \ - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ - SCTP_DECR_CHK_COUNT(); \ - } else { \ - TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ - 
(_stcb)->asoc.free_chunk_cnt++; \ - atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ - } \ - } else { \ + if (_stcb) { \ + SCTP_TCB_LOCK_ASSERT((_stcb)); \ + if ((_chk)->whoTo) { \ + sctp_free_remote_addr((_chk)->whoTo); \ + (_chk)->whoTo = NULL; \ + } \ + if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \ + (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ + SCTP_DECR_CHK_COUNT(); \ + } else { \ + TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ + (_stcb)->asoc.free_chunk_cnt++; \ + atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ + } \ + } else { \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ SCTP_DECR_CHK_COUNT(); \ } \ @@ -159,7 +167,7 @@ extern struct pr_usrreqs sctp_usrreqs; (_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \ if ((_chk)) { \ SCTP_INCR_CHK_COUNT(); \ - (_chk)->whoTo = NULL; \ + (_chk)->whoTo = NULL; \ (_chk)->holds_key_ref = 0; \ } \ } else { \ @@ -167,7 +175,7 @@ extern struct pr_usrreqs sctp_usrreqs; TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ (_chk)->holds_key_ref = 0; \ - SCTP_STAT_INCR(sctps_cached_chk); \ + SCTP_STAT_INCR(sctps_cached_chk); \ (_stcb)->asoc.free_chunk_cnt--; \ } \ } @@ -178,15 +186,16 @@ extern struct pr_usrreqs sctp_usrreqs; if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \ (void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \ (void)SCTP_OS_TIMER_STOP(&(__net)->pmtu_timer.timer); \ - if ((__net)->ro.ro_rt) { \ + (void)SCTP_OS_TIMER_STOP(&(__net)->hb_timer.timer); \ + if ((__net)->ro.ro_rt) { \ RTFREE((__net)->ro.ro_rt); \ (__net)->ro.ro_rt = NULL; \ - } \ + } \ if ((__net)->src_addr_selected) { \ sctp_free_ifa((__net)->ro._s_addr); \ (__net)->ro._s_addr = NULL; \ } \ - (__net)->src_addr_selected = 0; \ + (__net)->src_addr_selected = 0; \ (__net)->dest_state &= ~SCTP_ADDR_REACHABLE; \ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \ SCTP_DECR_RADDR_COUNT(); \ @@ -210,7 +219,7 @@ extern struct pr_usrreqs sctp_usrreqs; atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \ atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \ if (stcb) { \ - atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \ + atomic_add_int(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \ atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ } \ if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ @@ -250,12 +259,12 @@ extern struct pr_usrreqs sctp_usrreqs; } while (0) #define sctp_flight_size_increase(tp1) do { \ - (tp1)->whoTo->flight_size += (tp1)->book_size; \ + (tp1)->whoTo->flight_size += (tp1)->book_size; \ } while (0) #ifdef SCTP_FS_SPEC_LOG #define sctp_total_flight_decrease(stcb, tp1) do { \ - if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ + if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ @@ -264,7 +273,7 @@ extern struct pr_usrreqs sctp_usrreqs; stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \ stcb->asoc.fs_index++; \ - tp1->window_probe = 0; \ + tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ @@ -276,7 +285,7 @@ extern 
struct pr_usrreqs sctp_usrreqs; } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ - if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ + if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ stcb->asoc.fs_index = 0;\ stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ @@ -285,14 +294,14 @@ extern struct pr_usrreqs sctp_usrreqs; stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \ stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \ stcb->asoc.fs_index++; \ - (stcb)->asoc.total_flight_count++; \ - (stcb)->asoc.total_flight += (tp1)->book_size; \ + (stcb)->asoc.total_flight_count++; \ + (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #else #define sctp_total_flight_decrease(stcb, tp1) do { \ - tp1->window_probe = 0; \ + tp1->window_probe = 0; \ if (stcb->asoc.total_flight >= tp1->book_size) { \ stcb->asoc.total_flight -= tp1->book_size; \ if (stcb->asoc.total_flight_count > 0) \ @@ -304,8 +313,8 @@ extern struct pr_usrreqs sctp_usrreqs; } while (0) #define sctp_total_flight_increase(stcb, tp1) do { \ - (stcb)->asoc.total_flight_count++; \ - (stcb)->asoc.total_flight += (tp1)->book_size; \ + (stcb)->asoc.total_flight_count++; \ + (stcb)->asoc.total_flight += (tp1)->book_size; \ } while (0) #endif @@ -326,19 +335,17 @@ int sctp_ctloutput(struct socket *, struct sockopt *); #ifdef INET void sctp_input_with_port(struct mbuf *, int, uint16_t); -void sctp_input(struct mbuf *, int); +int sctp_input(struct mbuf **, int *, int); #endif void sctp_pathmtu_adjustment(struct sctp_tcb *, uint16_t); void sctp_drain(void); void sctp_init(void); -void sctp_finish(void); +void +sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, + uint8_t, uint8_t, uint16_t, uint16_t); int sctp_flush(struct socket *, int); int sctp_shutdown(struct socket *); -void -sctp_notify(struct sctp_inpcb *, struct ip *ip, struct sctphdr *, - struct sockaddr *, struct sctp_tcb *, - struct sctp_nets *); int sctp_bindx(struct socket *, int, struct sockaddr_storage *, int, int, struct proc *); diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c index 6cd82739..36a9c2ce 100644 --- a/freebsd/sys/netinet/sctputil.c +++ b/freebsd/sys/netinet/sctputil.c @@ -54,14 +54,17 @@ __FBSDID("$FreeBSD$"); #include #include #include +#ifdef INET6 +#include +#endif #ifndef KTR_SCTP #define KTR_SCTP KTR_SUBSYS #endif -extern struct sctp_cc_functions sctp_cc_functions[]; -extern struct sctp_ss_functions sctp_ss_functions[]; +extern const struct sctp_cc_functions sctp_cc_functions[]; +extern const struct sctp_ss_functions sctp_ss_functions[]; void sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr) @@ -219,6 +222,7 @@ sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int fr sctp_clog.x.misc.log4); } +#ifdef SCTP_MBUF_LOGGING void sctp_log_mb(struct mbuf *m, int from) { @@ -244,6 +248,18 @@ sctp_log_mb(struct mbuf *m, int from) sctp_clog.x.misc.log4); } +void +sctp_log_mbc(struct mbuf *m, int from) +{ + struct mbuf *mat; + + for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { + sctp_log_mb(mat, from); + } +} + +#endif + void sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from) { @@ -415,7 +431,8 @@ sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint3 sctp_clog.x.misc.log4); } -void +#ifdef SCTP_MBCNT_LOGGING +static void sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, 
uint32_t total_mbcnt_q, uint32_t mbcnt) { struct sctp_cwnd_log sctp_clog; @@ -433,6 +450,8 @@ sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mb sctp_clog.x.misc.log4); } +#endif + void sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d) { @@ -489,7 +508,7 @@ sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from) } void -sctp_log_block(uint8_t from, struct sctp_association *asoc, int sendlen) +sctp_log_block(uint8_t from, struct sctp_association *asoc, size_t sendlen) { struct sctp_cwnd_log sctp_clog; @@ -499,7 +518,7 @@ sctp_log_block(uint8_t from, struct sctp_association *asoc, int sendlen) sctp_clog.x.blk.stream_qcnt = (uint16_t) asoc->stream_queue_cnt; sctp_clog.x.blk.chunks_on_oque = (uint16_t) asoc->chunks_on_out_queue; sctp_clog.x.blk.flight_size = (uint16_t) (asoc->total_flight / 1024); - sctp_clog.x.blk.sndlen = sendlen; + sctp_clog.x.blk.sndlen = (uint32_t) sendlen; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_BLOCK, from, @@ -879,9 +898,52 @@ sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int ch return (x); } +int32_t +sctp_map_assoc_state(int kernel_state) +{ + int32_t user_state; + + if (kernel_state & SCTP_STATE_WAS_ABORTED) { + user_state = SCTP_CLOSED; + } else if (kernel_state & SCTP_STATE_SHUTDOWN_PENDING) { + user_state = SCTP_SHUTDOWN_PENDING; + } else { + switch (kernel_state & SCTP_STATE_MASK) { + case SCTP_STATE_EMPTY: + user_state = SCTP_CLOSED; + break; + case SCTP_STATE_INUSE: + user_state = SCTP_CLOSED; + break; + case SCTP_STATE_COOKIE_WAIT: + user_state = SCTP_COOKIE_WAIT; + break; + case SCTP_STATE_COOKIE_ECHOED: + user_state = SCTP_COOKIE_ECHOED; + break; + case SCTP_STATE_OPEN: + user_state = SCTP_ESTABLISHED; + break; + case SCTP_STATE_SHUTDOWN_SENT: + user_state = SCTP_SHUTDOWN_SENT; + break; + case SCTP_STATE_SHUTDOWN_RECEIVED: + user_state = SCTP_SHUTDOWN_RECEIVED; + break; + case SCTP_STATE_SHUTDOWN_ACK_SENT: + user_state = SCTP_SHUTDOWN_ACK_SENT; + break; + default: + user_state = SCTP_CLOSED; + break; + } + } + return (user_state); +} + int sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, - uint32_t override_tag, uint32_t vrf_id) + uint32_t override_tag, uint32_t vrf_id, uint16_t o_strms) { struct sctp_association *asoc; @@ -898,6 +960,11 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, */ int i; +#if defined(SCTP_DETAILED_STR_STATS) + int j; + +#endif + asoc = &stcb->asoc; /* init all variables to a known value. 
*/ SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_INUSE); @@ -906,12 +973,20 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asoc->heart_beat_delay = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); asoc->cookie_life = inp->sctp_ep.def_cookie_life; asoc->sctp_cmt_on_off = inp->sctp_cmt_on_off; - asoc->ecn_allowed = inp->sctp_ecn_enable; - asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off); + asoc->ecn_supported = inp->ecn_supported; + asoc->prsctp_supported = inp->prsctp_supported; + asoc->idata_supported = inp->idata_supported; + asoc->auth_supported = inp->auth_supported; + asoc->asconf_supported = inp->asconf_supported; + asoc->reconfig_supported = inp->reconfig_supported; + asoc->nrsack_supported = inp->nrsack_supported; + asoc->pktdrop_supported = inp->pktdrop_supported; + asoc->idata_supported = inp->idata_supported; asoc->sctp_cmt_pf = (uint8_t) 0; asoc->sctp_frag_point = inp->sctp_frag_point; asoc->sctp_features = inp->sctp_features; asoc->default_dscp = inp->sctp_ep.default_dscp; + asoc->max_cwnd = inp->max_cwnd; #ifdef INET6 if (inp->sctp_ep.default_flowlabel) { asoc->default_flowlabel = inp->sctp_ep.default_flowlabel; @@ -953,7 +1028,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, sctp_select_initial_TSN(&inp->sctp_ep); asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; /* we are optimisitic here */ - asoc->peer_supports_pktdrop = 1; asoc->peer_supports_nat = 0; asoc->sent_queue_retran_cnt = 0; @@ -1005,7 +1079,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asoc->minrto = inp->sctp_ep.sctp_minrto; asoc->maxrto = inp->sctp_ep.sctp_maxrto; - asoc->locked_on_sending = NULL; asoc->stream_locked_on = 0; asoc->ecn_echo_cnt_onq = 0; asoc->stream_locked = 0; @@ -1033,7 +1106,7 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * that we request by default. */ asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams = - inp->sctp_ep.pre_open_stream_count; + o_strms; SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *, asoc->streamoutcnt * sizeof(struct sctp_stream_out), SCTP_M_STRMO); @@ -1051,12 +1124,23 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, * that were dropped must be notified to the upper layer as * failed to send. 
*/ - asoc->strmout[i].next_sequence_send = 0x0; + asoc->strmout[i].next_mid_ordered = 0; + asoc->strmout[i].next_mid_unordered = 0; TAILQ_INIT(&asoc->strmout[i].outqueue); asoc->strmout[i].chunks_on_queues = 0; +#if defined(SCTP_DETAILED_STR_STATS) + for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { + asoc->strmout[i].abandoned_sent[j] = 0; + asoc->strmout[i].abandoned_unsent[j] = 0; + } +#else + asoc->strmout[i].abandoned_sent[0] = 0; + asoc->strmout[i].abandoned_unsent[0] = 0; +#endif asoc->strmout[i].stream_no = i; asoc->strmout[i].last_msg_incomplete = 0; - asoc->ss_functions.sctp_ss_init_stream(&asoc->strmout[i], NULL); + asoc->strmout[i].state = SCTP_STREAM_OPENING; + asoc->ss_functions.sctp_ss_init_stream(stcb, &asoc->strmout[i], NULL); } asoc->ss_functions.sctp_ss_init(stcb, asoc, 0); @@ -1086,7 +1170,6 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, TAILQ_INIT(&asoc->asconf_send_queue); TAILQ_INIT(&asoc->send_queue); TAILQ_INIT(&asoc->sent_queue); - TAILQ_INIT(&asoc->reasmqueue); TAILQ_INIT(&asoc->resetHead); asoc->max_inbound_streams = inp->sctp_ep.max_open_streams_intome; TAILQ_INIT(&asoc->asconf_queue); @@ -1109,6 +1192,10 @@ sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, asoc->timoshutdownack = 0; (void)SCTP_GETTIME_TIMEVAL(&asoc->start_time); asoc->discontinuity_time = asoc->start_time; + for (i = 0; i < SCTP_PR_SCTP_MAX + 1; i++) { + asoc->abandoned_unsent[i] = 0; + asoc->abandoned_sent[i] = 0; + } /* * sa_ignore MEMLEAK {memory is put in the assoc mapping array and * freed later when the association is freed. @@ -1195,6 +1282,7 @@ sctp_iterator_work(struct sctp_iterator *it) SCTP_INP_INFO_RLOCK(); SCTP_ITERATOR_LOCK(); + sctp_it_ctl.cur_it = it; if (it->inp) { SCTP_INP_RLOCK(it->inp); SCTP_INP_DECR_REF(it->inp); @@ -1202,6 +1290,7 @@ sctp_iterator_work(struct sctp_iterator *it) if (it->inp == NULL) { /* iterator is complete */ done_with_iterator: + sctp_it_ctl.cur_it = NULL; SCTP_ITERATOR_UNLOCK(); SCTP_INP_INFO_RUNLOCK(); if (it->function_atend != NULL) { @@ -1342,13 +1431,11 @@ sctp_iterator_worker(void) sctp_it_ctl.iterator_running = 1; TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { - sctp_it_ctl.cur_it = it; /* now lets work on this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); SCTP_IPI_ITERATOR_WQ_UNLOCK(); CURVNET_SET(it->vn); sctp_iterator_work(it); - sctp_it_ctl.cur_it = NULL; CURVNET_RESTORE(); SCTP_IPI_ITERATOR_WQ_LOCK(); /* sa_ignore FREED_MEMORY */ @@ -1389,7 +1476,9 @@ sctp_handle_addr_wq(void) if (asc->cnt == 0) { SCTP_FREE(asc, SCTP_M_ASC_IT); } else { - (void)sctp_initiate_iterator(sctp_asconf_iterator_ep, + int ret; + + ret = sctp_initiate_iterator(sctp_asconf_iterator_ep, sctp_asconf_iterator_stcb, NULL, /* No ep end for boundall */ SCTP_PCB_FLAGS_BOUNDALL, @@ -1397,6 +1486,23 @@ sctp_handle_addr_wq(void) SCTP_ASOC_ANY_STATE, (void *)asc, 0, sctp_asconf_iterator_end, NULL, 0); + if (ret) { + SCTP_PRINTF("Failed to initiate iterator for handle_addr_wq\n"); + /* + * Freeing if we are stopping or put back on the + * addr_wq. 
+ */ + if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { + sctp_asconf_iterator_end(asc, 0); + } else { + SCTP_WQ_ADDR_LOCK(); + LIST_FOREACH(wi, &asc->list_of_work, sctp_nxt_addr) { + LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); + } + SCTP_WQ_ADDR_UNLOCK(); + SCTP_FREE(asc, SCTP_M_ASC_IT); + } + } } } @@ -1407,12 +1513,14 @@ sctp_timeout_handler(void *t) struct sctp_tcb *stcb; struct sctp_nets *net; struct sctp_timer *tmr; + struct mbuf *op_err; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif - int did_output, type; + int did_output; + int type; tmr = (struct sctp_timer *)t; inp = (struct sctp_inpcb *)tmr->ep; @@ -1451,7 +1559,6 @@ sctp_timeout_handler(void *t) } /* if this is an iterator timeout, get the struct and clear inp */ tmr->stopped_from = 0xa003; - type = tmr->type; if (inp) { SCTP_INP_INCR_REF(inp); if ((inp->sctp_socket == NULL) && @@ -1482,8 +1589,9 @@ sctp_timeout_handler(void *t) return; } } + type = tmr->type; tmr->stopped_from = 0xa005; - SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", tmr->type); + SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", type); if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) { if (inp) { SCTP_INP_DECR_REF(inp); @@ -1499,7 +1607,7 @@ sctp_timeout_handler(void *t) if (stcb) { SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); - if ((tmr->type != SCTP_TIMER_TYPE_ASOCKILL) && + if ((type != SCTP_TIMER_TYPE_ASOCKILL) && ((stcb->asoc.state == 0) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) { SCTP_TCB_UNLOCK(stcb); @@ -1510,8 +1618,8 @@ sctp_timeout_handler(void *t) return; } } - /* record in stopped what t-o occured */ - tmr->stopped_from = tmr->type; + /* record in stopped what t-o occurred */ + tmr->stopped_from = type; /* mark as being serviced now */ if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { @@ -1529,7 +1637,7 @@ sctp_timeout_handler(void *t) SCTP_OS_TIMER_DEACTIVATE(&tmr->timer); /* call the handler for the appropriate timer type */ - switch (tmr->type) { + switch (type) { case SCTP_TIMER_TYPE_ZERO_COPY: if (inp == NULL) { break; @@ -1719,7 +1827,9 @@ sctp_timeout_handler(void *t) break; } SCTP_STAT_INCR(sctps_timoshutdownguard); - sctp_abort_an_association(inp, stcb, NULL, SCTP_SO_NOT_LOCKED); + op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), + "Shutdown guard timer expired"); + sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); /* no need to unlock on tcb its gone */ goto out_decr; @@ -1772,7 +1882,8 @@ sctp_timeout_handler(void *t) SCTP_STAT_INCR(sctps_timoassockill); /* Can we free it yet? 
*/ SCTP_INP_DECR_REF(inp); - sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_1); + sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, + SCTP_FROM_SCTPUTIL + SCTP_LOC_1); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) so = SCTP_INP_SO(inp); atomic_add_int(&stcb->asoc.refcnt, 1); @@ -1781,7 +1892,8 @@ sctp_timeout_handler(void *t) SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_2); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_2); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -1801,18 +1913,19 @@ sctp_timeout_handler(void *t) * killer */ SCTP_INP_DECR_REF(inp); - sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_3); + sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, + SCTP_FROM_SCTPUTIL + SCTP_LOC_3); sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_FROM_INPKILL_TIMER); inp = NULL; goto out_no_decr; default: SCTPDBG(SCTP_DEBUG_TIMER1, "sctp_timeout_handler:unknown timer %d\n", - tmr->type); + type); break; } #ifdef SCTP_AUDITING_ENABLED - sctp_audit_log(0xF1, (uint8_t) tmr->type); + sctp_audit_log(0xF1, (uint8_t) type); if (inp) sctp_auditing(5, inp, stcb, net); #endif @@ -1835,8 +1948,7 @@ out_decr: SCTP_INP_DECR_REF(inp); } out_no_decr: - SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type %d)\n", - type); + SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type = %d)\n", type); CURVNET_RESTORE(); } @@ -1929,7 +2041,7 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, * though we use a different timer. We also add the HB timer * PLUS a random jitter. */ - if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { + if ((stcb == NULL) || (net == NULL)) { return; } else { uint32_t rndval; @@ -1984,9 +2096,6 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, * nothing needed but the endpoint here ususually about 60 * minutes. */ - if (inp == NULL) { - return; - } tmr = &inp->sctp_ep.signature_change; to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE]; break; @@ -2003,9 +2112,6 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, * timer since that has stopped and we are in the GONE * state. */ - if (inp == NULL) { - return; - } tmr = &inp->sctp_ep.signature_change; to_ticks = MSEC_TO_TICKS(SCTP_INP_KILL_TIMEOUT); break; @@ -2014,10 +2120,7 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, * Here we use the value found in the EP for PMTU ususually * about 10 minutes. */ - if ((stcb == NULL) || (inp == NULL)) { - return; - } - if (net == NULL) { + if ((stcb == NULL) || (net == NULL)) { return; } if (net->dest_state & SCTP_ADDR_NO_PMTUD) { @@ -2043,10 +2146,14 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, * Here we use the endpoints shutdown guard timer usually * about 3 minutes. 
*/ - if ((inp == NULL) || (stcb == NULL)) { + if (stcb == NULL) { return; } - to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN]; + if (inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] == 0) { + to_ticks = 5 * MSEC_TO_TICKS(stcb->asoc.maxrto); + } else { + to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN]; + } tmr = &stcb->asoc.shut_guard_timer; break; case SCTP_TIMER_TYPE_STRRESET: @@ -2102,13 +2209,13 @@ sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, break; default: SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n", - __FUNCTION__, t_type); + __func__, t_type); return; break; } if ((to_ticks <= 0) || (tmr == NULL)) { SCTPDBG(SCTP_DEBUG_TIMER1, "%s: %d:software error to_ticks:%d tmr:%p not set ??\n", - __FUNCTION__, t_type, to_ticks, (void *)tmr); + __func__, t_type, to_ticks, (void *)tmr); return; } if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { @@ -2264,7 +2371,7 @@ sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, break; default: SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n", - __FUNCTION__, t_type); + __func__, t_type); break; } if (tmr == NULL) { @@ -2383,8 +2490,8 @@ sctp_calculate_rto(struct sctp_tcb *stcb, net->rtt = (uint64_t) 1000000 *(uint64_t) now.tv_sec + (uint64_t) now.tv_usec; - /* computer rtt in ms */ - rtt = net->rtt / 1000; + /* compute rtt in ms */ + rtt = (int32_t) (net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { /* * Tell the CC module that a new update has just occurred @@ -2518,58 +2625,44 @@ sctp_get_next_param(struct mbuf *m, } -int +struct mbuf * sctp_add_pad_tombuf(struct mbuf *m, int padlen) { - /* - * add padlen bytes of 0 filled padding to the end of the mbuf. If - * padlen is > 3 this routine will fail. - */ - uint8_t *dp; - int i; + struct mbuf *m_last; + caddr_t dp; if (padlen > 3) { - SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); - return (ENOBUFS); + return (NULL); } if (padlen <= M_TRAILINGSPACE(m)) { /* * The easy way. We hope the majority of the time we hit * here :) */ - dp = (uint8_t *) (mtod(m, caddr_t)+SCTP_BUF_LEN(m)); - SCTP_BUF_LEN(m) += padlen; + m_last = m; } else { - /* Hard way we must grow the mbuf */ - struct mbuf *tmp; - - tmp = sctp_get_mbuf_for_msg(padlen, 0, M_DONTWAIT, 1, MT_DATA); - if (tmp == NULL) { - /* Out of space GAK! we are in big trouble. 
*/ - SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); - return (ENOBUFS); - } - /* setup and insert in middle */ - SCTP_BUF_LEN(tmp) = padlen; - SCTP_BUF_NEXT(tmp) = NULL; - SCTP_BUF_NEXT(m) = tmp; - dp = mtod(tmp, uint8_t *); - } - /* zero out the pad */ - for (i = 0; i < padlen; i++) { - *dp = 0; - dp++; + /* Hard way we must grow the mbuf chain */ + m_last = sctp_get_mbuf_for_msg(padlen, 0, M_NOWAIT, 1, MT_DATA); + if (m_last == NULL) { + return (NULL); + } + SCTP_BUF_LEN(m_last) = 0; + SCTP_BUF_NEXT(m_last) = NULL; + SCTP_BUF_NEXT(m) = m_last; } - return (0); + dp = mtod(m_last, caddr_t)+SCTP_BUF_LEN(m_last); + SCTP_BUF_LEN(m_last) += padlen; + memset(dp, 0, padlen); + return (m_last); } -int +struct mbuf * sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf) { /* find the last mbuf in chain and pad it */ struct mbuf *m_at; - if (last_mbuf) { + if (last_mbuf != NULL) { return (sctp_add_pad_tombuf(last_mbuf, padval)); } else { for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) { @@ -2578,8 +2671,7 @@ sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf) } } } - SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT); - return (EFAULT); + return (NULL); } static void @@ -2593,7 +2685,8 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, struct mbuf *m_notify; struct sctp_assoc_change *sac; struct sctp_queued_to_read *control; - size_t notif_len, abort_len; + unsigned int notif_len; + uint16_t abort_len; unsigned int i; #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) @@ -2601,8 +2694,11 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, #endif + if (stcb == NULL) { + return; + } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { - notif_len = sizeof(struct sctp_assoc_change); + notif_len = (unsigned int)sizeof(struct sctp_assoc_change); if (abort != NULL) { abort_len = ntohs(abort->ch.chunk_length); } else { @@ -2613,11 +2709,11 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, } else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) { notif_len += abort_len; } - m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* Retry with smaller value. 
*/ - notif_len = sizeof(struct sctp_assoc_change); - m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA); + notif_len = (unsigned int)sizeof(struct sctp_assoc_change); + m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { goto set_error; } @@ -2637,17 +2733,20 @@ sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, if (notif_len > sizeof(struct sctp_assoc_change)) { if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) { i = 0; - if (stcb->asoc.peer_supports_prsctp) { + if (stcb->asoc.prsctp_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_PR; } - if (stcb->asoc.peer_supports_auth) { + if (stcb->asoc.auth_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_AUTH; } - if (stcb->asoc.peer_supports_asconf) { + if (stcb->asoc.asconf_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_ASCONF; } + if (stcb->asoc.idata_supported == 1) { + sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_INTERLEAVING; + } sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_MULTIBUF; - if (stcb->asoc.peer_supports_strreset) { + if (stcb->asoc.reconfig_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_RE_CONFIG; } sac->sac_length += i; @@ -2732,7 +2831,11 @@ set_error: static void sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, - struct sockaddr *sa, uint32_t error) + struct sockaddr *sa, uint32_t error, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) { struct mbuf *m_notify; struct sctp_paddr_change *spc; @@ -2743,18 +2846,28 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) return; SCTP_BUF_LEN(m_notify) = 0; spc = mtod(m_notify, struct sctp_paddr_change *); + memset(spc, 0, sizeof(struct sctp_paddr_change)); spc->spc_type = SCTP_PEER_ADDR_CHANGE; spc->spc_flags = 0; spc->spc_length = sizeof(struct sctp_paddr_change); switch (sa->sa_family) { #ifdef INET case AF_INET: +#ifdef INET6 + if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { + in6_sin_2_v4mapsin6((struct sockaddr_in *)sa, + (struct sockaddr_in6 *)&spc->spc_aaddr); + } else { + memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in)); + } +#else memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in)); +#endif break; #endif #ifdef INET6 @@ -2805,7 +2918,7 @@ sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, - SCTP_SO_NOT_LOCKED); + so_locked); } @@ -2821,7 +2934,8 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error, struct sctp_send_failed *ssf; struct sctp_send_failed_event *ssfe; struct sctp_queued_to_read *control; - int length; + struct sctp_chunkhdr *chkhdr; + int notifhdr_len, chk_len, chkhdr_len, padding_len, payload_len; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) && @@ -2830,27 +2944,49 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error, return; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { - length = sizeof(struct sctp_send_failed_event); + notifhdr_len = sizeof(struct sctp_send_failed_event); } else { - length = sizeof(struct sctp_send_failed); + notifhdr_len = sizeof(struct 
sctp_send_failed); } - m_notify = sctp_get_mbuf_for_msg(length, 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; - SCTP_BUF_LEN(m_notify) = 0; + SCTP_BUF_LEN(m_notify) = notifhdr_len; + if (stcb->asoc.idata_supported) { + chkhdr_len = sizeof(struct sctp_idata_chunk); + } else { + chkhdr_len = sizeof(struct sctp_data_chunk); + } + /* Use some defaults in case we can't access the chunk header */ + if (chk->send_size >= chkhdr_len) { + payload_len = chk->send_size - chkhdr_len; + } else { + payload_len = 0; + } + padding_len = 0; + if (chk->data != NULL) { + chkhdr = mtod(chk->data, struct sctp_chunkhdr *); + if (chkhdr != NULL) { + chk_len = ntohs(chkhdr->chunk_length); + if ((chk_len >= chkhdr_len) && + (chk->send_size >= chk_len) && + (chk->send_size - chk_len < 4)) { + padding_len = chk->send_size - chk_len; + payload_len = chk->send_size - chkhdr_len - padding_len; + } + } + } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { ssfe = mtod(m_notify, struct sctp_send_failed_event *); - memset(ssfe, 0, length); + memset(ssfe, 0, notifhdr_len); ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT; if (sent) { ssfe->ssfe_flags = SCTP_DATA_SENT; } else { ssfe->ssfe_flags = SCTP_DATA_UNSENT; } - length += chk->send_size; - length -= sizeof(struct sctp_data_chunk); - ssfe->ssfe_length = length; + ssfe->ssfe_length = (uint32_t) (notifhdr_len + payload_len); ssfe->ssfe_error = error; /* not exactly what the user sent in, but should be close :) */ ssfe->ssfe_info.snd_sid = chk->rec.data.stream_number; @@ -2859,39 +2995,33 @@ sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error, ssfe->ssfe_info.snd_context = chk->rec.data.context; ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb); ssfe->ssfe_assoc_id = sctp_get_associd(stcb); - SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed_event); } else { ssf = mtod(m_notify, struct sctp_send_failed *); - memset(ssf, 0, length); + memset(ssf, 0, notifhdr_len); ssf->ssf_type = SCTP_SEND_FAILED; if (sent) { ssf->ssf_flags = SCTP_DATA_SENT; } else { ssf->ssf_flags = SCTP_DATA_UNSENT; } - length += chk->send_size; - length -= sizeof(struct sctp_data_chunk); - ssf->ssf_length = length; + ssf->ssf_length = (uint32_t) (notifhdr_len + payload_len); ssf->ssf_error = error; /* not exactly what the user sent in, but should be close :) */ - bzero(&ssf->ssf_info, sizeof(ssf->ssf_info)); ssf->ssf_info.sinfo_stream = chk->rec.data.stream_number; - ssf->ssf_info.sinfo_ssn = chk->rec.data.stream_seq; + ssf->ssf_info.sinfo_ssn = (uint16_t) chk->rec.data.stream_seq; ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags; ssf->ssf_info.sinfo_ppid = chk->rec.data.payloadtype; ssf->ssf_info.sinfo_context = chk->rec.data.context; ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); ssf->ssf_assoc_id = sctp_get_associd(stcb); - SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed); } - if (chk->data) { - /* - * trim off the sctp chunk header(it should be there) - */ - if (chk->send_size >= sizeof(struct sctp_data_chunk)) { - m_adj(chk->data, sizeof(struct sctp_data_chunk)); + if (chk->data != NULL) { + /* Trim off the sctp chunk header (it should be there) */ + if (chk->send_size == chkhdr_len + payload_len + padding_len) { + m_adj(chk->data, chkhdr_len); + m_adj(chk->data, -padding_len); sctp_mbuf_crush(chk->data); - chk->send_size -= sizeof(struct sctp_data_chunk); + chk->send_size -= (chkhdr_len + padding_len); } } 
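/*
 * A minimal standalone sketch of the payload/padding computation the hunk
 * above adds to sctp_notify_send_failed(). The helper name split_sizes()
 * and the example sizes are illustrative only; the kernel code reads the
 * real header size (sizeof(struct sctp_data_chunk) or, with I-DATA
 * support, sizeof(struct sctp_idata_chunk)) and the wire chunk_length
 * from the mbuf chain.
 */
#include <stdio.h>

/*
 * chunk_len: wire length from the chunk header (excludes padding).
 * send_size: buffered size, including header and up-to-3 pad bytes,
 *            since SCTP chunks are padded to a 4-byte boundary.
 */
static void
split_sizes(int send_size, int chunk_len, int hdr_len)
{
	int payload_len, padding_len;

	/* Defaults, used when the chunk header is not accessible. */
	payload_len = (send_size >= hdr_len) ? send_size - hdr_len : 0;
	padding_len = 0;
	/* Trust the header only if the sizes are mutually consistent. */
	if ((chunk_len >= hdr_len) &&
	    (send_size >= chunk_len) &&
	    (send_size - chunk_len < 4)) {
		padding_len = send_size - chunk_len;
		payload_len = send_size - hdr_len - padding_len;
	}
	printf("payload %d, padding %d\n", payload_len, padding_len);
}

int
main(void)
{
	/* 16-byte DATA chunk header, 21 bytes of user data, 3 pad bytes:
	 * prints "payload 21, padding 3". */
	split_sizes(16 + 21 + 3, 16 + 21, 16);
	return (0);
}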
SCTP_BUF_NEXT(m_notify) = chk->data; @@ -2936,7 +3066,7 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, struct sctp_send_failed *ssf; struct sctp_send_failed_event *ssfe; struct sctp_queued_to_read *control; - int length; + int notifhdr_len; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) && @@ -2945,23 +3075,22 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, return; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { - length = sizeof(struct sctp_send_failed_event); + notifhdr_len = sizeof(struct sctp_send_failed_event); } else { - length = sizeof(struct sctp_send_failed); + notifhdr_len = sizeof(struct sctp_send_failed); } - m_notify = sctp_get_mbuf_for_msg(length, 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* no space left */ return; } - SCTP_BUF_LEN(m_notify) = 0; + SCTP_BUF_LEN(m_notify) = notifhdr_len; if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { ssfe = mtod(m_notify, struct sctp_send_failed_event *); - memset(ssfe, 0, length); + memset(ssfe, 0, notifhdr_len); ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT; ssfe->ssfe_flags = SCTP_DATA_UNSENT; - length += sp->length; - ssfe->ssfe_length = length; + ssfe->ssfe_length = (uint32_t) (notifhdr_len + sp->length); ssfe->ssfe_error = error; /* not exactly what the user sent in, but should be close :) */ ssfe->ssfe_info.snd_sid = sp->stream; @@ -2974,14 +3103,12 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, ssfe->ssfe_info.snd_context = sp->context; ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb); ssfe->ssfe_assoc_id = sctp_get_associd(stcb); - SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed_event); } else { ssf = mtod(m_notify, struct sctp_send_failed *); - memset(ssf, 0, length); + memset(ssf, 0, notifhdr_len); ssf->ssf_type = SCTP_SEND_FAILED; ssf->ssf_flags = SCTP_DATA_UNSENT; - length += sp->length; - ssf->ssf_length = length; + ssf->ssf_length = (uint32_t) (notifhdr_len + sp->length); ssf->ssf_error = error; /* not exactly what the user sent in, but should be close :) */ ssf->ssf_info.sinfo_stream = sp->stream; @@ -2995,7 +3122,6 @@ sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, ssf->ssf_info.sinfo_context = sp->context; ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); ssf->ssf_assoc_id = sctp_get_associd(stcb); - SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed); } SCTP_BUF_NEXT(m_notify) = sp->data; @@ -3039,7 +3165,7 @@ sctp_notify_adaptation_layer(struct sctp_tcb *stcb) /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3095,7 +3221,7 @@ sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3206,7 +3332,7 @@ sctp_notify_shutdown_event(struct sctp_tcb *stcb) /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct 
sctp_shutdown_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3255,7 +3381,7 @@ sctp_notify_sender_dry_event(struct sctp_tcb *stcb, /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* no space left */ return; @@ -3307,7 +3433,7 @@ sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin, uint16_t return; } stcb->asoc.peer_req_out = 0; - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3357,7 +3483,7 @@ sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32 /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3411,7 +3537,7 @@ sctp_notify_stream_reset(struct sctp_tcb *stcb, /* event not enabled */ return; } - m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; @@ -3467,7 +3593,8 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro struct mbuf *m_notify; struct sctp_remote_error *sre; struct sctp_queued_to_read *control; - size_t notif_len, chunk_len; + unsigned int notif_len; + uint16_t chunk_len; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPEERERR)) { @@ -3478,18 +3605,19 @@ sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_erro } else { chunk_len = 0; } - notif_len = sizeof(struct sctp_remote_error) + chunk_len; - m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA); + notif_len = (unsigned int)(sizeof(struct sctp_remote_error) + chunk_len); + m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* Retry with smaller value. 
*/ - notif_len = sizeof(struct sctp_remote_error); - m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_DONTWAIT, 1, MT_DATA); + notif_len = (unsigned int)sizeof(struct sctp_remote_error); + m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { return; } } SCTP_BUF_NEXT(m_notify) = NULL; sre = mtod(m_notify, struct sctp_remote_error *); + memset(sre, 0, notif_len); sre->sre_type = SCTP_REMOTE_ERROR; sre->sre_flags = 0; sre->sre_length = sizeof(struct sctp_remote_error); @@ -3554,7 +3682,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) { sctp_notify_adaptation_layer(stcb); } - if (stcb->asoc.peer_supports_auth == 0) { + if (stcb->asoc.auth_supported == 0) { sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, NULL, so_locked); } @@ -3568,7 +3696,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_UNREACHABLE, - (struct sockaddr *)&net->ro._l_addr, error); + (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_INTERFACE_UP: @@ -3577,7 +3705,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_AVAILABLE, - (struct sockaddr *)&net->ro._l_addr, error); + (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_INTERFACE_CONFIRMED: @@ -3586,7 +3714,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_CONFIRMED, - (struct sockaddr *)&net->ro._l_addr, error); + (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_SPECIAL_SP_FAIL: @@ -3628,7 +3756,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, break; case SCTP_NOTIFY_ASSOC_RESTART: sctp_notify_assoc_change(SCTP_RESTART, stcb, error, NULL, 0, so_locked); - if (stcb->asoc.peer_supports_auth == 0) { + if (stcb->asoc.auth_supported == 0) { sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, NULL, so_locked); } @@ -3657,15 +3785,15 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, break; case SCTP_NOTIFY_ASCONF_ADD_IP: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data, - error); + error, so_locked); break; case SCTP_NOTIFY_ASCONF_DELETE_IP: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data, - error); + error, so_locked); break; case SCTP_NOTIFY_ASCONF_SET_PRIMARY: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data, - error); + error, so_locked); break; case SCTP_NOTIFY_PEER_SHUTDOWN: sctp_notify_shutdown_event(stcb); @@ -3693,7 +3821,7 @@ sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, break; default: SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n", - __FUNCTION__, notification, notification); + __func__, notification, notification); break; } /* end switch */ } @@ -3780,10 +3908,10 @@ sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int holds_lock, /* For each stream */ outs = &asoc->strmout[i]; /* clean up any sends there */ - asoc->locked_on_sending = NULL; TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { - asoc->stream_queue_cnt--; + atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); + stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, holds_lock); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { 
sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, @@ -3845,7 +3973,7 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct mbuf *op_err, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { uint32_t vtag; @@ -3865,7 +3993,7 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; } sctp_send_abort(m, iphlen, src, dst, sh, vtag, op_err, - use_mflowid, mflowid, + mflowtype, mflowid, inp->fibnum, vrf_id, port); if (stcb != NULL) { /* Ok, now lets free it */ @@ -3882,7 +4010,8 @@ sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_4); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_4); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) SCTP_SOCKET_UNLOCK(so, 1); #endif @@ -4006,7 +4135,8 @@ sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, atomic_subtract_int(&stcb->asoc.refcnt, 1); } #endif - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_5); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_5); #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) if (!so_locked) { SCTP_SOCKET_UNLOCK(so, 1); @@ -4019,7 +4149,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb *inp, struct mbuf *cause, - uint8_t use_mflowid, uint32_t mflowid, + uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_chunkhdr *ch, chunk_buf; @@ -4061,7 +4191,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, return; case SCTP_SHUTDOWN_ACK: sctp_send_shutdown_complete2(src, dst, sh, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); return; default: @@ -4075,7 +4205,7 @@ sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) && (contains_init_chunk == 0))) { sctp_send_abort(m, iphlen, src, dst, sh, 0, cause, - use_mflowid, mflowid, + mflowtype, mflowid, fibnum, vrf_id, port); } } @@ -4341,6 +4471,49 @@ sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, SCTP_INP_READ_UNLOCK(new_inp); } +void +sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + if ((inp != NULL) && (inp->sctp_socket != NULL)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { + SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); + } else { +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(inp); + if (!so_locked) { + if (stcb) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_SOCKET_LOCK(so, 1); + if (stcb) { + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } + } +#endif + sctp_sorwakeup(inp, inp->sctp_socket); +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } + } +} + void 
sctp_add_to_readq(struct sctp_inpcb *inp, struct sctp_tcb *stcb, @@ -4376,7 +4549,7 @@ sctp_add_to_readq(struct sctp_inpcb *inp, sctp_m_freem(control->data); control->data = NULL; } - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control); + sctp_free_a_readq(stcb, control); if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; @@ -4422,7 +4595,7 @@ sctp_add_to_readq(struct sctp_inpcb *inp, } else { /* Everything got collapsed out?? */ sctp_free_remote_addr(control->whoFrom); - SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control); + sctp_free_a_readq(stcb, control); if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; @@ -4431,195 +4604,14 @@ sctp_add_to_readq(struct sctp_inpcb *inp, control->end_added = 1; } TAILQ_INSERT_TAIL(&inp->read_queue, control, next); + control->on_read_q = 1; if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); if (inp && inp->sctp_socket) { - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { - SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); - } else { -#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) - struct socket *so; - - so = SCTP_INP_SO(inp); - if (!so_locked) { - if (stcb) { - atomic_add_int(&stcb->asoc.refcnt, 1); - SCTP_TCB_UNLOCK(stcb); - } - SCTP_SOCKET_LOCK(so, 1); - if (stcb) { - SCTP_TCB_LOCK(stcb); - atomic_subtract_int(&stcb->asoc.refcnt, 1); - } - if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { - SCTP_SOCKET_UNLOCK(so, 1); - return; - } - } -#endif - sctp_sorwakeup(inp, inp->sctp_socket); -#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) - if (!so_locked) { - SCTP_SOCKET_UNLOCK(so, 1); - } -#endif - } + sctp_wakeup_the_read_socket(inp, stcb, so_locked); } } - -int -sctp_append_to_readq(struct sctp_inpcb *inp, - struct sctp_tcb *stcb, - struct sctp_queued_to_read *control, - struct mbuf *m, - int end, - int ctls_cumack, - struct sockbuf *sb) -{ - /* - * A partial delivery API event is underway. OR we are appending on - * the reassembly queue. - * - * If PDAPI this means we need to add m to the end of the data. - * Increase the length in the control AND increment the sb_cc. - * Otherwise sb is NULL and all we need to do is put it at the end - * of the mbuf chain. - */ - int len = 0; - struct mbuf *mm, *tail = NULL, *prev = NULL; - - if (inp) { - SCTP_INP_READ_LOCK(inp); - } - if (control == NULL) { -get_out: - if (inp) { - SCTP_INP_READ_UNLOCK(inp); - } - return (-1); - } - if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ)) { - SCTP_INP_READ_UNLOCK(inp); - return (0); - } - if (control->end_added) { - /* huh this one is complete? */ - goto get_out; - } - mm = m; - if (mm == NULL) { - goto get_out; - } - while (mm) { - if (SCTP_BUF_LEN(mm) == 0) { - /* Skip mbufs with NO lenght */ - if (prev == NULL) { - /* First one */ - m = sctp_m_free(mm); - mm = m; - } else { - SCTP_BUF_NEXT(prev) = sctp_m_free(mm); - mm = SCTP_BUF_NEXT(prev); - } - continue; - } - prev = mm; - len += SCTP_BUF_LEN(mm); - if (sb) { - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { - sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(mm)); - } - sctp_sballoc(stcb, sb, mm); - if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { - sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); - } - } - mm = SCTP_BUF_NEXT(mm); - } - if (prev) { - tail = prev; - } else { - /* Really there should always be a prev */ - if (m == NULL) { - /* Huh nothing left? 
*/ -#ifdef INVARIANTS - panic("Nothing left to add?"); -#else - goto get_out; -#endif - } - tail = m; - } - if (control->tail_mbuf) { - /* append */ - SCTP_BUF_NEXT(control->tail_mbuf) = m; - control->tail_mbuf = tail; - } else { - /* nothing there */ -#ifdef INVARIANTS - if (control->data != NULL) { - panic("This should NOT happen"); - } -#endif - control->data = m; - control->tail_mbuf = tail; - } - atomic_add_int(&control->length, len); - if (end) { - /* message is complete */ - if (stcb && (control == stcb->asoc.control_pdapi)) { - stcb->asoc.control_pdapi = NULL; - } - control->held_length = 0; - control->end_added = 1; - } - if (stcb == NULL) { - control->do_not_ref_stcb = 1; - } - /* - * When we are appending in partial delivery, the cum-ack is used - * for the actual pd-api highest tsn on this mbuf. The true cum-ack - * is populated in the outbound sinfo structure from the true cumack - * if the association exists... - */ - control->sinfo_tsn = control->sinfo_cumtsn = ctls_cumack; - if (inp) { - SCTP_INP_READ_UNLOCK(inp); - } - if (inp && inp->sctp_socket) { - if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { - SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); - } else { -#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) - struct socket *so; - - so = SCTP_INP_SO(inp); - if (stcb) { - atomic_add_int(&stcb->asoc.refcnt, 1); - SCTP_TCB_UNLOCK(stcb); - } - SCTP_SOCKET_LOCK(so, 1); - if (stcb) { - SCTP_TCB_LOCK(stcb); - atomic_subtract_int(&stcb->asoc.refcnt, 1); - } - if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { - SCTP_SOCKET_UNLOCK(so, 1); - return (0); - } -#endif - sctp_sorwakeup(inp, inp->sctp_socket); -#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) - SCTP_SOCKET_UNLOCK(so, 1); -#endif - } - } - return (0); -} - - - /*************HOLD THIS COMMENT FOR PATCH FILE OF *************ALTERNATE ROUTING CODE */ @@ -4633,19 +4625,23 @@ sctp_generate_cause(uint16_t code, char *info) { struct mbuf *m; struct sctp_gen_error_cause *cause; - size_t info_len, len; + size_t info_len; + uint16_t len; if ((code == 0) || (info == NULL)) { return (NULL); } info_len = strlen(info); - len = sizeof(struct sctp_paramhdr) + info_len; + if (info_len > (SCTP_MAX_CAUSE_LENGTH - sizeof(struct sctp_paramhdr))) { + return (NULL); + } + len = (uint16_t) (sizeof(struct sctp_paramhdr) + info_len); m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (m != NULL) { SCTP_BUF_LEN(m) = len; cause = mtod(m, struct sctp_gen_error_cause *); cause->code = htons(code); - cause->length = htons((uint16_t) len); + cause->length = htons(len); memcpy(cause->info, info, info_len); } return (m); @@ -4656,15 +4652,15 @@ sctp_generate_no_user_data_cause(uint32_t tsn) { struct mbuf *m; struct sctp_error_no_user_data *no_user_data_cause; - size_t len; + uint16_t len; - len = sizeof(struct sctp_error_no_user_data); + len = (uint16_t) sizeof(struct sctp_error_no_user_data); m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (m != NULL) { SCTP_BUF_LEN(m) = len; no_user_data_cause = mtod(m, struct sctp_error_no_user_data *); no_user_data_cause->cause.code = htons(SCTP_CAUSE_NO_USER_DATA); - no_user_data_cause->cause.length = htons((uint16_t) len); + no_user_data_cause->cause.length = htons(len); no_user_data_cause->tsn = tsn; /* tsn is passed in as NBO */ } return (m); @@ -4724,6 +4720,21 @@ sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1, stream = tp1->rec.data.stream_number; seq = tp1->rec.data.stream_seq; + if (sent || !(tp1->rec.data.rcv_flags & 
SCTP_DATA_FIRST_FRAG)) { + stcb->asoc.abandoned_sent[0]++; + stcb->asoc.abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++; + stcb->asoc.strmout[stream].abandoned_sent[0]++; +#if defined(SCTP_DETAILED_STR_STATS) + stcb->asoc.strmout[stream].abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++; +#endif + } else { + stcb->asoc.abandoned_unsent[0]++; + stcb->asoc.abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++; + stcb->asoc.strmout[stream].abandoned_unsent[0]++; +#if defined(SCTP_DETAILED_STR_STATS) + stcb->asoc.strmout[stream].abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++; +#endif + } do { ret_sz += tp1->book_size; if (tp1->data != NULL) { @@ -4840,28 +4851,48 @@ sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1, goto oh_well; } memset(chk, 0, sizeof(*chk)); - chk->rec.data.rcv_flags = SCTP_DATA_LAST_FRAG; + chk->rec.data.rcv_flags = 0; chk->sent = SCTP_FORWARD_TSN_SKIP; chk->asoc = &stcb->asoc; - chk->rec.data.stream_seq = strq->next_sequence_send; + if (stcb->asoc.idata_supported == 0) { + if (sp->sinfo_flags & SCTP_UNORDERED) { + chk->rec.data.stream_seq = 0; + } else { + chk->rec.data.stream_seq = strq->next_mid_ordered; + } + } else { + if (sp->sinfo_flags & SCTP_UNORDERED) { + chk->rec.data.stream_seq = strq->next_mid_unordered; + } else { + chk->rec.data.stream_seq = strq->next_mid_ordered; + } + } chk->rec.data.stream_number = sp->stream; chk->rec.data.payloadtype = sp->ppid; chk->rec.data.context = sp->context; chk->flags = sp->act_flags; - if (sp->net) - chk->whoTo = sp->net; - else - chk->whoTo = stcb->asoc.primary_destination; - atomic_add_int(&chk->whoTo->ref_count, 1); + chk->whoTo = NULL; chk->rec.data.TSN_seq = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1); - stcb->asoc.pr_sctp_cnt++; + strq->chunks_on_queues++; TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next); stcb->asoc.sent_queue_cnt++; stcb->asoc.pr_sctp_cnt++; + } + chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG; + if (sp->sinfo_flags & SCTP_UNORDERED) { + chk->rec.data.rcv_flags |= SCTP_DATA_UNORDERED; + } + if (stcb->asoc.idata_supported == 0) { + if ((sp->sinfo_flags & SCTP_UNORDERED) == 0) { + strq->next_mid_ordered++; + } } else { - chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG; + if (sp->sinfo_flags & SCTP_UNORDERED) { + strq->next_mid_unordered++; + } else { + strq->next_mid_ordered++; + } } - strq->next_sequence_send++; oh_well: if (sp->data) { /* @@ -5009,7 +5040,6 @@ sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock) vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { -stage_right: if (holds_lock == 0) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); @@ -5029,15 +5059,6 @@ stage_right: return (NULL); } LIST_FOREACH(sctp_ifap, hash_head, next_bucket) { - if (sctp_ifap == NULL) { -#ifdef INVARIANTS - panic("Huh LIST_FOREACH corrupt"); - goto stage_right; -#else - SCTP_PRINTF("LIST corrupt of sctp_ifap's?\n"); - goto stage_right; -#endif - } if (addr->sa_family != sctp_ifap->address.sa.sa_family) continue; #ifdef INET @@ -5136,7 +5157,8 @@ sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t * freed_so_far, int hold_rlock, sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED); /* make sure no timer is running */ - sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_6); + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, + SCTP_FROM_SCTPUTIL + SCTP_LOC_6); SCTP_TCB_UNLOCK(stcb); } else { /* Update how much we have pending */ @@ -5187,7 +5209,7 @@ sctp_sorecvmsg(struct socket *so, uint32_t rwnd_req 
= 0; int hold_sblock = 0; int hold_rlock = 0; - int slen = 0; + ssize_t slen = 0; uint32_t held_length = 0; int sockbuf_lock = 0; @@ -5232,11 +5254,11 @@ sctp_sorecvmsg(struct socket *so, in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SORECV_ENTER, - rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, uio->uio_resid); + rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, (uint32_t) uio->uio_resid); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SORECV_ENTERPL, - rwnd_req, block_allowed, so->so_rcv.sb_cc, uio->uio_resid); + rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t) uio->uio_resid); } error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0)); if (error) { @@ -5269,8 +5291,14 @@ restart_nosblocks: } } } - if ((so->so_rcv.sb_cc <= held_length) && block_allowed) { - /* we need to wait for data */ + if (so->so_rcv.sb_cc <= held_length) { + if (so->so_error) { + error = so->so_error; + if ((in_flags & MSG_PEEK) == 0) { + so->so_error = 0; + } + goto out; + } if ((so->so_rcv.sb_cc == 0) && ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { @@ -5301,51 +5329,18 @@ restart_nosblocks: goto out; } } - error = sbwait(&so->so_rcv); - if (error) { - goto out; - } - held_length = 0; - goto restart_nosblocks; - } else if (so->so_rcv.sb_cc == 0) { - if (so->so_error) { - error = so->so_error; - if ((in_flags & MSG_PEEK) == 0) - so->so_error = 0; - } else { - if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || - (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { - if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) { - /* - * For active open side clear flags - * for re-use passive open is - * blocked by connect. - */ - if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) { - /* - * You were aborted, passive - * side always hits here - */ - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); - error = ECONNRESET; - } - so->so_state &= ~(SS_ISCONNECTING | - SS_ISDISCONNECTING | - SS_ISCONFIRMING | - SS_ISCONNECTED); - if (error == 0) { - if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) { - SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN); - error = ENOTCONN; - } - } - goto out; - } + if (block_allowed) { + error = sbwait(&so->so_rcv); + if (error) { + goto out; } + held_length = 0; + goto restart_nosblocks; + } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK); error = EWOULDBLOCK; + goto out; } - goto out; } if (hold_sblock == 1) { SOCKBUF_UNLOCK(&so->so_rcv); @@ -5438,6 +5433,12 @@ restart_nosblocks: sctp_m_free(control->aux_data); control->aux_data = NULL; } +#ifdef INVARIANTS + if (control->on_strm_q) { + panic("About to free ctl:%p so:%p and its in %d", + control, so, control->on_strm_q); + } +#endif sctp_free_remote_addr(control->whoFrom); sctp_free_a_readq(stcb, control); if (hold_rlock) { @@ -5498,20 +5499,16 @@ restart_nosblocks: } /* Clear the held length since there is something to read */ control->held_length = 0; - if (hold_rlock) { - SCTP_INP_READ_UNLOCK(inp); - hold_rlock = 0; - } found_one: /* * If we reach here, control has a some data for us to read off. * Note that stcb COULD be NULL. 
*/ - control->some_taken++; - if (hold_sblock) { - SOCKBUF_UNLOCK(&so->so_rcv); - hold_sblock = 0; + if (hold_rlock == 0) { + hold_rlock = 1; + SCTP_INP_READ_LOCK(inp); } + control->some_taken++; stcb = control->stcb; if (stcb) { if ((control->do_not_ref_stcb == 0) && @@ -5556,8 +5553,16 @@ found_one: stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1; } /* First lets get off the sinfo and sockaddr info */ - if ((sinfo) && filling_sinfo) { - memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo)); + if ((sinfo != NULL) && (filling_sinfo != 0)) { + sinfo->sinfo_stream = control->sinfo_stream; + sinfo->sinfo_ssn = (uint16_t) control->sinfo_ssn; + sinfo->sinfo_flags = control->sinfo_flags; + sinfo->sinfo_ppid = control->sinfo_ppid; + sinfo->sinfo_context = control->sinfo_context; + sinfo->sinfo_timetolive = control->sinfo_timetolive; + sinfo->sinfo_tsn = control->sinfo_tsn; + sinfo->sinfo_cumtsn = control->sinfo_cumtsn; + sinfo->sinfo_assoc_id = control->sinfo_assoc_id; nxt = TAILQ_NEXT(control, next); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { @@ -5566,20 +5571,20 @@ found_one: s_extra = (struct sctp_extrcvinfo *)sinfo; if ((nxt) && (nxt->length)) { - s_extra->sreinfo_next_flags = SCTP_NEXT_MSG_AVAIL; + s_extra->serinfo_next_flags = SCTP_NEXT_MSG_AVAIL; if (nxt->sinfo_flags & SCTP_UNORDERED) { - s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED; + s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED; } if (nxt->spec_flags & M_NOTIFICATION) { - s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION; + s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION; } - s_extra->sreinfo_next_aid = nxt->sinfo_assoc_id; - s_extra->sreinfo_next_length = nxt->length; - s_extra->sreinfo_next_ppid = nxt->sinfo_ppid; - s_extra->sreinfo_next_stream = nxt->sinfo_stream; + s_extra->serinfo_next_aid = nxt->sinfo_assoc_id; + s_extra->serinfo_next_length = nxt->length; + s_extra->serinfo_next_ppid = nxt->sinfo_ppid; + s_extra->serinfo_next_stream = nxt->sinfo_stream; if (nxt->tail_mbuf != NULL) { if (nxt->end_added) { - s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE; + s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE; } } } else { @@ -5590,11 +5595,11 @@ found_one: * :-D */ nxt = NULL; - s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG; - s_extra->sreinfo_next_aid = 0; - s_extra->sreinfo_next_length = 0; - s_extra->sreinfo_next_ppid = 0; - s_extra->sreinfo_next_stream = 0; + s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG; + s_extra->serinfo_next_aid = 0; + s_extra->serinfo_next_length = 0; + s_extra->serinfo_next_ppid = 0; + s_extra->serinfo_next_stream = 0; } } /* @@ -5631,43 +5636,43 @@ found_one: entry->flgs = control->sinfo_flags; } #endif - if (fromlen && from) { - cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sa.sa_len); + if ((fromlen > 0) && (from != NULL)) { + union sctp_sockstore store; + size_t len; + switch (control->whoFrom->ro._l_addr.sa.sa_family) { #ifdef INET6 case AF_INET6: - ((struct sockaddr_in6 *)from)->sin6_port = control->port_from; + len = sizeof(struct sockaddr_in6); + store.sin6 = control->whoFrom->ro._l_addr.sin6; + store.sin6.sin6_port = control->port_from; break; #endif #ifdef INET case AF_INET: - ((struct sockaddr_in *)from)->sin_port = control->port_from; +#ifdef INET6 + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { + len = sizeof(struct sockaddr_in6); + in6_sin_2_v4mapsin6(&control->whoFrom->ro._l_addr.sin, + 
&store.sin6); + store.sin6.sin6_port = control->port_from; + } else { + len = sizeof(struct sockaddr_in); + store.sin = control->whoFrom->ro._l_addr.sin; + store.sin.sin_port = control->port_from; + } +#else + len = sizeof(struct sockaddr_in); + store.sin = control->whoFrom->ro._l_addr.sin; + store.sin.sin_port = control->port_from; +#endif break; #endif default: + len = 0; break; } - memcpy(from, &control->whoFrom->ro._l_addr, cp_len); - -#if defined(INET) && defined(INET6) - if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) && - (from->sa_family == AF_INET) && - ((size_t)fromlen >= sizeof(struct sockaddr_in6))) { - struct sockaddr_in *sin; - struct sockaddr_in6 sin6; - - sin = (struct sockaddr_in *)from; - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); - bcopy(&sin->sin_addr, - &sin6.sin6_addr.s6_addr32[3], - sizeof(sin6.sin6_addr.s6_addr32[3])); - sin6.sin6_port = sin->sin_port; - memcpy(from, &sin6, sizeof(struct sockaddr_in6)); - } -#endif + memcpy(from, &store, min((size_t)fromlen, len)); #ifdef INET6 { struct sockaddr_in6 lsa6, *from6; @@ -5677,6 +5682,14 @@ found_one: } #endif } + if (hold_rlock) { + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + } + if (hold_sblock) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } /* now copy out what data we can */ if (mp == NULL) { /* copy out each mbuf in the chain up to length */ @@ -5708,15 +5721,8 @@ get_more_data: /* error we are out of here */ goto release; } - if ((SCTP_BUF_NEXT(m) == NULL) && - (cp_len >= SCTP_BUF_LEN(m)) && - ((control->end_added == 0) || - (control->end_added && - (TAILQ_NEXT(control, next) == NULL))) - ) { - SCTP_INP_READ_LOCK(inp); - hold_rlock = 1; - } + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; if (cp_len == SCTP_BUF_LEN(m)) { if ((SCTP_BUF_NEXT(m) == NULL) && (control->end_added)) { @@ -5834,19 +5840,9 @@ get_more_data: #endif } done_with_control: - if (TAILQ_NEXT(control, next) == NULL) { - /* - * If we don't have a next we need a - * lock, if there is a next - * interrupt is filling ahead of us - * and we don't need a lock to - * remove this guy (which is the - * head of the queue). - */ - if (hold_rlock == 0) { - SCTP_INP_READ_LOCK(inp); - hold_rlock = 1; - } + if (hold_rlock == 0) { + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; } TAILQ_REMOVE(&inp->read_queue, control, next); /* Add back any hiddend data */ @@ -5862,6 +5858,12 @@ get_more_data: no_rcv_needed = control->do_not_ref_stcb; sctp_free_remote_addr(control->whoFrom); control->data = NULL; +#ifdef INVARIANTS + if (control->on_strm_q) { + panic("About to free ctl:%p so:%p and its in %d", + control, so, control->on_strm_q); + } +#endif sctp_free_a_readq(stcb, control); control = NULL; if ((freed_so_far >= rwnd_req) && @@ -6077,7 +6079,7 @@ out: struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; - s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG; + s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG; } if (hold_rlock == 1) { SCTP_INP_READ_UNLOCK(inp); @@ -6103,21 +6105,21 @@ out: goto stage_left; #endif } - atomic_add_int(&stcb->asoc.refcnt, -1); /* Save the value back for next time */ stcb->freed_by_sorcv_sincelast = freed_so_far; + atomic_add_int(&stcb->asoc.refcnt, -1); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { if (stcb) { sctp_misc_ints(SCTP_SORECV_DONE, freed_so_far, - ((uio) ? (slen - uio->uio_resid) : slen), + (uint32_t) ((uio) ? 
(slen - uio->uio_resid) : slen), stcb->asoc.my_rwnd, so->so_rcv.sb_cc); } else { sctp_misc_ints(SCTP_SORECV_DONE, freed_so_far, - ((uio) ? (slen - uio->uio_resid) : slen), + (uint32_t) ((uio) ? (slen - uio->uio_resid) : slen), 0, so->so_rcv.sb_cc); } @@ -6135,9 +6137,7 @@ struct mbuf * sctp_m_free(struct mbuf *m) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { - if (SCTP_BUF_IS_EXTENDED(m)) { - sctp_log_mb(m, SCTP_MBUF_IFREE); - } + sctp_log_mb(m, SCTP_MBUF_IFREE); } return (m_free(m)); } @@ -6296,14 +6296,18 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_7); *error = EINVAL; goto out_now; } - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port, + SCTP_DONOT_SETSCOPE, + SCTP_ADDR_IS_CONFIRMED)) { /* assoc gone no un-lock */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_8); *error = ENOBUFS; goto out_now; } @@ -6317,14 +6321,18 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL); - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_9); *error = EINVAL; goto out_now; } - if (sctp_add_remote_addr(stcb, sa, NULL, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port, + SCTP_DONOT_SETSCOPE, + SCTP_ADDR_IS_CONFIRMED)) { /* assoc gone no un-lock */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); - (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTPUTIL + SCTP_LOC_10); *error = ENOBUFS; goto out_now; } @@ -6342,30 +6350,30 @@ out_now: struct sctp_tcb * sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, - int *totaddr, int *num_v4, int *num_v6, int *error, - int limit, int *bad_addr) + unsigned int *totaddr, + unsigned int *num_v4, unsigned int *num_v6, int *error, + unsigned int limit, int *bad_addr) { struct sockaddr *sa; struct sctp_tcb *stcb = NULL; - size_t incr, at, i; + unsigned int incr, at, i; - at = incr = 0; + at = 0; sa = addr; - *error = *num_v6 = *num_v4 = 0; /* account and validate addresses */ - for (i = 0; i < (size_t)*totaddr; i++) { + for (i = 0; i < *totaddr; i++) { switch (sa->sa_family) { #ifdef INET case AF_INET: - (*num_v4) += 1; - incr = sizeof(struct sockaddr_in); + incr = (unsigned int)sizeof(struct sockaddr_in); if (sa->sa_len != incr) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; *bad_addr = 1; return (NULL); } + (*num_v4) += 1; break; #endif #ifdef INET6 @@ -6381,14 +6389,14 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, *bad_addr = 1; return 
(NULL); } - (*num_v6) += 1; - incr = sizeof(struct sockaddr_in6); + incr = (unsigned int)sizeof(struct sockaddr_in6); if (sa->sa_len != incr) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; *bad_addr = 1; return (NULL); } + (*num_v6) += 1; break; } #endif @@ -6397,7 +6405,7 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, /* we are done */ break; } - if (i == (size_t)*totaddr) { + if (i == *totaddr) { break; } SCTP_INP_INCR_REF(inp); @@ -6408,7 +6416,7 @@ sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, } else { SCTP_INP_DECR_REF(inp); } - if ((at + incr) > (size_t)limit) { + if ((at + incr) > limit) { *totaddr = i; break; } @@ -6428,7 +6436,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, { struct sockaddr *addr_touse; -#ifdef INET6 +#if defined(INET) && defined(INET6) struct sockaddr_in sin; #endif @@ -6442,8 +6450,10 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, addr_touse = sa; #ifdef INET6 if (sa->sa_family == AF_INET6) { +#ifdef INET struct sockaddr_in6 *sin6; +#endif if (sa->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; @@ -6455,6 +6465,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, *error = EINVAL; return; } +#ifdef INET sin6 = (struct sockaddr_in6 *)addr_touse; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && @@ -6467,6 +6478,7 @@ sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, in6_sin6_2_sin(&sin, sin6); addr_touse = (struct sockaddr *)&sin; } +#endif } #endif #ifdef INET @@ -6556,7 +6568,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp, { struct sockaddr *addr_touse; -#ifdef INET6 +#if defined(INET) && defined(INET6) struct sockaddr_in sin; #endif @@ -6570,8 +6582,11 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp, addr_touse = sa; #ifdef INET6 if (sa->sa_family == AF_INET6) { +#ifdef INET struct sockaddr_in6 *sin6; +#endif + if (sa->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; @@ -6583,6 +6598,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp, *error = EINVAL; return; } +#ifdef INET sin6 = (struct sockaddr_in6 *)addr_touse; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && @@ -6595,6 +6611,7 @@ sctp_bindx_delete_address(struct sctp_inpcb *inp, in6_sin6_2_sin(&sin, sin6); addr_touse = (struct sockaddr *)&sin; } +#endif } #endif #ifdef INET @@ -6688,7 +6705,7 @@ sctp_local_addr_count(struct sctp_tcb *stcb) if (ipv4_addr_legal) { struct sockaddr_in *sin; - sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* * skip unspecified @@ -6716,7 +6733,7 @@ sctp_local_addr_count(struct sctp_tcb *stcb) if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; - sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { continue; } @@ -6810,7 +6827,8 @@ sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_ #endif static void -sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored) +sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, + const struct sockaddr *sa SCTP_UNUSED, void *ctx SCTP_UNUSED) { struct ip *iph; @@ -6834,7 +6852,7 @@ 
sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored) * Split out the mbuf chain. Leave the IP header in m, place the * rest in the sp. */ - sp = m_split(m, off, M_DONTWAIT); + sp = m_split(m, off, M_NOWAIT); if (sp == NULL) { /* Gak, drop packet, we can't do a split */ goto out; @@ -6857,11 +6875,23 @@ sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored) for (last = m; last->m_next; last = last->m_next); last->m_next = sp; m->m_pkthdr.len += sp->m_pkthdr.len; + /* + * The CSUM_DATA_VALID flags indicates that the HW checked the UDP + * checksum and it was valid. Since CSUM_DATA_VALID == + * CSUM_SCTP_VALID this would imply that the HW also verified the + * SCTP checksum. Therefore, clear the bit. + */ + SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, + "sctp_recv_udp_tunneled_packet(): Packet of length %d received on %s with csum_flags 0x%b.\n", + m->m_pkthdr.len, + if_name(m->m_pkthdr.rcvif), + (int)m->m_pkthdr.csum_flags, CSUM_BITS); + m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID; iph = mtod(m, struct ip *); switch (iph->ip_v) { #ifdef INET case IPVERSION: - iph->ip_len -= sizeof(struct udphdr); + iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); sctp_input_with_port(m, off, port); break; #endif @@ -6881,6 +6911,259 @@ out: m_freem(m); } +#ifdef INET +static void +sctp_recv_icmp_tunneled_packet(int cmd, struct sockaddr *sa, void *vip, void *ctx SCTP_UNUSED) +{ + struct ip *outer_ip, *inner_ip; + struct sctphdr *sh; + struct icmp *icmp; + struct udphdr *udp; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + struct sctp_init_chunk *ch; + struct sockaddr_in src, dst; + uint8_t type, code; + + inner_ip = (struct ip *)vip; + icmp = (struct icmp *)((caddr_t)inner_ip - + (sizeof(struct icmp) - sizeof(struct ip))); + outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); + if (ntohs(outer_ip->ip_len) < + sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + sizeof(struct udphdr) + 8) { + return; + } + udp = (struct udphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2)); + sh = (struct sctphdr *)(udp + 1); + memset(&src, 0, sizeof(struct sockaddr_in)); + src.sin_family = AF_INET; + src.sin_len = sizeof(struct sockaddr_in); + src.sin_port = sh->src_port; + src.sin_addr = inner_ip->ip_src; + memset(&dst, 0, sizeof(struct sockaddr_in)); + dst.sin_family = AF_INET; + dst.sin_len = sizeof(struct sockaddr_in); + dst.sin_port = sh->dest_port; + dst.sin_addr = inner_ip->ip_dst; + /* + * 'dst' holds the dest of the packet that failed to be sent. 'src' + * holds our local endpoint address. Thus we reverse the dst and the + * src in the lookup. + */ + inp = NULL; + net = NULL; + stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, + (struct sockaddr *)&src, + &inp, &net, 1, + SCTP_DEFAULT_VRFID); + if ((stcb != NULL) && + (net != NULL) && + (inp != NULL)) { + /* Check the UDP port numbers */ + if ((udp->uh_dport != net->port) || + (udp->uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) { + SCTP_TCB_UNLOCK(stcb); + return; + } + /* Check the verification tag */ + if (ntohl(sh->v_tag) != 0) { + /* + * This must be the verification tag used for + * sending out packets. We don't consider packets + * reflecting the verification tag. 
+ */ + if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) { + SCTP_TCB_UNLOCK(stcb); + return; + } + } else { + if (ntohs(outer_ip->ip_len) >= + sizeof(struct ip) + + 8 + (inner_ip->ip_hl << 2) + 8 + 20) { + /* + * In this case we can check if we got an + * INIT chunk and if the initiate tag + * matches. + */ + ch = (struct sctp_init_chunk *)(sh + 1); + if ((ch->ch.chunk_type != SCTP_INITIATION) || + (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) { + SCTP_TCB_UNLOCK(stcb); + return; + } + } else { + SCTP_TCB_UNLOCK(stcb); + return; + } + } + type = icmp->icmp_type; + code = icmp->icmp_code; + if ((type == ICMP_UNREACH) && + (code == ICMP_UNREACH_PORT)) { + code = ICMP_UNREACH_PROTOCOL; + } + sctp_notify(inp, stcb, net, type, code, + ntohs(inner_ip->ip_len), + ntohs(icmp->icmp_nextmtu)); + } else { + if ((stcb == NULL) && (inp != NULL)) { + /* reduce ref-count */ + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + } + return; +} + +#endif + +#ifdef INET6 +static void +sctp_recv_icmp6_tunneled_packet(int cmd, struct sockaddr *sa, void *d, void *ctx SCTP_UNUSED) +{ + struct ip6ctlparam *ip6cp; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + struct sctphdr sh; + struct udphdr udp; + struct sockaddr_in6 src, dst; + uint8_t type, code; + + ip6cp = (struct ip6ctlparam *)d; + /* + * XXX: We assume that when IPV6 is non NULL, M and OFF are valid. + */ + if (ip6cp->ip6c_m == NULL) { + return; + } + /* + * Check if we can safely examine the ports and the verification tag + * of the SCTP common header. + */ + if (ip6cp->ip6c_m->m_pkthdr.len < + ip6cp->ip6c_off + sizeof(struct udphdr) + offsetof(struct sctphdr, checksum)) { + return; + } + /* Copy out the UDP header. */ + memset(&udp, 0, sizeof(struct udphdr)); + m_copydata(ip6cp->ip6c_m, + ip6cp->ip6c_off, + sizeof(struct udphdr), + (caddr_t)&udp); + /* Copy out the port numbers and the verification tag. */ + memset(&sh, 0, sizeof(struct sctphdr)); + m_copydata(ip6cp->ip6c_m, + ip6cp->ip6c_off + sizeof(struct udphdr), + sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t), + (caddr_t)&sh); + memset(&src, 0, sizeof(struct sockaddr_in6)); + src.sin6_family = AF_INET6; + src.sin6_len = sizeof(struct sockaddr_in6); + src.sin6_port = sh.src_port; + src.sin6_addr = ip6cp->ip6c_ip6->ip6_src; + if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { + return; + } + memset(&dst, 0, sizeof(struct sockaddr_in6)); + dst.sin6_family = AF_INET6; + dst.sin6_len = sizeof(struct sockaddr_in6); + dst.sin6_port = sh.dest_port; + dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst; + if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { + return; + } + inp = NULL; + net = NULL; + stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, + (struct sockaddr *)&src, + &inp, &net, 1, SCTP_DEFAULT_VRFID); + if ((stcb != NULL) && + (net != NULL) && + (inp != NULL)) { + /* Check the UDP port numbers */ + if ((udp.uh_dport != net->port) || + (udp.uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) { + SCTP_TCB_UNLOCK(stcb); + return; + } + /* Check the verification tag */ + if (ntohl(sh.v_tag) != 0) { + /* + * This must be the verification tag used for + * sending out packets. We don't consider packets + * reflecting the verification tag. 
+ */ + if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) { + SCTP_TCB_UNLOCK(stcb); + return; + } + } else { + if (ip6cp->ip6c_m->m_pkthdr.len >= + ip6cp->ip6c_off + sizeof(struct udphdr) + + sizeof(struct sctphdr) + + sizeof(struct sctp_chunkhdr) + + offsetof(struct sctp_init, a_rwnd)) { + /* + * In this case we can check if we got an + * INIT chunk and if the initiate tag + * matches. + */ + uint32_t initiate_tag; + uint8_t chunk_type; + + m_copydata(ip6cp->ip6c_m, + ip6cp->ip6c_off + + sizeof(struct udphdr) + + sizeof(struct sctphdr), + sizeof(uint8_t), + (caddr_t)&chunk_type); + m_copydata(ip6cp->ip6c_m, + ip6cp->ip6c_off + + sizeof(struct udphdr) + + sizeof(struct sctphdr) + + sizeof(struct sctp_chunkhdr), + sizeof(uint32_t), + (caddr_t)&initiate_tag); + if ((chunk_type != SCTP_INITIATION) || + (ntohl(initiate_tag) != stcb->asoc.my_vtag)) { + SCTP_TCB_UNLOCK(stcb); + return; + } + } else { + SCTP_TCB_UNLOCK(stcb); + return; + } + } + type = ip6cp->ip6c_icmp6->icmp6_type; + code = ip6cp->ip6c_icmp6->icmp6_code; + if ((type == ICMP6_DST_UNREACH) && + (code == ICMP6_DST_UNREACH_NOPORT)) { + type = ICMP6_PARAM_PROB; + code = ICMP6_PARAMPROB_NEXTHEADER; + } + sctp6_notify(inp, stcb, net, type, code, + (uint16_t) ntohl(ip6cp->ip6c_icmp6->icmp6_mtu)); + } else { + if ((stcb == NULL) && (inp != NULL)) { + /* reduce inp's ref-count */ + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + } +} + +#endif + void sctp_over_udp_stop(void) { @@ -6946,7 +7229,9 @@ sctp_over_udp_start(void) } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp4_tun_socket), - sctp_recv_udp_tunneled_packet))) { + sctp_recv_udp_tunneled_packet, + sctp_recv_icmp_tunneled_packet, + NULL))) { sctp_over_udp_stop(); return (ret); } @@ -6970,7 +7255,9 @@ sctp_over_udp_start(void) } /* Call the special UDP hook. 
*/ if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp6_tun_socket), - sctp_recv_udp_tunneled_packet))) { + sctp_recv_udp_tunneled_packet, + sctp_recv_icmp6_tunneled_packet, + NULL))) { sctp_over_udp_stop(); return (ret); } diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h index af5a0f29..292068af 100644 --- a/freebsd/sys/netinet/sctputil.h +++ b/freebsd/sys/netinet/sctputil.h @@ -67,6 +67,9 @@ void /* * Function prototypes */ +int32_t +sctp_map_assoc_state(int); + uint32_t sctp_get_ifa_hash_val(struct sockaddr *addr); @@ -80,7 +83,7 @@ uint32_t sctp_select_initial_TSN(struct sctp_pcb *); uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int); -int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t); +int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t, uint16_t); void sctp_fill_random_store(struct sctp_pcb *); @@ -104,6 +107,14 @@ int void sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t); +void +sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + void sctp_add_to_readq(struct sctp_inpcb *inp, struct sctp_tcb *stcb, @@ -117,16 +128,6 @@ sctp_add_to_readq(struct sctp_inpcb *inp, #endif ); -int -sctp_append_to_readq(struct sctp_inpcb *inp, - struct sctp_tcb *stcb, - struct sctp_queued_to_read *control, - struct mbuf *m, - int end, - int new_cumack, - struct sockbuf *sb); - - void sctp_iterator_worker(void); uint32_t sctp_get_prev_mtu(uint32_t); @@ -147,9 +148,11 @@ struct sctp_paramhdr * sctp_get_next_param(struct mbuf *, int, struct sctp_paramhdr *, int); -int sctp_add_pad_tombuf(struct mbuf *, int); +struct mbuf * + sctp_add_pad_tombuf(struct mbuf *, int); -int sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); +struct mbuf * + sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); void sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int @@ -206,7 +209,7 @@ sctp_handle_ootb(struct mbuf *, int, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct sctp_inpcb *, struct mbuf *, - uint8_t, uint32_t, + uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); int @@ -215,7 +218,8 @@ sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, struct sctp_tcb * sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, - int *totaddr, int *num_v4, int *num_v6, int *error, int limit, int *bad_addr); + unsigned int *totaddr, unsigned int *num_v4, unsigned int *num_v6, + int *error, unsigned int limit, int *bad_addr); int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *); @@ -276,42 +280,42 @@ sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *, #define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt) \ do { \ if (tp1->data != NULL) { \ - atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \ + atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \ if ((asoc)->total_output_queue_size >= tp1->book_size) { \ atomic_subtract_int(&((asoc)->total_output_queue_size), tp1->book_size); \ } else { \ (asoc)->total_output_queue_size = 0; \ } \ - if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ - (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ + if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ if 
(stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \ atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \ } else { \ stcb->sctp_socket->so_snd.sb_cc = 0; \ } \ } \ - } \ + } \ } while (0) #endif #define sctp_free_spbufspace(stcb, asoc, sp) \ do { \ - if (sp->data != NULL) { \ + if (sp->data != NULL) { \ if ((asoc)->total_output_queue_size >= sp->length) { \ atomic_subtract_int(&(asoc)->total_output_queue_size, sp->length); \ } else { \ (asoc)->total_output_queue_size = 0; \ } \ - if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ - (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ + if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \ atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \ } else { \ stcb->sctp_socket->so_snd.sb_cc = 0; \ } \ } \ - } \ + } \ } while (0) #define sctp_snd_sb_alloc(stcb, sz) \ @@ -347,9 +351,15 @@ void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t, void sctp_log_nagle_event(struct sctp_tcb *stcb, int action); +#ifdef SCTP_MBUF_LOGGING void sctp_log_mb(struct mbuf *m, int from); +void + sctp_log_mbc(struct mbuf *m, int from); + +#endif + void sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr); @@ -365,9 +375,8 @@ void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from); void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t); -void sctp_log_block(uint8_t, struct sctp_association *, int); +void sctp_log_block(uint8_t, struct sctp_association *, size_t); void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t); -void sctp_log_mbcnt(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); int sctp_fill_stat_log(void *, size_t *); void sctp_log_fr(uint32_t, uint32_t, uint32_t, int); diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h index fb2f8108..47038104 100644 --- a/freebsd/sys/netinet/tcp.h +++ b/freebsd/sys/netinet/tcp.h @@ -97,6 +97,10 @@ struct tcphdr { #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +#define TCPOPT_FAST_OPEN 34 +#define TCPOLEN_FAST_OPEN_EMPTY 2 +#define TCPOLEN_FAST_OPEN_MIN 6 +#define TCPOLEN_FAST_OPEN_MAX 18 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ @@ -161,11 +165,15 @@ struct tcphdr { #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ - +#define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ +#define TCP_PCAP_OUT 2048 /* number of output packets to keep */ +#define TCP_PCAP_IN 4096 /* number of input packets to keep */ +#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ /* Start of 
reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -243,5 +251,11 @@ struct tcp_info { u_int32_t __tcpi_pad[26]; /* Padding. */ }; #endif +#define TCP_FUNCTION_NAME_LEN_MAX 32 + +struct tcp_function_set { + char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; + uint32_t pcbcnt; +}; #endif /* !_NETINET_TCP_H_ */ diff --git a/freebsd/sys/netinet/tcp_debug.c b/freebsd/sys/netinet/tcp_debug.c index 2ef9ce43..c5f74182 100644 --- a/freebsd/sys/netinet/tcp_debug.c +++ b/freebsd/sys/netinet/tcp_debug.c @@ -177,11 +177,10 @@ tcp_trace(short act, short ostate, struct tcpcb *tp, void *ipgen, #ifdef INET6 isipv6 ? ntohs(((struct ip6_hdr *)ipgen)->ip6_plen) : #endif - ((struct ip *)ipgen)->ip_len; + ntohs(((struct ip *)ipgen)->ip_len); if (act == TA_OUTPUT) { seq = ntohl(seq); ack = ntohl(ack); - len = ntohs((u_short)len); } if (act == TA_OUTPUT) len -= sizeof (struct tcphdr); diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c index 260d161d..4e78b8b2 100644 --- a/freebsd/sys/netinet/tcp_hostcache.c +++ b/freebsd/sys/netinet/tcp_hostcache.c @@ -34,8 +34,8 @@ * table to a dedicated structure indexed by the remote IP address. It keeps * information on the measured TCP parameters of past TCP sessions to allow * better initial start values to be used with later connections to/from the - * same source. Depending on the network parameters (delay, bandwidth, max - * MTU, congestion window) between local and remote sites, this can lead to + * same source. Depending on the network parameters (delay, max MTU, + * congestion window) between local and remote sites, this can lead to * significant speed-ups for new TCP connections after the first one. * * Due to the tcp_hostcache, all TCP-specific metrics information in the @@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -118,37 +119,38 @@ static VNET_DEFINE(struct callout, tcp_hc_callout); static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); +static int sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS); static void tcp_hc_purge_internal(int); static void tcp_hc_purge(void *); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.cache_limit), 0, "Overall entry limit for hostcache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.hashsize), 0, "Size of TCP hostcache hashtable"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, - CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, + CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, "Per-bucket hash limit for hostcache"); -SYSCTL_VNET_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_hostcache.cache_count), 0, "Current number of entries in hostcache"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_VNET | CTLFLAG_RW, 
&VNET_NAME(tcp_hostcache.expire), 0, "Expire time of TCP hostcache entries"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.prune), 0, "Time between purge runs"); -SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_hostcache.purgeall), 0, "Expire all entires on next purge run"); @@ -156,6 +158,9 @@ SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, sysctl_tcp_hc_list, "A", "List of all hostcache entries"); +SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, purgenow, + CTLTYPE_INT | CTLFLAG_RW, NULL, 0, + sysctl_tcp_hc_purgenow, "I", "Immediately purge all entries"); static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); @@ -235,7 +240,7 @@ tcp_hc_init(void) /* * Set up periodic cache cleanup. */ - callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); + callout_init(&V_tcp_hc_callout, 1); callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, tcp_hc_purge, curvnet); } @@ -297,6 +302,7 @@ tcp_hc_lookup(struct in_conninfo *inc) */ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { if (inc->inc_flags & INC_ISIPV6) { + /* XXX: check ip6_zoneid */ if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, sizeof(inc->inc6_faddr)) == 0) return hc_entry; @@ -388,9 +394,10 @@ tcp_hc_insert(struct in_conninfo *inc) * Initialize basic information of hostcache entry. */ bzero(hc_entry, sizeof(*hc_entry)); - if (inc->inc_flags & INC_ISIPV6) - bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6)); - else + if (inc->inc_flags & INC_ISIPV6) { + hc_entry->ip6 = inc->inc6_faddr; + hc_entry->ip6_zoneid = inc->inc6_zoneid; + } else hc_entry->ip4 = inc->inc_faddr; hc_entry->rmx_head = hc_head; hc_entry->rmx_expire = V_tcp_hostcache.expire; @@ -435,7 +442,6 @@ tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; - hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth; hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; @@ -550,14 +556,6 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; TCPSTAT_INC(tcps_cachedssthresh); } - if (hcml->rmx_bandwidth != 0) { - if (hc_entry->rmx_bandwidth == 0) - hc_entry->rmx_bandwidth = hcml->rmx_bandwidth; - else - hc_entry->rmx_bandwidth = - (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; - /* TCPSTAT_INC(tcps_cachedbandwidth); */ - } if (hcml->rmx_cwnd != 0) { if (hc_entry->rmx_cwnd == 0) hc_entry->rmx_cwnd = hcml->rmx_cwnd; @@ -595,7 +593,7 @@ tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) { - int linesize = 128; + const int linesize = 128; struct sbuf sb; int i, error; struct hc_metrics *hc_entry; @@ -604,10 +602,10 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) #endif sbuf_new(&sb, NULL, linesize * (V_tcp_hostcache.cache_count + 1), - SBUF_FIXEDLEN); + SBUF_INCLUDENUL); sbuf_printf(&sb, - "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH " + "\nIP address MTU SSTRESH RTT RTTVAR " " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); #define msec(u) (((u) + 500) 
/ 1000) @@ -616,8 +614,8 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q) { sbuf_printf(&sb, - "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " - "%4lu %4lu %4i\n", + "%-15s %5lu %8lu %6lums %6lums %8lu %8lu %8lu %4lu " + "%4lu %4i\n", hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) : #ifdef INET6 ip6_sprintf(ip6buf, &hc_entry->ip6), @@ -630,7 +628,6 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), msec(hc_entry->rmx_rttvar * (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE))), - hc_entry->rmx_bandwidth * 8, hc_entry->rmx_cwnd, hc_entry->rmx_sendpipe, hc_entry->rmx_recvpipe, @@ -641,8 +638,9 @@ sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); } #undef msec - sbuf_finish(&sb); - error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); + error = sbuf_finish(&sb); + if (error == 0) + error = SYSCTL_OUT(req, sbuf_data(&sb), sbuf_len(&sb)); sbuf_delete(&sb); return(error); } @@ -694,3 +692,24 @@ tcp_hc_purge(void *arg) tcp_hc_purge, arg); CURVNET_RESTORE(); } + +/* + * Expire and purge all entries in hostcache immediately. + */ +static int +sysctl_tcp_hc_purgenow(SYSCTL_HANDLER_ARGS) +{ + int error, val; + + val = 0; + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + + tcp_hc_purge_internal(1); + + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, curvnet); + + return (0); +} diff --git a/freebsd/sys/netinet/tcp_hostcache.h b/freebsd/sys/netinet/tcp_hostcache.h index 8569edcc..44875ff6 100644 --- a/freebsd/sys/netinet/tcp_hostcache.h +++ b/freebsd/sys/netinet/tcp_hostcache.h @@ -51,12 +51,12 @@ struct hc_metrics { struct hc_head *rmx_head; /* head of bucket tail queue */ struct in_addr ip4; /* IP address */ struct in6_addr ip6; /* IP6 address */ + uint32_t ip6_zoneid; /* IPv6 scope zone id */ /* endpoint specific values for tcp */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ - u_long rmx_bandwidth; /* estimated bandwidth */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c index f9512eb3..eaa3eb3d 100644 --- a/freebsd/sys/netinet/tcp_input.c +++ b/freebsd/sys/netinet/tcp_input.c @@ -52,7 +52,6 @@ #include __FBSDID("$FreeBSD$"); -#include /* for ipfw_fwd */ #include #include #include @@ -65,6 +64,7 @@ __FBSDID("$FreeBSD$"); #include #include /* for proc0 declaration */ #include +#include #include #include #include @@ -77,16 +77,16 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #define TCPSTATES /* for logging */ -#include #include +#include #include #include -#include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ @@ -95,14 +95,23 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include +#ifdef TCP_RFC7413 +#include +#endif +#include #include #include #include #include #include #include +#include +#ifdef TCPPCAP +#include +#endif #include #ifdef TCPDEBUG #include @@ -122,11 +131,6 @@ __FBSDID("$FreeBSD$"); const int tcprexmtthresh = 3; -VNET_DEFINE(struct tcpstat, tcpstat); -SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(tcpstat), tcpstat, - "TCP 
statistics (struct tcpstat, netinet/tcp_var.h)"); - int tcp_log_in_vain = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, &tcp_log_in_vain, 0, @@ -134,88 +138,96 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(int, tcp_delack_enabled) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; #define V_drop_synfin VNET(drop_synfin) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); +VNET_DEFINE(int, tcp_do_rfc6675_pipe) = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc6675_pipe, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_do_rfc6675_pipe), 0, + "Use calculated pipe/in-flight bytes per RFC 6675"); + VNET_DEFINE(int, tcp_do_rfc3042) = 1; #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, experimental, CTLFLAG_RW, 0, - "Experimental TCP extensions"); - -VNET_DEFINE(int, tcp_do_initcwnd10) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_experimental, OID_AUTO, initcwnd10, CTLFLAG_RW, - &VNET_NAME(tcp_do_initcwnd10), 0, - "Enable RFC 6928 (Increasing initial CWND to 10)"); +VNET_DEFINE(int, tcp_initcwnd_segments) = 10; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, + "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); -VNET_DEFINE(int, tcp_do_ecn) = 0; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, +VNET_DEFINE(int, tcp_do_ecn) = 2; +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving 
up on ECN"); +VNET_DEFINE(int, tcp_insecure_syn) = 0; +#define V_tcp_insecure_syn VNET(tcp_insecure_syn) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(tcp_insecure_syn), 0, + "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); + VNET_DEFINE(int, tcp_insecure_rst) = 0; #define V_tcp_insecure_rst VNET(tcp_insecure_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, - "Follow the old (insecure) criteria for accepting RST packets"); + "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_inc), 0, "Incrementor step size of automatic receive buffer"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); @@ -223,47 +235,55 @@ VNET_DEFINE(struct inpcbhead, tcb); #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); -static void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); -static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, - struct tcpcb *, int, int); -static void tcp_pulloutofband(struct socket *, - struct tcphdr *, struct mbuf *, int); -static void tcp_xmit_timer(struct tcpcb *, int); -static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline tcp_fields_to_host(struct tcphdr *); -#ifdef TCP_SIGNATURE -static void inline tcp_fields_to_net(struct tcphdr *); -static int inline tcp_signature_verify_input(struct mbuf *, int, int, - int, struct tcpopt *, struct tcphdr *, u_int); -#endif -static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, - uint16_t type); -static void inline cc_conn_init(struct tcpcb *tp); -static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); -static void inline hhook_run_tcp_est_in(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to); +/* + * TCP statistics are stored in an array of counter(9)s, which size matches + * size of struct tcpstat. TCP running connection count is a regular array. 
+ */ +VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, + tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); +VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); +SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | + CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, + "TCP connection counts by TCP state"); + +static void +tcp_vnet_init(const void *unused) +{ + + COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); + VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); +} +VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + tcp_vnet_init, NULL); + +#ifdef VIMAGE +static void +tcp_vnet_uninit(const void *unused) +{ + + COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); + VNET_PCPUSTAT_FREE(tcpstat); +} +VNET_SYSUNINIT(tcp_vnet_uninit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, + tcp_vnet_uninit, NULL); +#endif /* VIMAGE */ /* * Kernel module interface for updating tcpstat. The argument is an index - * into tcpstat treated as an array of u_long. While this encodes the - * general layout of tcpstat into the caller, it doesn't encode its location, - * so that future changes to add, for example, per-CPU stats support won't - * cause binary compatibility problems for kernel modules. + * into tcpstat treated as an array. */ void kmod_tcpstat_inc(int statnum) { - (*((u_long *)&V_tcpstat + statnum))++; + counter_u64_add(VNET(tcpstat)[statnum], 1); } /* * Wrapper for the TCP established input helper hook. */ -static void inline +void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; @@ -281,7 +301,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) /* * CC wrapper hook functions */ -static void inline +void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -295,7 +315,7 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) if (type == CC_ACK) { if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - V_tcp_abc_l_var * tp->t_maxseg); + V_tcp_abc_l_var * tcp_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -313,16 +333,18 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) } } -static void inline +void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tp->t_inpcb; + u_int maxseg; int rtt; INP_WLOCK_ASSERT(tp->t_inpcb); tcp_hc_get(&inp->inp_inc, &metrics); + maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { tp->t_srtt = rtt; @@ -344,10 +366,10 @@ cc_conn_init(struct tcpcb *tp) /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set - * the slow start threshhold, but set the + * the slow start threshold, but set the * threshold to no less than 2*mss. */ - tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh); + tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } @@ -357,27 +379,27 @@ cc_conn_init(struct tcpcb *tp) * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. + * Support for user specified value for initial flight size. 
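The cc_conn_init() hunk continuing below replaces the boolean initcwnd10 experiment with a tunable segment count. The resulting selection order, restated as a pure function (a sketch, not the kernel code; MIN/MAX as in sys/param.h):

#include <sys/param.h>          /* MIN(), MAX() */

static u_int
initial_cwnd(u_int maxseg, int syn_rexmitted, u_int initcwnd_segments,
    int do_rfc3390)
{
        if (syn_rexmitted)              /* lost SYN(-ACK): stay at 1 MSS */
                return (maxseg);
        if (initcwnd_segments)          /* RFC 6928 style, user sizable */
                return (MIN(initcwnd_segments * maxseg,
                    MAX(2 * maxseg, initcwnd_segments * 1460)));
        if (do_rfc3390)                 /* RFC 3390 */
                return (MIN(4 * maxseg, MAX(2 * maxseg, 4380)));
        if (maxseg > 2190)              /* RFC 5681 section 3.1 */
                return (2 * maxseg);
        if (maxseg > 1095)
                return (3 * maxseg);
        return (4 * maxseg);
}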
* * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) - tp->snd_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ - else if (V_tcp_do_initcwnd10) - tp->snd_cwnd = min(10 * tp->t_maxseg, - max(2 * tp->t_maxseg, 14600)); + tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ + else if (V_tcp_initcwnd_segments) + tp->snd_cwnd = min(V_tcp_initcwnd_segments * maxseg, + max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) - tp->snd_cwnd = min(4 * tp->t_maxseg, - max(2 * tp->t_maxseg, 4380)); + tp->snd_cwnd = min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ - if (tp->t_maxseg > 2190) - tp->snd_cwnd = 2 * tp->t_maxseg; - else if (tp->t_maxseg > 1095) - tp->snd_cwnd = 3 * tp->t_maxseg; + if (maxseg > 2190) + tp->snd_cwnd = 2 * maxseg; + else if (maxseg > 1095) + tp->snd_cwnd = 3 * maxseg; else - tp->snd_cwnd = 4 * tp->t_maxseg; + tp->snd_cwnd = 4 * maxseg; } if (CC_ALGO(tp)->conn_init != NULL) @@ -387,6 +409,8 @@ cc_conn_init(struct tcpcb *tp) void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { + u_int maxseg; + INP_WLOCK_ASSERT(tp->t_inpcb); switch(type) { @@ -406,12 +430,13 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) } break; case CC_RTO: + maxseg = tcp_maxseg(tp); tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg) * tp->t_maxseg; - tp->snd_cwnd = tp->t_maxseg; + maxseg) * maxseg; + tp->snd_cwnd = maxseg; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); @@ -436,7 +461,7 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) } } -static void inline +void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -451,27 +476,7 @@ cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) tp->t_bytes_acked = 0; } -static inline void -tcp_fields_to_host(struct tcphdr *th) -{ - - th->th_seq = ntohl(th->th_seq); - th->th_ack = ntohl(th->th_ack); - th->th_win = ntohs(th->th_win); - th->th_urp = ntohs(th->th_urp); -} - #ifdef TCP_SIGNATURE -static inline void -tcp_fields_to_net(struct tcphdr *th) -{ - - th->th_seq = htonl(th->th_seq); - th->th_ack = htonl(th->th_ack); - th->th_win = htons(th->th_win); - th->th_urp = htons(th->th_urp); -} - static inline int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, struct tcpopt *to, struct tcphdr *th, u_int tcpbflag) @@ -485,34 +490,56 @@ tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen, } #endif -/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ -#ifdef INET6 -#define ND6_HINT(tp) \ -do { \ - if ((tp) && (tp)->t_inpcb && \ - ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ - nd6_nud_hint(NULL, NULL, 0); \ -} while (0) -#else -#define ND6_HINT(tp) -#endif - /* * Indicate whether this ack should be delayed. We can delay the ack if - * - there is no delayed ack timer in progress and - * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window and - * - delayed acks are enabled or - * - this is a half-synchronized T/TCP connection. - * - the segment size is not larger than the MSS and LRO wasn't used - * for this segment. + * following conditions are met: + * - There is no delayed ack timer in progress. + * - Our last ack wasn't a 0-sized window. 
We never want to delay + * the ack that opens up a 0-sized window. + * - LRO wasn't used for this segment. We make sure by checking that the + * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tlen <= tp->t_maxopd) && \ + (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) +static void inline +cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) +{ + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (CC_ALGO(tp)->ecnpkt_handler != NULL) { + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->ccv->flags |= CCF_IPHDR_CE; + break; + case IPTOS_ECN_ECT0: + tp->ccv->flags &= ~CCF_IPHDR_CE; + break; + case IPTOS_ECN_ECT1: + tp->ccv->flags &= ~CCF_IPHDR_CE; + break; + } + + if (th->th_flags & TH_CWR) + tp->ccv->flags |= CCF_TCPHDR_CWR; + else + tp->ccv->flags &= ~CCF_TCPHDR_CWR; + + if (tp->t_flags & TF_DELACK) + tp->ccv->flags |= CCF_DELACK; + else + tp->ccv->flags &= ~CCF_DELACK; + + CC_ALGO(tp)->ecnpkt_handler(tp->ccv); + + if (tp->ccv->flags & CCF_ACKNOW) + tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + } +} + /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended @@ -528,6 +555,7 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct in6_ifaddr *ia6; + struct ip6_hdr *ip6; IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); @@ -535,7 +563,8 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ - ia6 = ip6_getdstifaddr(m); + ip6 = mtod(m, struct ip6_hdr *); + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { struct ip6_hdr *ip6; @@ -543,28 +572,26 @@ tcp6_input(struct mbuf **mp, int *offp, int proto) ip6 = mtod(m, struct ip6_hdr *); icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - return IPPROTO_DONE; + return (IPPROTO_DONE); } if (ia6) ifa_free(&ia6->ia_ifa); - tcp_input(m, *offp); - return IPPROTO_DONE; + return (tcp_input(mp, offp, proto)); } #endif /* INET6 */ -void -tcp_input(struct mbuf *m, int off0) +int +tcp_input(struct mbuf **mp, int *offp, int proto) { + struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; -#ifdef INET - struct ipovly *ipov; -#endif struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; + int off0; int optlen = 0; #ifdef INET int len; @@ -587,9 +614,6 @@ tcp_input(struct mbuf *m, int off0) struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; -#define TI_UNLOCKED 1 -#define TI_WLOCKED 2 - #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -604,6 +628,9 @@ tcp_input(struct mbuf *m, int off0) isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif + off0 = *offp; + m = *mp; + *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); @@ -615,7 +642,7 @@ tcp_input(struct mbuf *m, int off0) m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } } @@ -660,45 +687,43 @@ tcp_input(struct mbuf *m, int off0) * Note: IP leaves IP header in first mbuf. 
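The cc_ecnpkt_handler() added above gives congestion control modules per-packet visibility into ECN: the IP codepoint and the TCP CWR flag are folded into ccv->flags before the module's ecnpkt_handler runs. The mapping as a standalone sketch, with stand-in flag names for the CCF_* bits:

#include <stdint.h>
#include <netinet/ip.h>         /* IPTOS_ECN_* codepoints, RFC 3168 */

#define FL_IPHDR_CE     0x1     /* stand-in for CCF_IPHDR_CE */
#define FL_TCPHDR_CWR   0x2     /* stand-in for CCF_TCPHDR_CWR */

static int
ecn_flags(uint8_t iptos, int tcp_cwr_set)
{
        int fl = 0;

        /* ECT0/ECT1 leave CE clear; only CE (0x3) sets it. */
        if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
                fl |= FL_IPHDR_CE;
        /* CWR means the peer has already reduced its window. */
        if (tcp_cwr_set)
                fl |= FL_TCPHDR_CWR;
        return (fl);
}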
*/ if (off0 > sizeof (struct ip)) { - ip_stripoptions(m, (struct mbuf *)0); + ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); - tlen = ip->ip_len; + tlen = ntohs(ip->ip_len) - off0; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htonl(m->m_pkthdr.csum_data + - ip->ip_len + - IPPROTO_TCP)); + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + tlen + + IPPROTO_TCP)); th->th_sum ^= 0xffff; -#ifdef TCPDEBUG - ipov->ih_len = (u_short)tlen; - ipov->ih_len = htons(ipov->ih_len); -#endif } else { + struct ipovly *ipov = (struct ipovly *)ip; + /* * Checksum extended TCP header and data. */ - len = sizeof (struct ip) + tlen; + len = off0 + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); - ipov->ih_len = (u_short)tlen; - ipov->ih_len = htons(ipov->ih_len); + ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); + /* Reset length for SDT probes. */ + ip->ip_len = htons(tlen + off0); } + if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; @@ -732,7 +757,7 @@ tcp_input(struct mbuf *m, int off0) if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { - IP6_EXTHDR_CHECK(m, off0, off, ); + IP6_EXTHDR_CHECK(m, off0, off, IPPROTO_DONE); ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } @@ -746,10 +771,9 @@ tcp_input(struct mbuf *m, int off0) if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; th = (struct tcphdr *)((caddr_t)ip + off0); } } @@ -771,26 +795,17 @@ tcp_input(struct mbuf *m, int off0) /* * Locate pcb for segment; if we're likely to add or remove a - * connection then first acquire pcbinfo lock. There are two cases + * connection then first acquire pcbinfo lock. There are three cases * where we might discover later we need a write lock despite the - * flags: ACKs moving a connection out of the syncache, and ACKs for - * a connection in TIMEWAIT. + * flags: ACKs moving a connection out of the syncache, ACKs for a + * connection in TIMEWAIT and SYNs not targeting a listening socket. */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) { - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + if ((thflags & (TH_FIN | TH_RST)) != 0) { + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; -findpcb: -#ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - } else { - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - } -#endif - /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ @@ -807,6 +822,14 @@ findpcb: ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); +findpcb: +#ifdef INVARIANTS + if (ti_locked == TI_RLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } else { + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + } +#endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; @@ -831,10 +854,6 @@ findpcb: th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } - /* Remove the tag from the packet. We don't need it anymore. 
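The checksum rework above keeps ip_len in network byte order (tlen is now derived via ntohs()) and folds the IPv4 pseudo-header through in_pseudo() when hardware validated only the payload sum. The underlying arithmetic is the RFC 1071 ones-complement sum; a standalone version for reference, not the kernel's optimized in_cksum():

#include <stddef.h>
#include <stdint.h>

static uint16_t
cksum1071(const void *buf, size_t len, uint32_t start)
{
        const uint8_t *p = buf;
        uint32_t sum = start;

        while (len > 1) {               /* sum 16-bit big-endian words */
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte, zero padded */
                sum += (uint32_t)p[0] << 8;
        while (sum >> 16)               /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return ((uint16_t)~sum);        /* ones-complement of the sum */
}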
*/ - m_tag_delete(m, fwd_tag); - m->m_flags &= ~M_IP6_NEXTHOP; - fwd_tag = NULL; } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, @@ -869,10 +888,6 @@ findpcb: th->th_dport, INPLOOKUP_WILDCARD | INPLOOKUP_WLOCKPCB, m->m_pkthdr.rcvif); } - /* Remove the tag from the packet. We don't need it anymore. */ - m_tag_delete(m, fwd_tag); - m->m_flags &= ~M_IP_NEXTHOP; - fwd_tag = NULL; } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, @@ -908,23 +923,20 @@ findpcb: goto dropwithreset; } INP_WLOCK_ASSERT(inp); - if (!(inp->inp_flags & INP_HW_FLOWID) - && (m->m_flags & M_FLOWID) - && ((inp->inp_socket == NULL) - || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { - inp->inp_flags |= INP_HW_FLOWID; - inp->inp_flags &= ~INP_SW_FLOWID; + if ((inp->inp_flowtype == M_HASHTYPE_NONE) && + (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && + ((inp->inp_socket == NULL) || + (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; + inp->inp_flowtype = M_HASHTYPE_GET(m); } #ifdef IPSEC #ifdef INET6 if (isipv6 && ipsec6_in_reject(m, inp)) { - IPSEC6STAT_INC(in_polvio); goto dropunlock; } else #endif /* INET6 */ if (ipsec4_in_reject(m, inp) != 0) { - IPSECSTAT_INC(in_polvio); goto dropunlock; } #endif /* IPSEC */ @@ -934,9 +946,10 @@ findpcb: */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 - if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) - goto dropunlock; - else + if (isipv6) { + if (inp->inp_ip_minttl > ip6->ip6_hlim) + goto dropunlock; + } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; @@ -945,7 +958,7 @@ findpcb: /* * A previous connection in TIMEWAIT state is supposed to catch stray * or duplicate segments arriving late. If this segment was a - * legitimate new connection attempt the old INPCB gets removed and + * legitimate new connection attempt, the old INPCB gets removed and * we can try again to find a listening socket. * * At this point, due to earlier optimism, we may hold only an inpcb @@ -961,20 +974,20 @@ findpcb: relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); @@ -983,8 +996,8 @@ relocked: */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&V_tcbinfo); - return; + INP_INFO_RUNLOCK(&V_tcbinfo); + return (IPPROTO_DONE); } /* * The TCPCB may no longer exist if the connection is winding @@ -1013,16 +1026,18 @@ relocked: * now be in TIMEWAIT. 
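The relocking sequence above is the standard two-lock recovery: the pcbinfo lock orders before the inpcb lock, so when only the inpcb is held and the tcbinfo lock turns out to be needed, the code must not block on it in place. Annotated restatement of that sequence:

        if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
                in_pcbref(inp);         /* pin inpcb across the unlock */
                INP_WUNLOCK(inp);       /* give up the out-of-order lock */
                INP_INFO_RLOCK(&V_tcbinfo); /* may sleep; order now holds */
                INP_WLOCK(inp);         /* retake the inpcb lock */
                if (in_pcbrele_wlocked(inp))
                        goto findpcb;   /* inpcb died meanwhile: restart */
        }

Note also that this update downgrades the tcbinfo lock from write to read on these paths; only genuine connection setup and teardown still serialize on the write lock.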
*/ #ifdef INVARIANTS - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if ((thflags & (TH_FIN | TH_RST)) != 0) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); #endif - if (tp->t_state != TCPS_ESTABLISHED) { + if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || + (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && + !(tp->t_flags & TF_FASTOPEN)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; @@ -1030,9 +1045,9 @@ relocked: } goto relocked; } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC @@ -1057,17 +1072,13 @@ relocked: /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection - * attempt or the completion of a previous one. Because listen - * sockets are never in TCPS_ESTABLISHED, the V_tcbinfo lock will be - * held in this case. + * attempt or the completion of a previous one. */ if (so->so_options & SO_ACCEPTCONN) { struct in_conninfo inc; KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " "tp not listening", __func__)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { @@ -1090,6 +1101,8 @@ relocked: * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1110,6 +1123,9 @@ relocked: rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } +#ifdef TCP_RFC7413 +new_tfo_socket: +#endif if (so == NULL) { /* * We completed the 3-way handshake @@ -1141,7 +1157,11 @@ relocked: */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); - INP_WLOCK(inp); /* new connection */ + /* + * New connection inpcb is already locked by + * syncache_expand(). + */ + INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); @@ -1170,10 +1190,10 @@ relocked: * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: @@ -1277,7 +1297,7 @@ relocked: if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; - ia6 = ip6_getdstifaddr(m); + ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { ifa_free(&ia6->ia_ifa); @@ -1366,14 +1386,24 @@ relocked: tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); tcp_dooptions(&to, optp, optlen, TO_SYN); - syncache_add(&inc, &to, th, inp, &so, m); +#ifdef TCP_RFC7413 + if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + goto new_tfo_socket; +#else + syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); +#endif /* * Entry added to syncache and mbuf consumed. - * Everything already unlocked by syncache_add(). 
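The TCP_RFC7413 paths above let syncache_add() complete a connection directly from a SYN carrying a valid Fast Open cookie (the new_tfo_socket label). On the wire the cookie travels in the TCP option whose constants were added to tcp.h earlier in this patch; a hedged builder using just those constants:

#include <stdint.h>
#include <string.h>

#define TCPOPT_FAST_OPEN        34      /* kind, from the tcp.h hunk */
#define TCPOLEN_FAST_OPEN_MIN   6       /* kind + len + 4-byte cookie */
#define TCPOLEN_FAST_OPEN_MAX   18      /* kind + len + 16-byte cookie */

/*
 * Append a TFO option; cookie_len 0 emits the empty option a client
 * sends to request a cookie (TCPOLEN_FAST_OPEN_EMPTY).  Returns the
 * option length, or -1 when the cookie size is out of range.
 */
static int
tfo_put_option(uint8_t *opt, const uint8_t *cookie, size_t cookie_len)
{
        size_t optlen = 2 + cookie_len;

        if (cookie_len != 0 && (optlen < TCPOLEN_FAST_OPEN_MIN ||
            optlen > TCPOLEN_FAST_OPEN_MAX))
                return (-1);
        opt[0] = TCPOPT_FAST_OPEN;
        opt[1] = (uint8_t)optlen;
        memcpy(opt + 2, cookie, cookie_len);
        return ((int)optlen);
}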
+ * Only the listen socket is unlocked by syncache_add(). */ + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); + ti_locked = TI_UNLOCKED; + } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN @@ -1404,18 +1434,22 @@ relocked: } #endif + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); - return; + return (IPPROTO_DONE); dropwithreset: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1435,8 +1469,11 @@ dropwithreset: goto drop; dropunlock: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (m != NULL) + TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); + + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1456,18 +1493,23 @@ drop: free(s, M_TCPLOG); if (m != NULL) m_freem(m); + return (IPPROTO_DONE); } -static void +void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { - int thflags, acked, ourfinisacked, needoutput = 0; + int thflags, acked, ourfinisacked, needoutput = 0, sack_changed; int rstreason, todrop, win; u_long tiwin; + char *s; + struct in_conninfo *inc; + struct mbuf *mfree; struct tcpopt to; - + int tfo_syn; + #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1478,30 +1520,25 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, short ostate = 0; #endif thflags = th->th_flags; + inc = &tp->t_inpcb->inp_inc; tp->sackhint.last_sack_ack = 0; + sack_changed = 0; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we - * allow either a read lock or a write lock, as we may have acquired - * a write lock due to a race. - * - * Require a global write lock for SYN/FIN/RST segments or - * non-established connections; otherwise accept either a read or - * write lock, as we may have conservatively acquired a write lock in - * certain cases in tcp_input() (is this still true?). Currently we - * will never enter with no lock, so we try to drop it quickly in the - * common pure ack/pure data cases. + * allow the tcbinfo to be in either alocked or unlocked, as the + * caller may have unnecessarily acquired a write lock due to a race. 
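Both calls above now dispatch through tp->t_fb->tfb_tcp_do_segment instead of invoking tcp_do_segment() directly: this is the pluggable TCP stack framework, and the TCP_FUNCTION_BLK socket option plus struct tcp_function_set added to tcp.h earlier are its userland face. A small usage sketch selecting a stack by name on one socket (available stack names are system dependent):

#include <sys/socket.h>
#include <netinet/tcp.h>
#include <string.h>

static int
select_tcp_stack(int s, const char *name)
{
        struct tcp_function_set fs;

        memset(&fs, 0, sizeof(fs));
        strlcpy(fs.function_set_name, name, sizeof(fs.function_set_name));
        /* pcbcnt is ignored when setting; it reports usage on queries. */
        return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
            &fs, sizeof(fs)));
}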
*/ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " + KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); @@ -1515,6 +1552,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); +#ifdef TCPPCAP + /* Save segment, if requested. */ + tcp_pcap_add(th, m, &(tp->t_inpkts)); +#endif + /* * Segment received on connection. * Reset idle time and keep-alive timer. @@ -1526,7 +1568,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* - * Unscale the window into a 32-bit value. + * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; @@ -1549,6 +1591,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC(tcps_ecn_ect1); break; } + + /* Process a packet differently from RFC3168. */ + cc_ecnpkt_handler(tp, th, iptos); + /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); @@ -1573,6 +1619,24 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } + /* + * If timestamps were negotiated during SYN/ACK they should + * appear on every segment during this session and vice versa. + */ + if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp missing, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } + if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp not expected, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + } + } /* * Process options only when we get SYN/ACK back. The SYN case @@ -1652,8 +1716,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1720,7 +1784,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); - ND6_HINT(tp); /* Some progress has been made. 
*/ /* * If all outstanding data are acked, stop @@ -1737,14 +1800,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, + mtod(m, const char *)); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); sowwakeup(so); - if (so->so_snd.sb_cc) - (void) tcp_output(tp); + if (sbavail(&so->so_snd)) + (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && @@ -1756,8 +1821,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -1777,12 +1842,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->rcv_up = tp->rcv_nxt; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); - ND6_HINT(tp); /* Some progress has been made */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); + /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network @@ -1802,11 +1868,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * reassembly queue. * * The criteria to step up the receive buffer one notch are: - * 1. the number of bytes received during the time it takes + * 1. Application has not set receive buffer size with + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. + * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); - * 2. received bytes per RTT is within seven eighth of the + * 3. received bytes per RTT is within seven eighth of the * current socket buffer size; - * 3. receive buffer size has not hit maximal automatic size; + * 4. receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. @@ -1817,6 +1885,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && + (to.to_flags & TOF_TS) && to.to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && @@ -1851,7 +1920,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ - sbappendstream_locked(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. 
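The auto-sizing block above grows the receive buffer at most once per RTT, using the timestamp echo as the RTT marker, and this update adds two gates: timestamps must actually be in use (TOF_TS with a nonzero echo) and the application must not have fixed the size with SO_RCVBUF, which clears SB_AUTOSIZE. The growth step itself, isolated as a sketch:

static unsigned long
autorcvbuf_step(unsigned long hiwat, unsigned long bytes_this_rtt,
    unsigned long inc, unsigned long maxbuf)
{
        /* Step up only if >7/8 of the buffer filled within one RTT. */
        if (bytes_this_rtt > (hiwat / 8) * 7 && hiwat < maxbuf) {
                unsigned long next = hiwat + inc;

                return (next < maxbuf ? next : maxbuf);
        }
        return (hiwat);
}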
*/ sorwakeup_locked(so); @@ -1859,7 +1928,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } @@ -1893,6 +1962,28 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + /* + * When a TFO connection is in SYN_RECEIVED, the + * only valid packets are the initial SYN, a + * retransmit/copy of the initial SYN (possibly with + * a subset of the original data), a valid ACK, a + * FIN, or a RST. + */ + if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } else if (thflags & TH_SYN) { + /* non-initial SYN is ignored */ + if ((tcp_timer_active(tp, TT_DELACK) || + tcp_timer_active(tp, TT_REXMT))) + goto drop; + } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { + goto drop; + } + } +#endif break; /* @@ -1916,8 +2007,11 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } - if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) + if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { + TCP_PROBE5(connect__refused, NULL, tp, + mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); + } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) @@ -1962,11 +2056,13 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { - tp->t_state = TCPS_ESTABLISHED; + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(connect__established, NULL, tp, + mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); @@ -1974,22 +2070,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else { /* * Received initial SYN in SYN-SENT[*] state => - * simultaneous open. If segment contains CC option - * and there is a cached CC, apply TAO test. + * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* - * If there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); - tp->t_state = TCPS_SYN_RECEIVED; + tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " + KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2045,98 +2139,84 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. - * - * - * If the RST bit is set, check the sequence number to see - * if this is a valid reset segment. - * RFC 793 page 37: - * In all states except SYN-SENT, all reset (RST) segments - * are validated by checking their SEQ-fields. A reset is - * valid if its sequence number is in the window. - * Note: this does not take into account delayed ACKs, so - * we should test against last_ack_sent instead of rcv_nxt. 
- * The sequence number in the reset segment is normally an - * echo of our outgoing acknowlegement numbers, but some hosts - * send a reset with the sequence number at the rightmost edge - * of our receive window, and we have to handle this case. - * Note 2: Paul Watson's paper "Slipping in the Window" has shown - * that brute force RST attacks are possible. To combat this, - * we use a much stricter check while in the ESTABLISHED state, - * only accepting RSTs where the sequence number is equal to - * last_ack_sent. In all other states (the states in which a - * RST is more likely), the more permissive check is used. - * If we have multiple segments in flight, the initial reset - * segment sequence numbers will be to the left of last_ack_sent, - * but they will eventually catch up. - * In any case, it never made sense to trim reset segments to - * fit the receive window since RFC 1122 says: - * 4.2.2.12 RST Segment: RFC-793 Section 3.4 - * - * A TCP SHOULD allow a received RST segment to include data. - * - * DISCUSSION - * It has been suggested that a RST segment could contain - * ASCII text that encoded and explained the cause of the - * RST. No standard has yet been established for such - * data. - * - * If the reset segment passes the sequence number test examine - * the state: - * SYN_RECEIVED STATE: - * If passive open, return to LISTEN state. - * If active open, inform user that connection was refused. - * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: - * Inform user that connection was reset, and close tcb. - * CLOSING, LAST_ACK STATES: - * Close the tcb. - * TIME_WAIT STATE: - * Drop the segment - see Stevens, vol. 2, p. 964 and - * RFC 1337. */ if (thflags & TH_RST) { - if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && - SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { - switch (tp->t_state) { - - case TCPS_SYN_RECEIVED: - so->so_error = ECONNREFUSED; - goto close; - - case TCPS_ESTABLISHED: - if (V_tcp_insecure_rst == 0 && - !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && - SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && - !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && - SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { - TCPSTAT_INC(tcps_badrst); - goto drop; - } - /* FALLTHROUGH */ - case TCPS_FIN_WAIT_1: - case TCPS_FIN_WAIT_2: - case TCPS_CLOSE_WAIT: - so->so_error = ECONNRESET; - close: - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_RST 1 ti_locked %d", - ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - tp->t_state = TCPS_CLOSED; + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. + * - If RST is in window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should + * test against last_ack_sent instead of rcv_nxt. + * Note 2: we handle special case of closed window, not + * covered by the RFC. + */ + if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, + ("%s: TH_RST ti_locked %d, th %p tp %p", + __func__, ti_locked, th, tp)); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); - tp = tcp_close(tp); - break; + /* Drop the connection. 
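The hunk here rewrites RST handling per RFC 5961 section 3.2: instead of tearing the connection down for any in-window RST, only an exact match on the last ACK sent does so (or any in-window sequence when net.inet.tcp.insecure_rst is set); other in-window RSTs earn a challenge ACK, and out-of-window ones are dropped silently. The decision, restated with local sequence-space macros:

#include <stdint.h>

#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

enum rst_action { RST_DROP_SEG, RST_CHALLENGE_ACK, RST_CLOSE };

static enum rst_action
rst_decide(uint32_t seq, uint32_t last_ack_sent, uint32_t rcv_wnd,
    int insecure_rst)
{
        /* Closed-window special case: only an exact match is in window. */
        int in_window = (rcv_wnd == 0) ? (seq == last_ack_sent) :
            (SEQ_GEQ(seq, last_ack_sent) &&
            SEQ_LT(seq, last_ack_sent + rcv_wnd));

        if (!in_window)
                return (RST_DROP_SEG);
        if (insecure_rst || seq == last_ack_sent)
                return (RST_CLOSE);
        return (RST_CHALLENGE_ACK);     /* blind-RST mitigation */
}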
*/ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + } + } + goto drop; + } - case TCPS_CLOSING: - case TCPS_LAST_ACK: - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_RST 2 ti_locked %d", - ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + /* + * RFC5961 Section 4.2 + * Send challenge ACK for any SYN in synchronized state. + */ + if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && + tp->t_state != TCPS_SYN_RECEIVED) { + KASSERT(ti_locked == TI_RLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - tp = tcp_close(tp); - break; - } + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; } goto drop; } @@ -2236,15 +2316,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - char *s; - - KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " + KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { - log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " - "was closed, sending RST and removing tcpcb\n", + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " + "after socket was closed, " + "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } @@ -2308,20 +2387,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->ts_recent = to.to_tsval; } - /* - * If a SYN is in the window, then this is an - * error and we send an RST and drop the connection. 
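The RFC 5961 handling added above replaces the old permissive in-window RST test. As an editorial aside, the acceptance logic condenses to the following standalone sketch; the SEQ_* macros are re-derived here from serial-number arithmetic, and all names are illustrative stand-ins rather than the kernel's:

#include <stdbool.h>
#include <stdint.h>

/* Serial-number compares, same shape as FreeBSD's SEQ_GEQ()/SEQ_LT(). */
#define SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

enum rst_action { RST_IGNORE, RST_CHALLENGE, RST_DROP_CONN };

static enum rst_action
classify_rst(uint32_t seq, uint32_t last_ack_sent, uint32_t rcv_wnd,
    bool insecure_rst)
{
	/* In window, including the special case of a closed window. */
	bool in_window = (SEQ_GEQ(seq, last_ack_sent) &&
	    SEQ_LT(seq, last_ack_sent + rcv_wnd)) ||
	    (rcv_wnd == 0 && last_ack_sent == seq);

	if (!in_window)
		return (RST_IGNORE);	/* silently drop the segment */
	if (insecure_rst || seq == last_ack_sent)
		return (RST_DROP_CONN);	/* exact match: tear down */
	return (RST_CHALLENGE);		/* merely in window: challenge ACK */
}

Only an exact SEG.SEQ == last_ack_sent match (or the insecure_rst override) tears the connection down; anything else that lands in the window earns a challenge ACK, which defeats blind-reset attacks without breaking legitimate peers whose RSTs trail the advancing window.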
- */ - if (thflags & TH_SYN) { - KASSERT(ti_locked == TI_WLOCKED, - ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - tp = tcp_drop(tp, ECONNRESET); - rstreason = BANDLIM_UNLIMITED; - goto drop; - } - /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for @@ -2329,9 +2394,16 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || - (tp->t_flags & TF_NEEDSYN)) + (tp->t_flags & TF_NEEDSYN)) { +#ifdef TCP_RFC7413 + if (tp->t_state == TCPS_SYN_RECEIVED && + tp->t_flags & TF_FASTOPEN) { + tp->snd_wnd = tiwin; + cc_conn_init(tp); + } +#endif goto step6; - else if (tp->t_flags & TF_ACKNOW) + } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; @@ -2364,11 +2436,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { - tp->t_state = TCPS_ESTABLISHED; - cc_conn_init(tp); + tcp_state_change(tp, TCPS_ESTABLISHED); + TCP_PROBE5(accept__established, NULL, tp, + mtod(m, const char *), tp, th); +#ifdef TCP_RFC7413 + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to + * regular ACK processing below. + */ + tp->snd_una++; + } + /* + * TFO connections call cc_conn_init() during SYN + * processing. Calling it again here for such + * connections is not harmless as it would undo the + * snd_cwnd reduction that occurs when a TFO SYN|ACK + * is retransmitted. + */ + if (!(tp->t_flags & TF_FASTOPEN)) +#endif + cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* @@ -2402,21 +2496,45 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) - tcp_sack_doack(tp, &to, th->th_ack); + sack_changed = tcp_sack_doack(tp, &to, th->th_ack); + else + /* + * Reset the value so that previous (valid) value + * from the last ack with SACK doesn't get used. + */ + tp->sackhint.sacked_bytes = 0; /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { - if (tlen == 0 && tiwin == tp->snd_wnd) { + u_int maxseg; + + maxseg = tcp_maxseg(tp); + if (tlen == 0 && + (tiwin == tp->snd_wnd || + (tp->t_flags & TF_SACK_PERMIT))) { + /* + * If this is the first time we've seen a + * FIN from the remote, this is not a + * duplicate and it needs to be processed + * normally. This happens during a + * simultaneous close. + */ + if ((thflags & TH_FIN) && + (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { + tp->t_dupacks = 0; + break; + } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't - * change), the ack is the biggest we've + * change and FIN isn't set), + * the ack is the biggest we've * seen and we've seen exactly our rexmt - * threshhold of them, assume a packet + * threshold of them, assume a packet * has been dropped and retransmit it. 
* Kludge snd_nxt & the congestion * window so we send only this one @@ -2437,8 +2555,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * When using TCP ECN, notify the peer that * we reduced the cwnd. */ - if (!tcp_timer_active(tp, TT_REXMT) || - th->th_ack != tp->snd_una) + /* + * Following 2 kinds of acks should not affect + * dupack counting: + * 1) Old acks + * 2) Acks with SACK but without any new SACK + * information in them. These could result from + * any anomaly in the network like a switch + * duplicating packets or a possible DoS attack. + */ + if (th->th_ack != tp->snd_una || + ((tp->t_flags & TF_SACK_PERMIT) && + !sack_changed)) + break; + else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { @@ -2453,26 +2583,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, * we have less than 1/2 the original window's * worth of data in flight. */ - awnd = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; + if (V_tcp_do_rfc6675_pipe) + awnd = tcp_compute_pipe(tp); + else + awnd = (tp->snd_nxt - tp->snd_fack) + + tp->sackhint.sack_bytes_rexmit; + if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else - tp->snd_cwnd += tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } - (void) tcp_output(tp); + tp->snd_cwnd += maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -2505,33 +2629,33 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; - tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + tp->snd_cwnd = maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; - tp->snd_cwnd = tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } - (void) tcp_output(tp); + tp->snd_cwnd = maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + - tp->t_maxseg * + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { + /* + * Process first and second duplicate + * ACKs. Each indicates a segment + * leaving the network, creating room + * for more. Make sure we can send a + * packet on reception of each duplicate + * ACK by increasing snd_cwnd by one + * segment. Restore the original + * snd_cwnd after packet transmission. 
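The two exclusions described in the comment above (old ACKs, and SACK-bearing ACKs that convey no new SACK information) amount to a small predicate. A hedged sketch, with stand-in parameters for the tcpcb and SACK state the hunk consults:

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the dup-ack filter added above: old ACKs, and ACKs that
 * carry SACK but no new SACK information, must not advance t_dupacks.
 */
static bool
counts_as_dupack(uint32_t th_ack, uint32_t snd_una, bool sack_permitted,
    bool sack_changed)
{
	if (th_ack != snd_una)
		return (false);	/* old ACK: the window already moved on */
	if (sack_permitted && !sack_changed)
		return (false);	/* no new SACK info: duplication or DoS */
	return (true);
}

Segments filtered out this way break out of duplicate-ACK processing entirely, so a switch duplicating packets, or a deliberate flood of stale ACKs, can no longer push the sender into spurious fast retransmit.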
+ */ cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; @@ -2547,33 +2671,23 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * - tp->t_maxseg; - if ((thflags & TH_FIN) && - (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { - /* - * If its a fin we need to process - * it to avoid a race where both - * sides enter FIN-WAIT and send FIN|ACK - * at the same time. - */ - break; - } + maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs. */ SOCKBUF_LOCK(&so->so_snd); - avail = so->so_snd.sb_cc - + avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; - if (sent > tp->t_maxseg) { + if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || - (sent == tp->t_maxseg + 1 && + (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); @@ -2583,9 +2697,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, tp->snd_cwnd = oldcwnd; goto drop; } - } else - tp->t_dupacks = 0; + } break; + } else { + /* + * This ack is advancing the left edge, reset the + * counter. + */ + tp->t_dupacks = 0; + /* + * If this ack also has new SACK info, increment the + * counter as per rfc6675. + */ + if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) + tp->t_dupacks++; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), @@ -2604,7 +2729,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, } else cc_post_recovery(tp, th); } - tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ @@ -2631,6 +2755,9 @@ process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); + KASSERT(acked >= 0, ("%s: acked unexpectedly negative " + "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, + tp->snd_una, th->th_ack, tp, m)); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); @@ -2699,17 +2826,25 @@ process_ACK: cc_ack_received(tp, th, CC_ACK); SOCKBUF_LOCK(&so->so_snd); - if (acked > so->so_snd.sb_cc) { - tp->snd_wnd -= so->so_snd.sb_cc; - sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); + if (acked > sbavail(&so->so_snd)) { + if (tp->snd_wnd >= sbavail(&so->so_snd)) + tp->snd_wnd -= sbavail(&so->so_snd); + else + tp->snd_wnd = 0; + mfree = sbcut_locked(&so->so_snd, + (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { - sbdrop_locked(&so->so_snd, acked); - tp->snd_wnd -= acked; + mfree = sbcut_locked(&so->so_snd, acked); + if (tp->snd_wnd >= (u_long) acked) + tp->snd_wnd -= acked; + else + tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); + m_freem(mfree); /* Detect una wraparound.
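Note the shape of the snd_wnd update in the hunk above: snd_wnd is unsigned, so an unguarded subtraction of more bytes than the window holds would wrap around to an enormous value. A compilable illustration of the guard, with u_long stood in by unsigned long:

#include <stdio.h>

/* Clamped decrement, as in the sbcut_locked() hunk above. */
static unsigned long
wnd_sub(unsigned long snd_wnd, unsigned long acked)
{
	return (snd_wnd >= acked ? snd_wnd - acked : 0);
}

int
main(void)
{
	/* Naive subtraction would wrap: 100 - 150 == ULONG_MAX - 49. */
	printf("%lu\n", wnd_sub(100, 150));	/* prints 0 */
	return (0);
}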
*/ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && @@ -2755,7 +2890,7 @@ process_ACK: tcp_finwait2_timeout : TP_MAXIDLE(tp))); } - tp->t_state = TCPS_FIN_WAIT_2; + tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; @@ -2767,9 +2902,9 @@ process_ACK: */ case TCPS_CLOSING: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } @@ -2783,7 +2918,7 @@ process_ACK: */ case TCPS_LAST_ACK: if (ourfinisacked) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } @@ -2826,7 +2961,7 @@ step6: * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); - if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ @@ -2848,7 +2983,7 @@ step6: */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; - so->so_oobmark = so->so_rcv.sb_cc + + so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; @@ -2887,7 +3022,9 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((tlen || (thflags & TH_FIN)) && + tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_flags & TF_FASTOPEN)); + if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ @@ -2905,8 +3042,9 @@ dodata: /* XXX */ */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && - TCPS_HAVEESTABLISHED(tp->t_state)) { - if (DELAY_ACK(tp, tlen)) + (TCPS_HAVEESTABLISHED(tp->t_state) || + tfo_syn)) { + if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; @@ -2914,12 +3052,11 @@ dodata: /* XXX */ thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); - ND6_HINT(tp); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else - sbappendstream_locked(&so->so_rcv, m); + sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { @@ -2981,7 +3118,7 @@ dodata: /* XXX */ tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: - tp->t_state = TCPS_CLOSE_WAIT; + tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* @@ -2989,7 +3126,7 @@ dodata: /* XXX */ * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: - tp->t_state = TCPS_CLOSING; + tcp_state_change(tp, TCPS_CLOSING); break; /* @@ -2998,18 +3135,18 @@ dodata: /* XXX */ * standard timers. 
*/ case TCPS_FIN_WAIT_2: - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return; } } - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG @@ -3017,12 +3154,13 @@ dodata: /* XXX */ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", @@ -3064,19 +3202,20 @@ dropafterack: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; if (tp != NULL) { @@ -3087,8 +3226,8 @@ dropwithreset: return; drop: - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -3104,6 +3243,7 @@ drop: tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif + TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); @@ -3114,7 +3254,7 @@ drop: * The mbuf must still include the original packet header. * tp may be NULL. */ -static void +void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { @@ -3177,7 +3317,7 @@ drop: /* * Parse TCP options and place in tcpopt. */ -static void +void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; @@ -3259,6 +3399,21 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; +#ifdef TCP_RFC7413 + case TCPOPT_FAST_OPEN: + if ((optlen != TCPOLEN_FAST_OPEN_EMPTY) && + (optlen < TCPOLEN_FAST_OPEN_MIN) && + (optlen > TCPOLEN_FAST_OPEN_MAX)) + continue; + if (!(flags & TO_SYN)) + continue; + if (!V_tcp_fastopen_enabled) + continue; + to->to_flags |= TOF_FASTOPEN; + to->to_tfo_len = optlen - 2; + to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; + break; +#endif default: continue; } @@ -3271,7 +3426,7 @@ tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) * It is still reflected in the segment length for * sequencing purposes. */ -static void +void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { @@ -3304,7 +3459,7 @@ tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, * Collect new round-trip time estimate * and update averages and current timeout. 
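The TCPOPT_FAST_OPEN parsing above accepts either an empty option or one carrying a cookie of bounded length. Below is a standalone sketch of the length rule per RFC 7413 (total option length 2, or 6 through 18 bytes). As an aside, the three chained && tests in the hunk can never all hold for a single optlen, since a value cannot be both below the minimum and above the maximum; a disjunction of the range checks, as in the sketch, appears to be the intent:

#include <stdbool.h>

#define TCPOLEN_FAST_OPEN_EMPTY	2	/* kind + len, no cookie */
#define TCPOLEN_FAST_OPEN_MIN	6	/* kind + len + 4-byte cookie */
#define TCPOLEN_FAST_OPEN_MAX	18	/* kind + len + 16-byte cookie */

static bool
tfo_optlen_valid(int optlen)
{
	if (optlen == TCPOLEN_FAST_OPEN_EMPTY)
		return (true);	/* cookie request on an outgoing SYN */
	return (optlen >= TCPOLEN_FAST_OPEN_MIN &&
	    optlen <= TCPOLEN_FAST_OPEN_MAX);
}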
*/ -static void +void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; @@ -3394,11 +3549,9 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt) * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * - * Also take into account the space needed for options that we - * send regularly. Make maxseg shorter by that amount to assure - * that we can send maxseg amount of data even when the options - * are present. Store the upper limit of the length of options plus - * data in maxopd. + * NOTE that resulting t_maxseg doesn't include space for TCP options or + * IP options, e.g. IPSEC data, since length of this data may vary, and + * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS @@ -3412,7 +3565,6 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, u_long maxmtu = 0; struct inpcb *inp = tp->t_inpcb; struct hc_metrics_lite metrics; - int origoffer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? @@ -3428,13 +3580,12 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } - origoffer = offer; /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; + tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) @@ -3443,7 +3594,7 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); - tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; + tp->t_maxseg = V_tcp_mssdflt; } #endif @@ -3467,9 +3618,9 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as - * already assigned to t_maxopd above. + * already assigned to t_maxseg above. */ - offer = tp->t_maxopd; + offer = tp->t_maxseg; break; case -1: @@ -3494,8 +3645,8 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); /* - * If there's a discovered mtu int tcp hostcache, use it - * else, use the link mtu. + * If there's a discovered mtu in tcp hostcache, use it. + * Else, use the link mtu. */ if (metrics.rmx_mtu) mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; @@ -3541,31 +3692,15 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, mss = min(mss, offer); /* - * Sanity check: make sure that maxopd will be large + * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. + * + * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); - /* - * maxopd stores the maximum length of data AND options - * in a segment; maxseg is the amount of data in a normal - * segment. We need to store this value (maxopd) apart - * from maxseg, because now every segment carries options - * and thus we normally have somewhat less data in segments. - */ - tp->t_maxopd = mss; - - /* - * origoffer==-1 indicates that no segments were received yet. - * In this case we just guess. 
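With t_maxopd retired, t_maxseg now stores the option-free maximum and tcp_output() subtracts the per-segment option length itself. The clamping sequence above reduces to roughly this sketch (illustrative inputs; min_protoh is 40 for IPv4 and 60 for IPv6):

#include <stdio.h>

static unsigned long
mss_clamp(unsigned long mtu, unsigned long offer, unsigned long min_protoh)
{
	unsigned long mss = mtu - min_protoh;	/* e.g. 1500 - 40 = 1460 */

	if (offer != 0 && offer < mss)
		mss = offer;	/* never exceed what the peer offered */
	if (mss < 64)
		mss = 64;	/* leave room for data under 40B of options */
	return (mss);
}

int
main(void)
{
	printf("%lu\n", mss_clamp(1500, 1380, 40));	/* prints 1380 */
	return (0);
}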
- */ - if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && - (origoffer == -1 || - (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) - mss -= TCPOLEN_TSTAMP_APPA; - tp->t_maxseg = mss; } @@ -3684,11 +3819,12 @@ tcp_mssopt(struct in_conninfo *inc) * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ -static void +void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; - u_long ocwnd = tp->snd_cwnd; + u_long ocwnd = tp->snd_cwnd; + u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); @@ -3699,9 +3835,9 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ - tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; @@ -3713,5 +3849,13 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; - tp->snd_cwnd += tp->t_maxseg; + tp->snd_cwnd += maxseg; +} + +int +tcp_compute_pipe(struct tcpcb *tp) +{ + return (tp->snd_max - tp->snd_una + + tp->sackhint.sack_bytes_rexmit - + tp->sackhint.sacked_bytes); } diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c index 52d92aa0..3550ab84 100644 --- a/freebsd/sys/netinet/tcp_lro.c +++ b/freebsd/sys/netinet/tcp_lro.c @@ -4,6 +4,7 @@ * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation + * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb @@ -39,9 +40,11 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include +#include +#include #include +#include #include #include @@ -55,59 +58,139 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include -#ifndef LRO_ENTRIES -#define LRO_ENTRIES 8 /* # of LRO entries per RX queue. 
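The new tcp_compute_pipe() above is the RFC 6675 estimate of data still in the network: everything between snd_una and snd_max, plus retransmitted hole bytes, minus bytes the receiver has already SACKed. A worked example with made-up numbers:

#include <stdio.h>

int
main(void)
{
	unsigned snd_una = 1000, snd_max = 11000;	/* 10000 bytes out */
	unsigned sack_bytes_rexmit = 1460;		/* one hole resent */
	unsigned sacked_bytes = 4380;			/* 3 segments SACKed */

	/* Same arithmetic as tcp_compute_pipe() above. */
	printf("pipe = %u\n",
	    snd_max - snd_una + sack_bytes_rexmit - sacked_bytes);
	/* pipe = 7080: the SACKed segments no longer occupy the path. */
	return (0);
}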
*/ -#endif +static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_UPDATE_CSUM 1 #ifndef TCP_LRO_UPDATE_CSUM #define TCP_LRO_INVALID_CSUM 0x0000 #endif +static void tcp_lro_rx_done(struct lro_ctrl *lc); +static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, + uint32_t csum, int use_hash); + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "TCP LRO"); + +static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; +SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, + CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, + "default number of LRO entries"); + +static __inline void +tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, + struct lro_entry *le) +{ + + LIST_INSERT_HEAD(&lc->lro_active, le, next); + LIST_INSERT_HEAD(bucket, le, hash_next); +} + +static __inline void +tcp_lro_active_remove(struct lro_entry *le) +{ + + LIST_REMOVE(le, next); /* active list */ + LIST_REMOVE(le, hash_next); /* hash bucket */ +} + int tcp_lro_init(struct lro_ctrl *lc) +{ + return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); +} + +int +tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, + unsigned lro_entries, unsigned lro_mbufs) { struct lro_entry *le; - int error, i; + size_t size; + unsigned i, elements; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_cnt = 0; - SLIST_INIT(&lc->lro_free); - SLIST_INIT(&lc->lro_active); - - error = 0; - for (i = 0; i < LRO_ENTRIES; i++) { - le = (struct lro_entry *)malloc(sizeof(*le), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (le == NULL) { - if (i == 0) - error = ENOMEM; - break; - } - lc->lro_cnt = i + 1; - SLIST_INSERT_HEAD(&lc->lro_free, le, next); - } - - return (error); + lc->lro_mbuf_count = 0; + lc->lro_mbuf_max = lro_mbufs; + lc->lro_cnt = lro_entries; + lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; + lc->lro_length_lim = TCP_LRO_LENGTH_MAX; + lc->ifp = ifp; + LIST_INIT(&lc->lro_free); + LIST_INIT(&lc->lro_active); + + /* create hash table to accelerate entry lookup */ + if (lro_entries > lro_mbufs) + elements = lro_entries; + else + elements = lro_mbufs; + lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, + HASH_NOWAIT); + if (lc->lro_hash == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + + /* compute size to allocate */ + size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + + (lro_entries * sizeof(*le)); + lc->lro_mbuf_data = (struct lro_mbuf_sort *) + malloc(size, M_LRO, M_NOWAIT | M_ZERO); + + /* check for out of memory */ + if (lc->lro_mbuf_data == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + /* compute offset for LRO entries */ + le = (struct lro_entry *) + (lc->lro_mbuf_data + lro_mbufs); + + /* setup linked list */ + for (i = 0; i != lro_entries; i++) + LIST_INSERT_HEAD(&lc->lro_free, le + i, next); + + return (0); } void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; + unsigned x; - while (!SLIST_EMPTY(&lc->lro_free)) { - le = SLIST_FIRST(&lc->lro_free); - SLIST_REMOVE_HEAD(&lc->lro_free, next); - free(le, M_DEVBUF); + /* reset LRO free list */ + LIST_INIT(&lc->lro_free); + + /* free active mbufs, if any */ + while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { + tcp_lro_active_remove(le); + m_freem(le->m_head); } + + /* free hash table */ + if (lc->lro_hash != NULL) { + free(lc->lro_hash, M_LRO); + lc->lro_hash = NULL; + } + lc->lro_hashsz = 0; + + /* free mbuf array, if any */ + for (x = 0; x != lc->lro_mbuf_count; x++) + m_freem(lc->lro_mbuf_data[x].mb); + lc->lro_mbuf_count = 0; + + /* free allocated 
memory, if any */ + free(lc->lro_mbuf_data, M_LRO); + lc->lro_mbuf_data = NULL; } #ifdef TCP_LRO_UPDATE_CSUM @@ -195,6 +278,36 @@ tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, } #endif +static void +tcp_lro_rx_done(struct lro_ctrl *lc) +{ + struct lro_entry *le; + + while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { + tcp_lro_active_remove(le); + tcp_lro_flush(lc, le); + } +} + +void +tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) +{ + struct lro_entry *le, *le_tmp; + struct timeval tv; + + if (LIST_EMPTY(&lc->lro_active)) + return; + + getmicrotime(&tv); + timevalsub(&tv, timeout); + LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { + if (timevalcmp(&tv, &le->mtime, >=)) { + tcp_lro_active_remove(le); + tcp_lro_flush(lc, le); + } + } +} + void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { @@ -285,7 +398,143 @@ tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) lc->lro_queued += le->append_cnt + 1; lc->lro_flushed++; bzero(le, sizeof(*le)); - SLIST_INSERT_HEAD(&lc->lro_free, le, next); + LIST_INSERT_HEAD(&lc->lro_free, le, next); +} + +#ifdef HAVE_INLINE_FLSLL +#define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) +#else +static inline uint64_t +tcp_lro_msb_64(uint64_t x) +{ + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + x |= (x >> 32); + return (x & ~(x >> 1)); +} +#endif + +/* + * The tcp_lro_sort() routine is comparable to qsort(), except it has + * a worst case complexity limit of O(MIN(N,64)*N), where N is the + * number of elements to sort and 64 is the number of sequence bits + * available. The algorithm is bit-slicing the 64-bit sequence number, + * sorting one bit at a time from the most significant bit until the + * least significant one, skipping the constant bits. This is + * typically called a radix sort. 
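The tcp_lro_msb_64() fallback above isolates the highest set bit by smearing it into every lower position and then masking off the smear. A quick userland check of the same arithmetic:

#include <stdio.h>
#include <stdint.h>

static inline uint64_t
msb64(uint64_t x)
{
	/* Smear the top set bit downward... */
	x |= (x >> 1); x |= (x >> 2); x |= (x >> 4);
	x |= (x >> 8); x |= (x >> 16); x |= (x >> 32);
	/* ...then keep only the highest bit of the smear. */
	return (x & ~(x >> 1));
}

int
main(void)
{
	/* 0x5a = 01011010b; its most significant set bit is 0x40. */
	printf("%#llx\n", (unsigned long long)msb64(0x5aULL));
	return (0);
}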
*/ +static void +tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) +{ + struct lro_mbuf_sort temp; + uint64_t ones; + uint64_t zeros; + uint32_t x; + uint32_t y; + +repeat: + /* for small arrays insertion sort is faster */ + if (size <= 12) { + for (x = 1; x < size; x++) { + temp = parray[x]; + for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) + parray[y] = parray[y - 1]; + parray[y] = temp; + } + return; + } + + /* compute sequence bits which are constant */ + ones = 0; + zeros = 0; + for (x = 0; x != size; x++) { + ones |= parray[x].seq; + zeros |= ~parray[x].seq; + } + + /* compute bits which are not constant into "ones" */ + ones &= zeros; + if (ones == 0) + return; + + /* pick the most significant bit which is not constant */ + ones = tcp_lro_msb_64(ones); + + /* + * Move entries having cleared sequence bits to the beginning + * of the array: + */ + for (x = y = 0; y != size; y++) { + /* skip set bits */ + if (parray[y].seq & ones) + continue; + /* swap entries */ + temp = parray[x]; + parray[x] = parray[y]; + parray[y] = temp; + x++; + } + + KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); + + /* sort zeros */ + tcp_lro_sort(parray, x); + + /* sort ones */ + parray += x; + size -= x; + goto repeat; +} + +void +tcp_lro_flush_all(struct lro_ctrl *lc) +{ + uint64_t seq; + uint64_t nseq; + unsigned x; + + /* check if no mbufs to flush */ + if (lc->lro_mbuf_count == 0) + goto done; + + /* sort all mbufs according to stream */ + tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); + + /* input data into LRO engine, stream by stream */ + seq = 0; + for (x = 0; x != lc->lro_mbuf_count; x++) { + struct mbuf *mb; + + /* get mbuf */ + mb = lc->lro_mbuf_data[x].mb; + + /* get sequence number, masking away the packet index */ + nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); + + /* check for new stream */ + if (seq != nseq) { + seq = nseq; + + /* flush active streams */ + tcp_lro_rx_done(lc); + } + + /* add packet to LRO engine */ + if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { + /* input packet to network layer */ + (*lc->ifp->if_input)(lc->ifp, mb); + lc->lro_queued++; + lc->lro_flushed++; + } + } +done: + /* flush active streams */ + tcp_lro_rx_done(lc); + + lc->lro_mbuf_count = 0; } #ifdef INET6 @@ -348,8 +597,8 @@ tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, } #endif -int -tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) +static int +tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) { struct lro_entry *le; struct ether_header *eh; @@ -365,6 +614,8 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) tcp_seq seq; int error, ip_len, l; uint16_t eh_type, tcp_data_len; + struct lro_head *bucket; + int force_flush = 0; /* We expect a contiguous header [eh, ip, tcp]. */ @@ -431,10 +682,17 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) * Check TCP header constraints. */ /* Ensure no bits set besides ACK or PSH. */ - if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) - return (TCP_LRO_CANNOT); + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + if (th->th_flags & TH_SYN) + return (TCP_LRO_CANNOT); + /* + * Make sure that previously seen segments/ACKs are delivered + * before this segment, e.g. FIN. + */ + force_flush = 1; + } - /* XXX-BZ We lose a AKC|PUSH flag concatinating multiple segments. */ + /* XXX-BZ We lose an ACK|PUSH flag concatenating multiple segments. */ /* XXX-BZ Ideally we'd flush on PUSH?
*/ /* @@ -448,8 +706,13 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| - TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) - return (TCP_LRO_CANNOT); + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Make sure that previously seen segments/ACKs are delivered + * before this segment. + */ + force_flush = 1; + } /* If the driver did not pass in the checksum, set it now. */ if (csum == 0x0000) @@ -457,8 +720,41 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) seq = ntohl(th->th_seq); + if (!use_hash) { + bucket = &lc->lro_hash[0]; + } else if (M_HASHTYPE_ISHASH(m)) { + bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; + } else { + uint32_t hash; + + switch (eh_type) { +#ifdef INET + case ETHERTYPE_IP: + hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + hash = ip6->ip6_src.s6_addr32[0] + + ip6->ip6_dst.s6_addr32[0]; + hash += ip6->ip6_src.s6_addr32[1] + + ip6->ip6_dst.s6_addr32[1]; + hash += ip6->ip6_src.s6_addr32[2] + + ip6->ip6_dst.s6_addr32[2]; + hash += ip6->ip6_src.s6_addr32[3] + + ip6->ip6_dst.s6_addr32[3]; + break; +#endif + default: + hash = 0; + break; + } + hash += th->th_sport + th->th_dport; + bucket = &lc->lro_hash[hash % lc->lro_hashsz]; + } + /* Try to find a matching previous segment. */ - SLIST_FOREACH(le, &lc->lro_active, next) { + LIST_FOREACH(le, bucket, hash_next) { if (le->eh_type != eh_type) continue; if (le->source_port != th->th_sport || @@ -483,9 +779,16 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) #endif } + if (force_flush) { + /* Timestamps mismatch; this is a FIN, etc. */ + tcp_lro_active_remove(le); + tcp_lro_flush(lc, le); + return (TCP_LRO_CANNOT); + } + /* Flush now if appending will result in overflow. */ - if (le->p_len > (65535 - tcp_data_len)) { - SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { + tcp_lro_active_remove(le); tcp_lro_flush(lc, le); break; } @@ -494,7 +797,7 @@ if (__predict_false(seq != le->next_seq || (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { /* Out of order packet or duplicate ACK. */ - SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + tcp_lro_active_remove(le); tcp_lro_flush(lc, le); return (TCP_LRO_CANNOT); } @@ -522,6 +825,14 @@ if (tcp_data_len == 0) { m_freem(m); + /* + * Flush this LRO entry, if this ACK should not + * be further delayed. + */ + if (le->append_cnt >= lc->lro_ackcnt_lim) { + tcp_lro_active_remove(le); + tcp_lro_flush(lc, le); + } return (0); } @@ -533,7 +844,7 @@ * append new segment to existing mbuf chain. */ m_adj(m, m->m_pkthdr.len - tcp_data_len); - m->m_flags &= ~M_PKTHDR; + m_demote_pkthdr(m); le->m_tail->m_next = m; le->m_tail = m_last(m); @@ -542,22 +853,32 @@ * If a possible next full length packet would cause an * overflow, pro-actively flush now.
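Both early-flush tests in this function now compare the aggregated length against the tunable lro_length_lim rather than a hard-wired 65535. A hedged standalone sketch of the pair of checks, with illustrative parameter names:

#include <stdbool.h>
#include <stdint.h>

static bool
lro_must_flush(uint32_t p_len, uint32_t data_len, uint32_t length_lim,
    uint32_t mtu)
{
	if (p_len > length_lim - data_len)
		return (true);	/* appending this payload would overflow */
	if (p_len > length_lim - mtu)
		return (true);	/* even one more full MTU could not fit */
	return (false);
}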
*/ - if (le->p_len > (65535 - lc->ifp->if_mtu)) { - SLIST_REMOVE(&lc->lro_active, le, lro_entry, next); + if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { + tcp_lro_active_remove(le); tcp_lro_flush(lc, le); - } + } else + getmicrotime(&le->mtime); return (0); } - /* Try to find an empty slot. */ - if (SLIST_EMPTY(&lc->lro_free)) + if (force_flush) { + /* + * Nothing to flush, but this segment can not be further + * aggregated/delayed. + */ return (TCP_LRO_CANNOT); + } + + /* Try to find an empty slot. */ + if (LIST_EMPTY(&lc->lro_free)) + return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ - le = SLIST_FIRST(&lc->lro_free); - SLIST_REMOVE_HEAD(&lc->lro_free, next); - SLIST_INSERT_HEAD(&lc->lro_active, le, next); + le = LIST_FIRST(&lc->lro_free); + LIST_REMOVE(le, next); + tcp_lro_active_insert(lc, bucket, le); + getmicrotime(&le->mtime); /* Start filling in details. */ switch (eh_type) { @@ -614,4 +935,47 @@ tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) return (0); } +int +tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) +{ + + return tcp_lro_rx2(lc, m, csum, 1); +} + +void +tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) +{ + /* sanity checks */ + if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || + lc->lro_mbuf_max == 0)) { + /* packet drop */ + m_freem(mb); + return; + } + + /* check if packet is not LRO capable */ + if (__predict_false(mb->m_pkthdr.csum_flags == 0 || + (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { + lc->lro_flushed++; + lc->lro_queued++; + + /* input packet to network layer */ + (*lc->ifp->if_input) (lc->ifp, mb); + return; + } + + /* check if array is full */ + if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max)) + tcp_lro_flush_all(lc); + + /* create sequence number */ + lc->lro_mbuf_data[lc->lro_mbuf_count].seq = + (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | + (((uint64_t)mb->m_pkthdr.flowid) << 24) | + ((uint64_t)lc->lro_mbuf_count); + + /* enter mbuf */ + lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb; +} + /* end */ diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h index b3a50179..e019cd1e 100644 --- a/freebsd/sys/netinet/tcp_lro.h +++ b/freebsd/sys/netinet/tcp_lro.h @@ -1,6 +1,7 @@ /*- * Copyright (c) 2006, Myricom Inc. * Copyright (c) 2008, Intel Corporation. + * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,9 +31,16 @@ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ -struct lro_entry -{ - SLIST_ENTRY(lro_entry) next; +#include + +#ifndef TCP_LRO_ENTRIES +/* Define default number of LRO entries per RX queue */ +#define TCP_LRO_ENTRIES 8 +#endif + +struct lro_entry { + LIST_ENTRY(lro_entry) next; + LIST_ENTRY(lro_entry) hash_next; struct mbuf *m_head; struct mbuf *m_tail; union { @@ -59,8 +67,9 @@ struct lro_entry uint32_t tsecr; uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ + struct timeval mtime; }; -SLIST_HEAD(lro_head, lro_entry); +LIST_HEAD(lro_head, lro_entry); #define le_ip4 leip.ip4 #define le_ip6 leip.ip6 @@ -69,23 +78,43 @@ SLIST_HEAD(lro_head, lro_entry); #define source_ip6 lesource.s_ip6 #define dest_ip6 ledest.d_ip6 +struct lro_mbuf_sort { + uint64_t seq; + struct mbuf *mb; +}; + /* NB: This is part of driver structs. 
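Taken together, tcp_lro_init_args(), tcp_lro_queue_mbuf() and tcp_lro_flush_all() give drivers a deferred, batch-sorted receive path. A hypothetical kernel-side sketch of how they compose; rx_next_mbuf() and the sizing constants are invented for illustration, only the tcp_lro_* calls are the API added in this change:

#include <sys/param.h>
#include <sys/mbuf.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/tcp_lro.h>

struct mbuf *rx_next_mbuf(void);	/* hypothetical driver helper */

static struct lro_ctrl rx_lro;

void
rx_attach(struct ifnet *ifp)
{
	/* 64 LRO entries, up to 1024 queued mbufs per batch (made up). */
	tcp_lro_init_args(&rx_lro, ifp, 64, 1024);
}

void
rx_poll(void)
{
	struct mbuf *m;

	while ((m = rx_next_mbuf()) != NULL)
		tcp_lro_queue_mbuf(&rx_lro, m);	/* defer and collect */
	tcp_lro_flush_all(&rx_lro);	/* sort by flow, then aggregate */
}

Deferring until flush time is what lets the radix sort group same-flow packets, so interleaved streams aggregate as well as back-to-back ones.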
*/ struct lro_ctrl { struct ifnet *ifp; - int lro_queued; - int lro_flushed; - int lro_bad_csum; - int lro_cnt; + struct lro_mbuf_sort *lro_mbuf_data; + uint64_t lro_queued; + uint64_t lro_flushed; + uint64_t lro_bad_csum; + unsigned lro_cnt; + unsigned lro_mbuf_count; + unsigned lro_mbuf_max; + unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */ + unsigned lro_length_lim; /* max len of aggregated data */ + u_long lro_hashsz; + struct lro_head *lro_hash; struct lro_head lro_active; struct lro_head lro_free; }; +#define TCP_LRO_LENGTH_MAX 65535 +#define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */ + int tcp_lro_init(struct lro_ctrl *); +int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); +void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); +void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); +void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); +#define TCP_LRO_NO_ENTRIES -2 #define TCP_LRO_CANNOT -1 #define TCP_LRO_NOT_SUPPORTED 1 diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c index 1a90f408..78275fb8 100644 --- a/freebsd/sys/netinet/tcp_offload.c +++ b/freebsd/sys/netinet/tcp_offload.c @@ -39,14 +39,15 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include -#include #include #define TCPOUTFLAGS #include +#include #include int registered_toedevs; diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c index 550af64f..af11d805 100644 --- a/freebsd/sys/netinet/tcp_output.c +++ b/freebsd/sys/netinet/tcp_output.c @@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -56,8 +57,8 @@ __FBSDID("$FreeBSD$"); #include #include -#include #include +#include #include #include #include @@ -68,12 +69,20 @@ __FBSDID("$FreeBSD$"); #include #include #endif +#ifdef TCP_RFC7413 +#include +#endif +#include #define TCPOUTFLAGS #include #include #include #include #include +#include +#ifdef TCPPCAP +#include +#endif #ifdef TCPDEBUG #include #endif @@ -90,46 +99,56 @@ __FBSDID("$FreeBSD$"); #include VNET_DEFINE(int, path_mtu_discovery) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; #define V_tcp_do_tso VNET(tcp_do_tso) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) 
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0, "Incrementor step size of automatic send buffer"); VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024; #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0, "Max size of automatic send buffer"); +/* + * Make sure that either retransmit or persist timer is set for SYN, FIN and + * non-ACK. + */ +#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ + KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ + tcp_timer_active((tp), TT_REXMT) || \ + tcp_timer_active((tp), TT_PERSIST), \ + ("neither rexmt nor persist timer is set")) + static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, long len, int tso); static void inline cc_after_idle(struct tcpcb *tp); /* - * Wrapper for the TCP established ouput helper hook. + * Wrapper for the TCP established output helper hook. */ static void inline hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, @@ -201,6 +220,17 @@ tcp_output(struct tcpcb *tp) return (tcp_offload_output(tp)); #endif +#ifdef TCP_RFC7413 + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + return (0); +#endif /* * Determine length of data that should be transmitted, * and flags that will be used. @@ -322,7 +352,7 @@ after_sack_rexmit: * to send then the probe will be the FIN * itself. */ - if (off < so->so_snd.sb_cc) + if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { @@ -348,7 +378,8 @@ after_sack_rexmit: */ if (sack_rxmit == 0) { if (sack_bytes_rxmt == 0) - len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); + len = ((long)ulmin(sbavail(&so->so_snd), sendwin) - + off); else { long cwin; @@ -357,8 +388,8 @@ after_sack_rexmit: * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - - off); + len = ((long)ulmin(sbavail(&so->so_snd), tp->snd_wnd) - + off); /* * Don't remove this (len > 0) check ! * We explicitly check for len > 0 here (although it @@ -386,6 +417,15 @@ after_sack_rexmit: if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; +#ifdef TCP_RFC7413 + /* + * When sending additional segments following a TFO SYN|ACK, + * do not include the SYN bit. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED)) + flags &= ~TH_SYN; +#endif off--, len++; } @@ -399,7 +439,18 @@ after_sack_rexmit: flags &= ~TH_FIN; } - if (len < 0) { +#ifdef TCP_RFC7413 + /* + * When retransmitting SYN|ACK on a passively-created TFO socket, + * don't include data, as the presence of data may have caused the + * original SYN|ACK to have been dropped by a middlebox. 
+ */ + if ((tp->t_flags & TF_FASTOPEN) && + (((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)) || + (flags & TH_RST))) + len = 0; +#endif + if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, @@ -409,9 +460,16 @@ after_sack_rexmit: * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. + * + * We also do a general check here to ensure that + * we will set the persist timer when we have data + * to send, but a 0-byte window. This makes sure + * the persist timer is set even if the packet + * hits one of the "goto send" lines below. */ len = 0; - if (sendwin == 0) { + if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && + (off < (int) sbavail(&so->so_snd))) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; @@ -449,20 +507,23 @@ after_sack_rexmit: * and does at most one step per received ACK. This fast * scaling has the drawback of growing the send buffer beyond * what is strictly necessary to make full use of a given - * delay*bandwith product. However testing has shown this not + * delay*bandwidth product. However testing has shown this not * to be much of an problem. At worst we are trading wasting - * of available bandwith (the non-use of it) for wasting some + * of available bandwidth (the non-use of it) for wasting some * socket buffer memory. * * TODO: Shrink send buffer during idle periods together * with congestion window. Requires another timer. Has to * wait for upcoming tcp timer rewrite. + * + * XXXGL: should there be used sbused() or sbavail()? */ if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && - so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < V_tcp_autosndbuf_max && - sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + sbused(&so->so_snd) >= (so->so_snd.sb_hiwat / 8 * 7) && + sbused(&so->so_snd) < V_tcp_autosndbuf_max && + sendwin >= (sbused(&so->so_snd) - + (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, V_tcp_autosndbuf_max), so, curthread)) @@ -499,10 +560,11 @@ after_sack_rexmit: tso = 1; if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + + sbused(&so->so_snd))) flags &= ~TH_FIN; } @@ -532,7 +594,7 @@ after_sack_rexmit: */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && - len + off >= so->so_snd.sb_cc && + len + off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } @@ -660,7 +722,7 @@ dontupdate: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ - if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && + if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); @@ -675,6 +737,12 @@ just_return: send: SOCKBUF_LOCK_ASSERT(&so->so_snd); + if (len > 0) { + if (len >= tp->t_maxseg) + tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; + else + tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; + } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. @@ -697,13 +765,29 @@ send: * segments. 
Options for SYN-ACK segments are handled in TCP * syncache. */ + to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { - to.to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); to.to_flags |= TOF_MSS; +#ifdef TCP_RFC7413 + /* + * Only include the TFO option on the first + * transmission of the SYN|ACK on a + * passively-created TFO socket, as the presence of + * the TFO option may have caused the original + * SYN|ACK to have been dropped by a middlebox. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + (tp->t_rxtshift == 0)) { + to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; + to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; + to.to_flags |= TOF_FASTOPEN; + } +#endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { @@ -759,11 +843,11 @@ send: /* * Adjust data length if insertion of options will - * bump the packet length beyond the t_maxopd length. + * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. */ - if (len + optlen + ipoptlen > tp->t_maxopd) { + if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { @@ -793,7 +877,8 @@ send: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ - max_len = (if_hw_tsomax - hdrlen); + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { @@ -808,6 +893,15 @@ send: */ if (if_hw_tsomaxsegcount != 0 && if_hw_tsomaxsegsize != 0) { + /* + * Subtract one segment for the LINK + * and TCP/IP headers mbuf that will + * be prepended to this mbuf chain + * after the code in this section + * limits the number of mbufs in the + * chain to if_hw_tsomaxsegcount. + */ + if_hw_tsomaxsegcount -= 1; max_len = 0; mb = sbsndmbuf(&so->so_snd, off, &moff); @@ -856,8 +950,8 @@ send: * fractional unless the send sockbuf can be * emptied: */ - max_len = (tp->t_maxopd - optlen); - if ((off + len) < so->so_snd.sb_cc) { + max_len = (tp->t_maxseg - optlen); + if ((off + len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; @@ -886,7 +980,7 @@ send: sendalot = 1; } else { - len = tp->t_maxopd - optlen - ipoptlen; + len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; } } else @@ -929,23 +1023,20 @@ send: TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); } - MGETHDR(m, M_DONTWAIT, MT_DATA); +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; + sack_rxmit = 0; goto out; } -#ifdef INET6 - if (MHLEN < hdrlen + max_linkhdr) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - m_freem(m); - error = ENOBUFS; - goto out; - } - } -#endif + m->m_data += max_linkhdr; m->m_len = hdrlen; @@ -965,6 +1056,7 @@ send: SOCKBUF_UNLOCK(&so->so_snd); (void) m_free(m); error = ENOBUFS; + sack_rxmit = 0; goto out; } } @@ -975,7 +1067,7 @@ send: * give data to the user when a buffer fills or * a PUSH comes in.) 
*/ - if (off + len == so->so_snd.sb_cc) + if ((off + len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(&so->so_snd); } else { @@ -989,15 +1081,16 @@ send: else TCPSTAT_INC(tcps_sndwinup); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; + sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { - MH_ALIGN(m, hdrlen); + M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; @@ -1036,7 +1129,7 @@ send: * resend those bits a number of times as per * RFC 3168. */ - if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE|TH_CWR; @@ -1153,7 +1246,7 @@ send: tp->snd_up = tp->snd_una; /* drag it along */ #ifdef TCP_SIGNATURE - if (tp->t_flags & TF_SIGNATURE) { + if (to.to_flags & TOF_SIGNATURE) { int sigoff = to.to_signature - opt; tcp_signature_compute(m, 0, len, optlen, (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); @@ -1195,13 +1288,12 @@ send: /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. - * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { - KASSERT(len > tp->t_maxopd - optlen, + KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; - m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } #ifdef IPSEC @@ -1214,75 +1306,6 @@ send: __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); #endif - /* - * In transmit state, time the transmission and arrange for - * the retransmit. In persist state, just set snd_max. - */ - if ((tp->t_flags & TF_FORCEDATA) == 0 || - !tcp_timer_active(tp, TT_PERSIST)) { - tcp_seq startseq = tp->snd_nxt; - - /* - * Advance snd_nxt over sequence space of this segment. - */ - if (flags & (TH_SYN|TH_FIN)) { - if (flags & TH_SYN) - tp->snd_nxt++; - if (flags & TH_FIN) { - tp->snd_nxt++; - tp->t_flags |= TF_SENTFIN; - } - } - if (sack_rxmit) - goto timer; - tp->snd_nxt += len; - if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { - tp->snd_max = tp->snd_nxt; - /* - * Time this transmission if not a retransmission and - * not currently timing anything. - */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - TCPSTAT_INC(tcps_segstimed); - } - } - - /* - * Set retransmit timer if not currently set, - * and not doing a pure ack or a keep-alive probe. - * Initial value for retransmit timer is smoothed - * round-trip time + 2 * round-trip time variance. - * Initialize shift counter which is used for backoff - * of retransmit time. - */ -timer: - if (!tcp_timer_active(tp, TT_REXMT) && - ((sack_rxmit && tp->snd_nxt != tp->snd_max) || - (tp->snd_nxt != tp->snd_una))) { - if (tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_PERSIST, 0); - tp->t_rxtshift = 0; - } - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); - } - } else { - /* - * Persist case, update snd_max but since we are in - * persist mode (no window) we do not update snd_nxt. - */ - int xlen = len; - if (flags & TH_SYN) - ++xlen; - if (flags & TH_FIN) { - ++xlen; - tp->t_flags |= TF_SENTFIN; - } - if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) - tp->snd_max = tp->snd_nxt + len; - } - /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. 
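Under TSO the transmit length above is first capped by the hardware limit, now also reserving max_linkhdr, and then trimmed so only whole option-adjusted segments go out unless the burst empties the send buffer. Worked through with illustrative numbers:

#include <stdio.h>

int
main(void)
{
	long len = 45000;		/* candidate TSO burst */
	long if_hw_tsomax = 65535;	/* hardware frame limit */
	long hdrlen = 52, max_linkhdr = 16;
	long maxseg = 1448, optlen = 12;

	long max_len = if_hw_tsomax - hdrlen - max_linkhdr;
	if (len > max_len)
		len = max_len;

	/* Trim a fractional trailing segment (buffer not emptied). */
	long seg = maxseg - optlen;	/* 1436 payload bytes per packet */
	len -= len % seg;
	printf("%ld\n", len);		/* prints 44516, i.e. 31 * 1436 */
	return (0);
}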
*/ hhook_run_tcp_est_out(tp, th, &to, len, tso); @@ -1306,6 +1329,7 @@ timer: ipov->ih_len = save; } #endif /* TCPDEBUG */ + TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); /* * Fill in IP length and desired time to live and @@ -1314,7 +1338,7 @@ timer: * the template, but need a way to checksum without them. */ /* - * m->m_pkthdr.len should have been set before cksum calcuration, + * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 @@ -1330,13 +1354,35 @@ timer: */ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); + /* + * Set the packet size here for the benefit of DTrace probes. + * ip6_output() will set it properly; it's supposed to include + * the option header lengths as well. + */ + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); + + TCP_PROBE5(send, NULL, tp, ip6, tp, th); + +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, tp->t_inpcb->in6p_outputopts, &ro, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_rmx.rmx_mtu; + mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); } #endif /* INET6 */ @@ -1345,10 +1391,7 @@ timer: #endif #ifdef INET { - struct route ro; - - bzero(&ro, sizeof(ro)); - ip->ip_len = m->m_pkthdr.len; + ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); @@ -1361,18 +1404,126 @@ timer: * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ - if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) - ip->ip_off |= IP_DF; + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + + if (tp->t_state == TCPS_SYN_SENT) + TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); - error = ip_output(m, tp->t_inpcb->inp_options, &ro, + TCP_PROBE5(send, NULL, tp, ip, tp, th); + +#ifdef TCPPCAP + /* Save packet, if requested. */ + tcp_pcap_add(th, m, &(tp->t_outpkts)); +#endif + + error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); - if (error == EMSGSIZE && ro.ro_rt != NULL) - mtu = ro.ro_rt->rt_rmx.rmx_mtu; - RO_RTFREE(&ro); + if (error == EMSGSIZE && tp->t_inpcb->inp_route.ro_rt != NULL) + mtu = tp->t_inpcb->inp_route.ro_rt->rt_mtu; } #endif /* INET */ + +out: + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if ((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + if (sack_rxmit) + goto timer; + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. 
+ */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + TCPSTAT_INC(tcps_segstimed); + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing a pure ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ +timer: + if (!tcp_timer_active(tp, TT_REXMT) && + ((sack_rxmit && tp->snd_nxt != tp->snd_max) || + (tp->snd_nxt != tp->snd_una))) { + if (tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; + } + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } else if (len == 0 && sbavail(&so->so_snd) && + !tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) { + /* + * Avoid a situation where we do not set persist timer + * after a zero window condition. For example: + * 1) A -> B: packet with enough data to fill the window + * 2) B -> A: ACK for #1 + new data (0 window + * advertisement) + * 3) A -> B: ACK for #2, 0 len packet + * + * In this case, A will not activate the persist timer, + * because it chose to send a packet. Unless tcp_output + * is called for some other reason (delayed ack timer, + * another input packet from B, socket syscall), A will + * not send zero window probes. + * + * So, if you send a 0-length packet, but there is data + * in the socket buffer, and neither the rexmt or + * persist timer is already set, then activate the + * persist timer. + */ + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + } else { + /* + * Persist case, update snd_max but since we are in + * persist mode (no window) we do not update snd_nxt. + */ + int xlen = len; + if (flags & TH_SYN) + ++xlen; + if (flags & TH_FIN) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + } + if (error) { /* @@ -1400,16 +1551,13 @@ timer: } else tp->snd_nxt -= len; } -out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: - if (!tcp_timer_active(tp, TT_REXMT) && - !tcp_timer_active(tp, TT_PERSIST)) - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + TCP_XMIT_TIMER_ASSERT(tp, len, flags); tp->snd_cwnd = tp->t_maxseg; return (0); case EMSGSIZE: @@ -1481,10 +1629,10 @@ tcp_setpersist(struct tcpcb *tp) if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* - * Start/restart persistance timer. + * Start/restart persistence timer. 
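The new else-if branch above closes exactly the hole its comment describes: a pure ACK sent against a zero window used to leave both timers idle, so zero window probes were never sent. A standalone sketch of the arming condition, with plain booleans standing in for tcp_timer_active() and sbavail() (assumed stand-ins, not kernel calls):

#include <stdbool.h>
#include <stddef.h>

/*
 * Arm the persist timer when a zero-length segment was sent while
 * data is still queued and neither the retransmit nor the persist
 * timer is running; otherwise zero window probes would never fire.
 */
static bool
need_persist_timer(int sent_len, size_t queued,
    bool rexmt_active, bool persist_active)
{
	return (sent_len == 0 && queued > 0 &&
	    !rexmt_active && !persist_active);
}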
*/ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX); + tcp_persmin, tcp_persmax); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; @@ -1510,7 +1658,7 @@ tcp_setpersist(struct tcpcb *tp) int tcp_addoptions(struct tcpopt *to, u_char *optp) { - u_int mask, optlen = 0; + u_int32_t mask, optlen = 0; for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { if ((to->to_flags & mask) != mask) @@ -1572,6 +1720,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); optp += sizeof(to->to_tsecr); break; +#ifdef TCP_SIGNATURE case TOF_SIGNATURE: { int siglen = TCPOLEN_SIGNATURE - 2; @@ -1590,6 +1739,7 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) *optp++ = 0; break; } +#endif case TOF_SACK: { int sackblks = 0; @@ -1620,6 +1770,25 @@ tcp_addoptions(struct tcpopt *to, u_char *optp) TCPSTAT_INC(tcps_sack_send_blocks); break; } +#ifdef TCP_RFC7413 + case TOF_FASTOPEN: + { + int total_len; + + /* XXX is there any point to aligning this option? */ + total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len; + if (TCP_MAXOLEN - optlen < total_len) + continue; + *optp++ = TCPOPT_FAST_OPEN; + *optp++ = total_len; + if (to->to_tfo_len > 0) { + bcopy(to->to_tfo_cookie, optp, to->to_tfo_len); + optp += to->to_tfo_len; + } + optlen += total_len; + break; + } +#endif default: panic("%s: unknown TCP option type", __func__); break; diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c index 2570a5f3..49184a5f 100644 --- a/freebsd/sys/netinet/tcp_reass.c +++ b/freebsd/sys/netinet/tcp_reass.c @@ -40,6 +40,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -51,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -76,67 +78,46 @@ __FBSDID("$FreeBSD$"); #include #endif /* TCPDEBUG */ -static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS); - static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static VNET_DEFINE(int, tcp_reass_maxseg) = 0; -#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg) -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &VNET_NAME(tcp_reass_maxseg), 0, +static int tcp_reass_maxseg = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, + &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); -SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments, - (CTLTYPE_INT | CTLFLAG_RD), NULL, 0, &tcp_reass_sysctl_qsize, "I", +static uma_zone_t tcp_reass_zone; +SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, 0, + &tcp_reass_zone, "Global number of TCP Segments currently in Reassembly Queue"); -static VNET_DEFINE(int, tcp_reass_overflows) = 0; -#define V_tcp_reass_overflows VNET(tcp_reass_overflows) -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows, - CTLFLAG_RD, - &VNET_NAME(tcp_reass_overflows), 0, - "Global number of TCP Segment Reassembly Queue Overflows"); - -static VNET_DEFINE(uma_zone_t, tcp_reass_zone); -#define V_tcp_reass_zone VNET(tcp_reass_zone) - /* Initialize TCP reassembly queue */ static void tcp_reass_zone_change(void *tag) { /* Set the zone limit and read back the effective value. 
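The TOF_FASTOPEN case added above emits the RFC 7413 option as kind, total length, then the cookie bytes. A minimal sketch of that wire encoding (the constants match the RFC; the caller is assumed to have verified there is room, as the hunk does against TCP_MAXOLEN):

#include <stdint.h>
#include <string.h>

#define TCPOPT_FAST_OPEN	34	/* RFC 7413 option kind */
#define TCPOLEN_FAST_OPEN_EMPTY	2	/* kind + length, no cookie */

/* Append a Fast Open option at optp; returns the bytes written. */
static int
tfo_append(uint8_t *optp, const uint8_t *cookie, int cookie_len)
{
	int total_len = TCPOLEN_FAST_OPEN_EMPTY + cookie_len;

	optp[0] = TCPOPT_FAST_OPEN;
	optp[1] = (uint8_t)total_len;
	if (cookie_len > 0)
		memcpy(optp + 2, cookie, cookie_len);
	return (total_len);
}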
*/ - V_tcp_reass_maxseg = nmbclusters / 16; - V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, - V_tcp_reass_maxseg); + tcp_reass_maxseg = nmbclusters / 16; + tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone, + tcp_reass_maxseg); } void -tcp_reass_init(void) +tcp_reass_global_init(void) { - V_tcp_reass_maxseg = nmbclusters / 16; + tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &V_tcp_reass_maxseg); - V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), + &tcp_reass_maxseg); + tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* Set the zone limit and read back the effective value. */ - V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, - V_tcp_reass_maxseg); + tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone, + tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); } -#ifdef VIMAGE -void -tcp_reass_destroy(void) -{ - - uma_zdestroy(V_tcp_reass_zone); -} -#endif - void tcp_reass_flush(struct tcpcb *tp) { @@ -147,7 +128,7 @@ tcp_reass_flush(struct tcpcb *tp) while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(qe, tqe_q); m_freem(qe->tqe_m); - uma_zfree(V_tcp_reass_zone, qe); + uma_zfree(tcp_reass_zone, qe); tp->t_segqlen--; } @@ -156,15 +137,6 @@ tcp_reass_flush(struct tcpcb *tp) tp, tp->t_segqlen)); } -static int -tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS) -{ - int qsize; - - qsize = uma_zone_get_cur(V_tcp_reass_zone); - return (sysctl_handle_int(oidp, &qsize, 0, req)); -} - int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { @@ -209,15 +181,14 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { - V_tcp_reass_overflows++; - TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); + TCPSTAT_INC(tcps_rcvreassfull); *tlenp = 0; if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: queue limit reached, " "segment dropped\n", s, __func__); free(s, M_TCPLOG); } + m_freem(m); return (0); } @@ -228,7 +199,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) * Use a temporary structure on the stack for the missing segment * when the zone is exhausted. Otherwise we may get stuck. 
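The overflow check in tcp_reass() above caps the per-connection queue at roughly one receive buffer's worth of segments. The budget as a one-function sketch (plain integers stand in for the socket buffer and tcpcb fields):

/*
 * Reject a new out-of-order segment once the queue already holds
 * enough segments to fill the receive buffer, plus one.
 */
static int
reass_over_budget(int segqlen, unsigned int sb_hiwat, unsigned int maxseg)
{
	return (segqlen >= (int)(sb_hiwat / maxseg) + 1);
}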
*/ - te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT); + te = uma_zalloc(tcp_reass_zone, M_NOWAIT); if (te == NULL) { if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) { TCPSTAT_INC(tcps_rcvmemdrop); @@ -279,7 +250,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); m_freem(m); if (te != &tqs) - uma_zfree(V_tcp_reass_zone, te); + uma_zfree(tcp_reass_zone, te); tp->t_segqlen--; /* * Try to present any queued data @@ -316,7 +287,7 @@ tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - uma_zfree(V_tcp_reass_zone, q); + uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; q = nq; } @@ -353,13 +324,12 @@ present: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(q->tqe_m); else - sbappendstream_locked(&so->so_rcv, q->tqe_m); + sbappendstream_locked(&so->so_rcv, q->tqe_m, 0); if (q != &tqs) - uma_zfree(V_tcp_reass_zone, q); + uma_zfree(tcp_reass_zone, q); tp->t_segqlen--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); - ND6_HINT(tp); sorwakeup_locked(so); return (flags); } diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c index 9cc1d86a..c7e32cba 100644 --- a/freebsd/sys/netinet/tcp_sack.c +++ b/freebsd/sys/netinet/tcp_sack.c @@ -97,6 +97,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -130,24 +131,24 @@ VNET_DECLARE(struct uma_zone *, sack_hole_zone); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); VNET_DEFINE(int, tcp_do_sack) = 1; #define V_tcp_do_sack VNET(tcp_do_sack) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); VNET_DEFINE(int, tcp_sack_maxholes) = 128; #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_maxholes), 0, "Maximum number of TCP SACK holes allowed per connection"); VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sack_globalmaxholes), 0, "Global maximum number of TCP SACK holes"); VNET_DEFINE(int, tcp_sack_globalholes) = 0; #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) -SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); @@ -346,17 +347,22 @@ tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). + * Returns 1 if incoming ACK has previously unknown SACK information, + * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes + * to that (i.e. left edge moving) would also be considered a change in SACK + * information which is slightly different than rfc6675. 
*/ -void +int tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; - int i, j, num_sack_blks; + int i, j, num_sack_blks, sack_changed; INP_WLOCK_ASSERT(tp->t_inpcb); num_sack_blks = 0; + sack_changed = 0; /* * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, * treat [SND.UNA, SEG.ACK) as if it is a SACK block. @@ -370,6 +376,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) * received new blocks from the other side. */ if (to->to_flags & TOF_SACK) { + tp->sackhint.sacked_bytes = 0; /* reset */ for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); @@ -380,8 +387,11 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) SEQ_GT(sack.start, th_ack) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, tp->snd_una) && - SEQ_LEQ(sack.end, tp->snd_max)) + SEQ_LEQ(sack.end, tp->snd_max)) { sack_blocks[num_sack_blks++] = sack; + tp->sackhint.sacked_bytes += + (sack.end-sack.start); + } } } /* @@ -389,12 +399,12 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) * received. */ if (num_sack_blks == 0) - return; + return (sack_changed); /* * Sort the SACK blocks so we can update the scoreboard with just one - * pass. The overhead of sorting upto 4+1 elements is less than - * making upto 4+1 passes over the scoreboard. + * pass. The overhead of sorting up to 4+1 elements is less than + * making up to 4+1 passes over the scoreboard. */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { @@ -440,6 +450,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; + sack_changed = 1; } else { /* * We failed to add a new hole based on the current @@ -456,9 +467,11 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) SEQ_LT(tp->snd_fack, sblkp->end)) tp->snd_fack = sblkp->end; } - } else if (SEQ_LT(tp->snd_fack, sblkp->end)) + } else if (SEQ_LT(tp->snd_fack, sblkp->end)) { /* fack is advanced. */ tp->snd_fack = sblkp->end; + sack_changed = 1; + } /* We must have at least one SACK hole in scoreboard. */ KASSERT(!TAILQ_EMPTY(&tp->snd_holes), ("SACK scoreboard must not be empty")); @@ -487,6 +500,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); + sack_changed = 1; if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. 
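The scoreboard pass depends on the incoming SACK blocks being sorted by start sequence, and the patch additionally totals the bytes the option reports into sackhint.sacked_bytes. A standalone sketch of both steps, using the wrap-safe comparison the kernel's SEQ_GT macro performs (in the real code the byte total is accumulated while validating each block, before the sort):

#include <stdint.h>

typedef uint32_t tcp_seq;
#define SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)	/* wrap-safe */

struct sackblk { tcp_seq start, end; };

/* Sort up to 4+1 blocks by start and return the bytes they cover. */
static uint32_t
sort_sack_blocks(struct sackblk *blks, int n)
{
	uint32_t sacked = 0;
	int i, j;

	for (i = 0; i < n; i++) {
		for (j = i + 1; j < n; j++) {
			if (SEQ_GT(blks[i].start, blks[j].start)) {
				struct sackblk t = blks[i];

				blks[i] = blks[j];
				blks[j] = t;
			}
		}
		sacked += blks[i].end - blks[i].start;
	}
	return (sacked);
}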
*/ if (SEQ_GEQ(sblkp->end, cur->end)) { @@ -542,6 +556,7 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) else sblkp--; } + return (sack_changed); } /* @@ -586,7 +601,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); } #if 0 diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c index b175c0c0..cff9bd7b 100644 --- a/freebsd/sys/netinet/tcp_subr.c +++ b/freebsd/sys/netinet/tcp_subr.c @@ -47,18 +47,21 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include +#include #include #ifdef INET6 #include #endif #include #include +#include #include #include #include @@ -68,10 +71,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include -#include #include +#include +#include #include #include #include @@ -79,22 +84,32 @@ __FBSDID("$FreeBSD$"); #include #include #ifdef INET6 +#include #include +#include #include #include #include #include #endif +#ifdef TCP_RFC7413 +#include +#endif +#include #include #include #include #include #include +#include #ifdef INET6 #include #endif #include +#ifdef TCPPCAP +#include +#endif #ifdef TCPDEBUG #include #endif @@ -125,6 +140,8 @@ VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif +struct rwlock tcp_function_lock; + static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { @@ -141,8 +158,8 @@ sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); @@ -163,8 +180,8 @@ sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ @@ -178,12 +195,12 @@ SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, * checking. This setting prevents us from sending too small packets. 
*/ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; -SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); @@ -191,30 +208,30 @@ static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); -static int tcp_tcbhashsize = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, +static int tcp_tcbhashsize; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_VNET_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); static VNET_DEFINE(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); -static int tcp_soreceive_stream = 0; +static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); @@ -231,9 +248,193 @@ VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); +static void tcp_mtudisc(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, const void *ip6hdr); + +static struct tcp_function_block tcp_def_funcblk = { + "default", + tcp_output, + tcp_do_segment, + tcp_default_ctloutput, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + 0, + 0 +}; + +int t_functions_inited = 0; +struct tcp_funchead t_functions; +static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; + +static void +init_tcp_functions(void) +{ + if (t_functions_inited == 0) { + TAILQ_INIT(&t_functions); + rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); + t_functions_inited = 1; + } +} + +static struct tcp_function_block * +find_tcp_functions_locked(struct tcp_function_set *fs) +{ + struct tcp_function *f; + struct tcp_function_block *blk=NULL; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { + blk = f->tf_fb; + break; + } + } + return(blk); +} + +static struct tcp_function_block * +find_tcp_fb_locked(struct tcp_function_block *blk, struct 
tcp_function **s) +{ + struct tcp_function_block *rblk=NULL; + struct tcp_function *f; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (f->tf_fb == blk) { + rblk = blk; + if (s) { + *s = f; + } + break; + } + } + return (rblk); +} + +struct tcp_function_block * +find_and_ref_tcp_functions(struct tcp_function_set *fs) +{ + struct tcp_function_block *blk; + + rw_rlock(&tcp_function_lock); + blk = find_tcp_functions_locked(fs); + if (blk) + refcount_acquire(&blk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(blk); +} + +struct tcp_function_block * +find_and_ref_tcp_fb(struct tcp_function_block *blk) +{ + struct tcp_function_block *rblk; + + rw_rlock(&tcp_function_lock); + rblk = find_tcp_fb_locked(blk, NULL); + if (rblk) + refcount_acquire(&rblk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(rblk); +} + + +static int +sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) +{ + int error=ENOENT; + struct tcp_function_set fs; + struct tcp_function_block *blk; + + memset(&fs, 0, sizeof(fs)); + rw_rlock(&tcp_function_lock); + blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); + if (blk) { + /* Found him */ + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); + fs.pcbcnt = blk->tfb_refcnt; + } + rw_runlock(&tcp_function_lock); + error = sysctl_handle_string(oidp, fs.function_set_name, + sizeof(fs.function_set_name), req); + + /* Check for error or no change */ + if (error != 0 || req->newptr == NULL) + return(error); + + rw_wlock(&tcp_function_lock); + blk = find_tcp_functions_locked(&fs); + if ((blk == NULL) || + (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { + error = ENOENT; + goto done; + } + tcp_func_set_ptr = blk; +done: + rw_wunlock(&tcp_function_lock); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, + CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_net_inet_default_tcp_functions, "A", + "Set/get the default TCP functions"); + +static int +sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) +{ + int error, cnt, linesz; + struct tcp_function *f; + char *buffer, *cp; + size_t bufsz, outsz; + + cnt = 0; + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + cnt++; + } + rw_runlock(&tcp_function_lock); + + bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; + buffer = malloc(bufsz, M_TEMP, M_WAITOK); + + error = 0; + cp = buffer; + + linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); + cp += linesz; + bufsz -= linesz; + outsz = linesz; + + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + linesz = snprintf(cp, bufsz, "%-32s%c %u\n", + f->tf_fb->tfb_tcp_block_name, + (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', + f->tf_fb->tfb_refcnt); + if (linesz >= bufsz) { + error = EOVERFLOW; + break; + } + cp += linesz; + bufsz -= linesz; + outsz += linesz; + } + rw_runlock(&tcp_function_lock); + if (error == 0) + error = sysctl_handle_string(oidp, buffer, outsz + 1, req); + free(buffer, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, + CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_net_inet_list_available, "A", + "list available TCP Function sets"); + /* * Target size of TCP PCB hash tables. Must be a power of two. 
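find_and_ref_tcp_functions() above pairs a read-locked list walk with a reference taken before the lock is dropped, so a block found this way cannot be deregistered out from under the caller. A userspace sketch of the same pattern, with pthreads and a GCC atomic standing in for rwlock(9) and refcount(9):

#include <pthread.h>
#include <string.h>

struct funcblk {
	const char	*name;
	int		refcnt;
	struct funcblk	*next;
};

static pthread_rwlock_t fb_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct funcblk *fb_head;

/* Look up a block by name and take a reference under the read lock. */
static struct funcblk *
find_and_ref(const char *name)
{
	struct funcblk *f;

	pthread_rwlock_rdlock(&fb_lock);
	for (f = fb_head; f != NULL; f = f->next) {
		if (strcmp(f->name, name) == 0) {
			__sync_fetch_and_add(&f->refcnt, 1);
			break;
		}
	}
	pthread_rwlock_unlock(&fb_lock);
	return (f);
}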
* @@ -241,7 +442,7 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE -#define TCBHASHSIZE 512 +#define TCBHASHSIZE 0 #endif /* @@ -261,6 +462,8 @@ static VNET_DEFINE(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); +MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); + static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -288,48 +491,196 @@ tcp_inpcb_init(void *mem, int size, int flags) return (0); } +/* + * Take a value and get the next power of 2 that doesn't overflow. + * Used to size the tcp_inpcb hash buckets. + */ +static int +maketcp_hashsize(int size) +{ + int hashsize; + + /* + * auto tune. + * get the next power of 2 higher than maxsockets. + */ + hashsize = 1 << fls(size); + /* catch overflow, and just go one power of 2 smaller */ + if (hashsize < size) { + hashsize = 1 << (fls(size) - 1); + } + return (hashsize); +} + +int +register_tcp_functions(struct tcp_function_block *blk, int wait) +{ + struct tcp_function_block *lblk; + struct tcp_function *n; + struct tcp_function_set fs; + + if (t_functions_inited == 0) { + init_tcp_functions(); + } + if ((blk->tfb_tcp_output == NULL) || + (blk->tfb_tcp_do_segment == NULL) || + (blk->tfb_tcp_ctloutput == NULL) || + (strlen(blk->tfb_tcp_block_name) == 0)) { + /* + * These functions are required and you + * need a name. + */ + return (EINVAL); + } + if (blk->tfb_tcp_timer_stop_all || + blk->tfb_tcp_timer_activate || + blk->tfb_tcp_timer_active || + blk->tfb_tcp_timer_stop) { + /* + * If you define one timer function you + * must have them all. + */ + if ((blk->tfb_tcp_timer_stop_all == NULL) || + (blk->tfb_tcp_timer_activate == NULL) || + (blk->tfb_tcp_timer_active == NULL) || + (blk->tfb_tcp_timer_stop == NULL)) { + return (EINVAL); + } + } + n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); + if (n == NULL) { + return (ENOMEM); + } + n->tf_fb = blk; + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); + rw_wlock(&tcp_function_lock); + lblk = find_tcp_functions_locked(&fs); + if (lblk) { + /* Duplicate name space not allowed */ + rw_wunlock(&tcp_function_lock); + free(n, M_TCPFUNCTIONS); + return (EALREADY); + } + refcount_init(&blk->tfb_refcnt, 0); + blk->tfb_flags = 0; + TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + rw_wunlock(&tcp_function_lock); + return(0); +} + +int +deregister_tcp_functions(struct tcp_function_block *blk) +{ + struct tcp_function_block *lblk; + struct tcp_function *f; + int error=ENOENT; + + if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { + /* You can't un-register the default */ + return (EPERM); + } + rw_wlock(&tcp_function_lock); + if (blk == tcp_func_set_ptr) { + /* You can't free the current default */ + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + if (blk->tfb_refcnt) { + /* Still tcb attached, mark it. 
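maketcp_hashsize() above rounds a request up to a power of two with fls() and steps back one power if the shift overflowed. The same logic as a standalone function (fls() here is the BSD libc routine returning the one-based index of the highest set bit; substitute an equivalent elsewhere). For example, a request of 600 yields 1024, and tcp_init() then floors the autotuned result at the historical default of 512:

#include <strings.h>	/* fls() in BSD libc */

/* Next power of two above size; back off one power on overflow. */
static int
next_pow2(int size)
{
	int hashsize = 1 << fls(size);

	if (hashsize < size)	/* the shift wrapped, go one power lower */
		hashsize = 1 << (fls(size) - 1);
	return (hashsize);
}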
*/ + blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + lblk = find_tcp_fb_locked(blk, &f); + if (lblk) { + /* Found */ + TAILQ_REMOVE(&t_functions, f, tf_next); + f->tf_fb = NULL; + free(f, M_TCPFUNCTIONS); + error = 0; + } + rw_wunlock(&tcp_function_lock); + return (error); +} + void tcp_init(void) { + const char *tcbhash_tuneable; int hashsize; + tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; + if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); - hashsize = TCBHASHSIZE; - TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); + TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); + if (hashsize == 0) { + /* + * Auto tune the hash size based on maxsockets. + * A perfect hash would have a 1:1 mapping + * (hashsize = maxsockets) however it's been + * suggested that O(2) average is better. + */ + hashsize = maketcp_hashsize(maxsockets / 4); + /* + * Our historical default is 512, + * do not autotune lower than this. + */ + if (hashsize < 512) + hashsize = 512; + if (bootverbose && IS_DEFAULT_VNET(curvnet)) + printf("%s: %s auto tuned to %d\n", __func__, + tcbhash_tuneable, hashsize); + } + /* + * We require a hashsize to be a power of two. + * Previously if it was not a power of two we would just reset it + * back to 512, which could be a nasty surprise if you did not notice + * the error message. + * Instead what we do is clip it to the closest power of two lower + * than the specified hash value. + */ if (!powerof2(hashsize)) { - printf("WARNING: TCB hash size not a power of 2\n"); - hashsize = 512; /* safe default */ + int oldhashsize = hashsize; + + hashsize = maketcp_hashsize(hashsize); + /* prevent absurdly low value */ + if (hashsize < 16) + hashsize = 16; + printf("%s: WARNING: TCB hash size not a power of 2, " + "clipped from %d to %d.\n", __func__, oldhashsize, + hashsize); } in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, NULL, UMA_ZONE_NOFREE, - IPI_HASHFIELDS_4TUPLE); + "tcp_inpcb", tcp_inpcb_init, NULL, 0, IPI_HASHFIELDS_4TUPLE); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcpcb_zone, maxsockets); + uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); tcp_tw_init(); syncache_init(); tcp_hc_init(); - tcp_reass_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* Skip initialization of globals for non-default instances. */ if (!IS_DEFAULT_VNET(curvnet)) return; + tcp_reass_global_init(); + /* XXX virtualize those bellow? 
*/ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; @@ -340,11 +691,15 @@ tcp_init(void) tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; + tcp_persmin = TCPTV_PERSMIN; + tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; tcp_tcbhashsize = hashsize; + /* Setup the tcp function block list */ + init_tcp_functions(); + register_tcp_functions(&tcp_def_funcblk, M_WAITOK); - TUNABLE_INT_FETCH("net.inet.tcp.soreceive_stream", &tcp_soreceive_stream); if (tcp_soreceive_stream) { #ifdef INET tcp_usrreqs.pru_soreceive = soreceive_stream; @@ -370,21 +725,64 @@ tcp_init(void) SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, EVENTHANDLER_PRI_ANY); +#ifdef TCPPCAP + tcp_pcap_init(); +#endif + +#ifdef TCP_RFC7413 + tcp_fastopen_init(); +#endif } #ifdef VIMAGE -void -tcp_destroy(void) +static void +tcp_destroy(void *unused __unused) { + int error, n; - tcp_reass_destroy(); + /* + * All our processes are gone, all our sockets should be cleaned + * up, which means, we should be past the tcp_discardcb() calls. + * Sleep to let all tcpcb timers really disappear and cleanup. + */ + for (;;) { + INP_LIST_RLOCK(&V_tcbinfo); + n = V_tcbinfo.ipi_count; + INP_LIST_RUNLOCK(&V_tcbinfo); + if (n == 0) + break; + pause("tcpdes", hz / 10); + } tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); + /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); + +#ifdef TCP_RFC7413 + /* + * Cannot free the zone until all tcpcbs are released as we attach + * the allocations to them. + */ + tcp_fastopen_destroy(); +#endif + + error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); + if (error != 0) { + printf("%s: WARNING: unable to deregister helper hook " + "type=%d, id=%d: error %d returned\n", __func__, + HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); + } + error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); + if (error != 0) { + printf("%s: WARNING: unable to deregister helper hook " + "type=%d, id=%d: error %d returned\n", __func__, + HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); + } } +VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void @@ -473,31 +871,33 @@ tcpip_maketemplate(struct inpcb *inp) /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy - * of the tcpiphdr at ti and send directly to the addressed host. + * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send - * a message back to the TCP which originated the * segment ti, + * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * - * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
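The vnet teardown in tcp_destroy() above cannot proceed until every tcpcb timer has drained, so it polls ipi_count and sleeps in tenth-of-a-second steps. A userspace sketch of that drain loop (usleep() stands in for pause(9)):

#include <unistd.h>

/* Poll a shared count, sleeping briefly, until everything drains. */
static void
wait_for_drain(volatile const int *count)
{
	while (*count != 0)
		usleep(100000);	/* the kernel uses pause("tcpdes", hz / 10) */
}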
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, int flags) { - int tlen; - int win = 0; + struct tcpopt to; + struct inpcb *inp; struct ip *ip; + struct mbuf *optm; struct tcphdr *nth; + u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ - int ipflags = 0; - struct inpcb *inp; + int optlen, tlen, win; + bool incl_opts; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); @@ -514,18 +914,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, } else inp = NULL; + incl_opts = false; + win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } + if ((tp->t_flags & TF_NOOPT) == 0) + incl_opts = true; } if (m == NULL) { - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; - tlen = 0; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { @@ -535,35 +938,71 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ - { - bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); - ip = mtod(m, struct ip *); - nth = (struct tcphdr *)(ip + 1); - } + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; + } else if (!M_WRITABLE(m)) { + struct mbuf *n; + + /* Can't reuse 'm', allocate a new mbuf. */ + n = m_gethdr(M_NOWAIT, MT_DATA); + if (n == NULL) { + m_freem(m); + return; + } + + if (!m_dup_pkthdr(n, m, M_NOWAIT)) { + m_freem(m); + m_freem(n); + return; + } + + n->m_data += max_linkhdr; + /* m_len is set later */ +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(n, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(n, struct ip6_hdr *); + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); + ip = mtod(n, struct ip *); + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + xchg(nth->th_dport, nth->th_sport, uint16_t); + th = nth; + m_freem(m); + m = n; } else { /* * reuse the mbuf. - * XXX MRT We inherrit the FIB, which is lucky. + * XXX MRT We inherit the FIB, which is lucky. 
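Whichever of the three mbuf paths the rewritten tcp_respond() takes (fresh allocation, copy of a read-only chain, or in-place reuse), the reply is formed by exchanging the endpoints of the received headers, which is all the xchg() macro does. The swap in isolation (a flat struct stands in for the IP and TCP headers inside the mbuf):

#include <stdint.h>

struct endpoints {
	uint32_t	src, dst;	/* network byte order */
	uint16_t	sport, dport;
};

/* Turn a received header into a reply by swapping both endpoints. */
static void
make_reply(struct endpoints *e)
{
	uint32_t a;
	uint16_t p;

	a = e->src;   e->src = e->dst;     e->dst = a;
	p = e->sport; e->sport = e->dport; e->dport = p;
}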
*/ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; - m_addr_changed(m); /* m_len is set later */ - tlen = 0; -#define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ - { - xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); - nth = (struct tcphdr *)(ip + 1); - } + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); + nth = (struct tcphdr *)(ip + 1); + } if (th != nth) { /* * this is usually a case when an extension header @@ -576,13 +1015,65 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } + tlen = 0; +#ifdef INET6 + if (isipv6) + tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + tlen = sizeof (struct tcpiphdr); +#endif +#ifdef INVARIANTS + m->m_len = 0; + KASSERT(M_TRAILINGSPACE(m) >= tlen, + ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", + m, tlen, (long)M_TRAILINGSPACE(m))); +#endif + m->m_len = tlen; + to.to_flags = 0; + if (incl_opts) { + /* Make sure we have room. */ + if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { + m->m_next = m_get(M_NOWAIT, MT_DATA); + if (m->m_next) { + optp = mtod(m->m_next, u_char *); + optm = m->m_next; + } else + incl_opts = false; + } else { + optp = (u_char *) (nth + 1); + optm = m; + } + } + if (incl_opts) { + /* Timestamps. */ + if (tp->t_flags & TF_RCVD_TSTMP) { + to.to_tsval = tcp_ts_getticks() + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags |= TOF_TS; + } +#ifdef TCP_SIGNATURE + /* TCP-MD5 (RFC2385). */ + if (tp->t_flags & TF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif + + /* Add the options. */ + tlen += optlen = tcp_addoptions(&to, optp); + + /* Update m_len in the correct mbuf. */ + optm->m_len += optlen; + } else + optlen = 0; #ifdef INET6 if (isipv6) { ip6->ip6_flow = 0; ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; - ip6->ip6_plen = 0; /* Set in ip6_output(). 
*/ - tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) @@ -590,14 +1081,12 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, #endif #ifdef INET { - tlen += sizeof (struct tcpiphdr); - ip->ip_len = tlen; + ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (V_path_mtu_discovery) - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); } #endif - m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC @@ -619,7 +1108,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_x2 = 0; - nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; nth->th_flags = flags; if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); @@ -627,6 +1116,13 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, nth->th_win = htons((u_short)win); nth->th_urp = 0; +#ifdef TCP_SIGNATURE + if (to.to_flags & TOF_SIGNATURE) { + tcp_signature_compute(m, 0, 0, optlen, to.to_signature, + IPSEC_DIR_OUTBOUND); + } +#endif + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (isipv6) { @@ -651,15 +1147,21 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif + TCP_PROBE3(debug__output, tp, th, mtod(m, const char *)); + if (flags & TH_RST) + TCP_PROBE5(accept__refused, NULL, NULL, mtod(m, const char *), + tp, nth); + + TCP_PROBE5(send, NULL, tp, mtod(m, const char *), tp, nth); #ifdef INET6 if (isipv6) - (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); + (void) ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET - (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); + (void) ip_output(m, NULL, NULL, 0, NULL, inp); #endif } @@ -687,7 +1189,10 @@ tcp_newtcpcb(struct inpcb *inp) tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; - + rw_rlock(&tcp_function_lock); + tp->t_fb = tcp_func_set_ptr; + refcount_acquire(&tp->t_fb->tfb_refcnt); + rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ @@ -698,12 +1203,18 @@ tcp_newtcpcb(struct inpcb *inp) if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } @@ -713,25 +1224,31 @@ tcp_newtcpcb(struct inpcb *inp) #endif tp->t_timers = &tm->tt; /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ - tp->t_maxseg = tp->t_maxopd = + tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* Set up our timeouts. 
*/ - callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_rexmt, 1); + callout_init(&tp->t_timers->tt_persist, 1); + callout_init(&tp->t_timers->tt_keep, 1); + callout_init(&tp->t_timers->tt_2msl, 1); + callout_init(&tp->t_timers->tt_delack, 1); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); - tp->t_inpcb = inp; /* XXX */ + /* + * The tcpcb will hold a reference on its inpcb until tcp_discardcb() + * is called. + */ + in_pcbref(inp); /* Reference for tcpcb */ + tp->t_inpcb = inp; + /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives @@ -751,6 +1268,15 @@ tcp_newtcpcb(struct inpcb *inp) */ inp->inp_ip_ttl = V_ip_defttl; inp->inp_ppcb = tp; +#ifdef TCPPCAP + /* + * Init the TCP PCAP queues. + */ + tcp_pcap_tcpcb_init(tp); +#endif + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } return (tp); /* XXX */ } @@ -779,7 +1305,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); /* * New connections already part way through being initialised * with the CC algo we're removing will not race with this code @@ -809,7 +1335,7 @@ tcp_ccalgounload(struct cc_algo *unload_algo) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -827,12 +1353,12 @@ tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_state = TCPS_CLOSED; - (void) tcp_output(tp); + tcp_state_change(tp, TCPS_CLOSED); + (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -850,6 +1376,7 @@ tcp_discardcb(struct tcpcb *tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + int released; INP_WLOCK_ASSERT(inp); @@ -857,22 +1384,27 @@ tcp_discardcb(struct tcpcb *tp) * Make sure that all of our timers are stopped before we delete the * PCB. * - * XXXRW: Really, we would like to use callout_drain() here in order - * to avoid races experienced in tcp_timer.c where a timer is already - * executing at this point. However, we can't, both because we're - * running in a context where we can't sleep, and also because we - * hold locks required by the timers. What we instead need to do is - * test to see if callout_drain() is required, and if so, defer some - * portion of the remainder of tcp_discardcb() to an asynchronous - * context that can callout_drain() and then continue. Some care - * will be required to ensure that no further processing takes place - * on the tcpcb, even though it hasn't been freed (a flag?). + * If stopping a timer fails, we schedule a discard function in same + * callout, and the last discard function called will take care of + * deleting the tcpcb. 
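The new lifecycle takes an inpcb reference at tcpcb allocation (in_pcbref) and releases it when the last drain function frees the tcpcb; in_pcbrele_wlocked() reports whether that release freed the inpcb itself. A minimal sketch of the release half of such a counted object (illustrative, not the inpcb API):

#include <stdbool.h>
#include <stdlib.h>

struct obj { int refs; };

/* Drop one reference; returns true when the object was freed. */
static bool
obj_rele(struct obj *o)
{
	if (--o->refs > 0)
		return (false);
	free(o);
	return (true);
}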
*/ - callout_stop(&tp->t_timers->tt_rexmt); - callout_stop(&tp->t_timers->tt_persist); - callout_stop(&tp->t_timers->tt_keep); - callout_stop(&tp->t_timers->tt_2msl); - callout_stop(&tp->t_timers->tt_delack); + tp->t_timers->tt_draincnt = 0; + tcp_timer_stop(tp, TT_REXMT); + tcp_timer_stop(tp, TT_PERSIST); + tcp_timer_stop(tp, TT_KEEP); + tcp_timer_stop(tp, TT_2MSL); + tcp_timer_stop(tp, TT_DELACK); + if (tp->t_fb->tfb_tcp_timer_stop_all) { + /* + * Call the stop-all function of the methods, + * this function should call the tcp_timer_stop() + * method with each of the function specific timeouts. + * That stop will be called via the tfb_tcp_timer_stop() + * which should use the async drain function of the + * callout system (see tcp_var.h). + */ + tp->t_fb->tfb_tcp_timer_stop_all(tp); + } /* * If we got enough samples through the srtt filter, @@ -893,7 +1425,7 @@ tcp_discardcb(struct tcpcb *tp) * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. - * ssthresh is only set if packet loss occured on a session. + * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. @@ -909,14 +1441,14 @@ tcp_discardcb(struct tcpcb *tp) ssthresh = 2; ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 - (isipv6 ? sizeof (struct ip6_hdr) + - sizeof (struct tcphdr) : + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : #endif - sizeof (struct tcpiphdr) + sizeof (struct tcpiphdr) #ifdef INET6 - ) + ) #endif - ); + ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; @@ -941,6 +1473,12 @@ tcp_discardcb(struct tcpcb *tp) tcp_free_sackholes(tp); +#ifdef TCPPCAP + /* Free the TCP PCAP queues. */ + tcp_pcap_drain(&(tp->t_inpkts)); + tcp_pcap_drain(&(tp->t_outpkts)); +#endif + /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); @@ -949,8 +1487,51 @@ tcp_discardcb(struct tcpcb *tp) CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; - tp->t_inpcb = NULL; - uma_zfree(V_tcpcb_zone, tp); + if (tp->t_timers->tt_draincnt == 0) { + /* We own the last reference on tcpcb, let's free it. */ + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); + released = in_pcbrele_wlocked(inp); + KASSERT(!released, ("%s: inp %p should not have been released " + "here", __func__, inp)); + } +} + +void +tcp_timer_discard(void *ptp) +{ + struct inpcb *inp; + struct tcpcb *tp; + + tp = (struct tcpcb *)ptp; + CURVNET_SET(tp->t_vnet); + INP_INFO_RLOCK(&V_tcbinfo); + inp = tp->t_inpcb; + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", + __func__, tp)); + INP_WLOCK(inp); + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0, + ("%s: tcpcb has to be stopped here", __func__)); + tp->t_timers->tt_draincnt--; + if (tp->t_timers->tt_draincnt == 0) { + /* We own the last reference on this tcpcb, let's free it. 
*/ + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); + if (in_pcbrele_wlocked(inp)) { + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + } + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } /* @@ -963,15 +1544,27 @@ tcp_close(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); +#endif +#ifdef TCP_RFC7413 + /* + * This releases the TFO pending counter resource for TFO listen + * sockets as well as passively-created TFO sockets that transition + * from SYN_RECEIVED to CLOSED. + */ + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + } #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); + TCPSTATES_DEC(tp->t_state); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); @@ -1009,9 +1602,9 @@ tcp_drain(void) * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially - * usefull. + * useful. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_flags & INP_TIMEWAIT) continue; @@ -1019,10 +1612,17 @@ tcp_drain(void) if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); +#ifdef TCPPCAP + if (tcp_pcap_aggressive_free) { + /* Free the TCP PCAP queues. */ + tcp_pcap_drain(&(tcpb->t_inpkts)); + tcp_pcap_drain(&(tcpb->t_outpkts)); + } +#endif } INP_WUNLOCK(inpb); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -1041,7 +1641,7 @@ tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -1061,6 +1661,10 @@ tcp_notify(struct inpcb *inp, int error) if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { + if (inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { @@ -1093,7 +1697,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { - n = V_tcbinfo.ipi_count + syncache_pcbcount(); + n = V_tcbinfo.ipi_count + + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); @@ -1105,12 +1710,12 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) /* * OK, now we're committed to doing something. 
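When tcp_pcblist() is called without an output buffer it only estimates the required size, now counting embryonic connections via the SYN_RECEIVED state counter instead of walking the syncache, and padding by an eighth (at least 10 entries) against churn between the estimate and the copyout. The arithmetic as a standalone helper:

/* Estimated entries to reserve: live PCBs + embryonic ones + slack. */
static unsigned int
pcblist_estimate(unsigned int pcbs, unsigned int syn_received)
{
	unsigned int n = pcbs + syn_received;
	unsigned int slack = (n / 8 > 10) ? n / 8 : 10;

	return (n + slack);
}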
*/ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); gencnt = V_tcbinfo.ipi_gencnt; n = V_tcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); - m = syncache_pcbcount(); + m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + (n + m) * sizeof(struct xtcpcb)); @@ -1130,10 +1735,8 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -1158,7 +1761,7 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); n = i; error = 0; @@ -1196,14 +1799,14 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); for (i = 0; i < n; i++) { inp = inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); if (!error) { /* @@ -1213,11 +1816,11 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK(&V_tcbinfo); + INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + pcb_count; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_LIST_RUNLOCK(&V_tcbinfo); error = SYSCTL_OUT(req, &xig, sizeof xig); } free(inp_list, M_TEMP); @@ -1354,16 +1957,7 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; - /* - * Redirects don't need to be handled up here. - */ - else if (PRC_IS_REDIRECT(cmd)) - return; - /* - * Source quench is depreciated. - */ - else if (cmd == PRC_QUENCH) - return; + /* * Hostdead is ugly because it goes linearly through all PCBs. * XXX: We never get this from ICMP, otherwise it makes an @@ -1373,75 +1967,79 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) ip = NULL; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; - if (ip != NULL) { - icp = (struct icmp *)((caddr_t)ip - - offsetof(struct icmp, icmp_ip)); - th = (struct tcphdr *)((caddr_t)ip - + (ip->ip_hl << 2)); - INP_INFO_WLOCK(&V_tcbinfo); - inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, - ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); - if (inp != NULL) { - if (!(inp->inp_flags & INP_TIMEWAIT) && - !(inp->inp_flags & INP_DROPPED) && - !(inp->inp_socket == NULL)) { - icmp_tcp_seq = htonl(th->th_seq); - tp = intotcpcb(inp); - if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && - SEQ_LT(icmp_tcp_seq, tp->snd_max)) { - if (cmd == PRC_MSGSIZE) { - /* - * MTU discovery: - * If we got a needfrag set the MTU - * in the route to the suggested new - * value (if given) and then notify. - */ - bzero(&inc, sizeof(inc)); - inc.inc_faddr = faddr; - inc.inc_fibnum = - inp->inp_inc.inc_fibnum; - - mtu = ntohs(icp->icmp_nextmtu); - /* - * If no alternative MTU was - * proposed, try the next smaller - * one. ip->ip_len has already - * been swapped in icmp_input(). 
- */ - if (!mtu) - mtu = ip_next_mtu(ip->ip_len, - 1); - if (mtu < V_tcp_minmss - + sizeof(struct tcpiphdr)) - mtu = V_tcp_minmss - + sizeof(struct tcpiphdr); - /* - * Only cache the MTU if it - * is smaller than the interface - * or route MTU. tcp_mtudisc() - * will do right thing by itself. - */ - if (mtu <= tcp_maxmtu(&inc, NULL)) + + if (ip == NULL) { + in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); + return; + } + + icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); + th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, + th->th_sport, INPLOOKUP_WLOCKPCB, NULL); + if (inp != NULL && PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + inp = (*notify)(inp, EHOSTDOWN); + if (inp != NULL) + INP_WUNLOCK(inp); + } else if (inp != NULL) { + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = ntohl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + mtu = ntohs(icp->icmp_nextmtu); + /* + * If no alternative MTU was + * proposed, try the next smaller + * one. + */ + if (!mtu) + mtu = ip_next_mtu( + ntohs(ip->ip_len), 1); + if (mtu < V_tcp_minmss + + sizeof(struct tcpiphdr)) + mtu = V_tcp_minmss + + sizeof(struct tcpiphdr); + /* + * Only process the offered MTU if it + * is smaller than the current one. + */ + if (mtu < tp->t_maxseg + + sizeof(struct tcpiphdr)) { + bzero(&inc, sizeof(inc)); + inc.inc_faddr = faddr; + inc.inc_fibnum = + inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); - tcp_mtudisc(inp, mtu); - } else - inp = (*notify)(inp, - inetctlerrmap[cmd]); - } + tcp_mtudisc(inp, mtu); + } + } else + inp = (*notify)(inp, + inetctlerrmap[cmd]); } - if (inp != NULL) - INP_WUNLOCK(inp); - } else { - bzero(&inc, sizeof(inc)); - inc.inc_fport = th->th_dport; - inc.inc_lport = th->th_sport; - inc.inc_faddr = faddr; - inc.inc_laddr = ip->ip_src; - syncache_unreach(&inc, th); } - INP_INFO_WUNLOCK(&V_tcbinfo); - } else - in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); + if (inp != NULL) + INP_WUNLOCK(inp); + } else { + bzero(&inc, sizeof(inc)); + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; + syncache_unreach(&inc, th); + } + INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ @@ -1449,75 +2047,146 @@ tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) void tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) { - struct tcphdr th; + struct in6_addr *dst; + struct tcphdr *th; struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct ip6_hdr *ip6; struct mbuf *m; + struct inpcb *inp; + struct tcpcb *tp; + struct icmp6_hdr *icmp6; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; - int off; - struct tcp_portonly { - u_int16_t th_sport; - u_int16_t th_dport; - } *thp; + struct in_conninfo inc; + tcp_seq icmp_tcp_seq; + unsigned int mtu; + unsigned int off; + if (sa->sa_family != AF_INET6 || sa->sa_len != sizeof(struct sockaddr_in6)) return; - if (cmd == PRC_MSGSIZE) - notify = tcp_mtudisc_notify; - else if (!PRC_IS_REDIRECT(cmd) && - ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) - return; - /* 
Source quench is depreciated. */ - else if (cmd == PRC_QUENCH) - return; - /* if the parameter is from icmp6, decode it. */ if (d != NULL) { ip6cp = (struct ip6ctlparam *)d; + icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; sa6_src = ip6cp->ip6c_src; + dst = ip6cp->ip6c_finaldst; } else { m = NULL; ip6 = NULL; off = 0; /* fool gcc */ sa6_src = &sa6_any; + dst = NULL; } - if (ip6 != NULL) { - struct in_conninfo inc; - /* - * XXX: We assume that when IPV6 is non NULL, - * M and OFF are valid. - */ + if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc_notify; + else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && + ip6 != NULL) + notify = tcp_drop_syn_sent; - /* check if we can safely examine src and dst ports */ - if (m->m_pkthdr.len < off + sizeof(*thp)) - return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. + */ + else if (cmd == PRC_HOSTDEAD) + ip6 = NULL; + else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0) + return; - bzero(&th, sizeof(th)); - m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + if (ip6 == NULL) { + in6_pcbnotify(&V_tcbinfo, sa, 0, + (const struct sockaddr *)sa6_src, + 0, cmd, NULL, notify); + return; + } - in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, - (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, NULL, notify); + /* Check if we can safely get the ports from the tcp hdr */ + if (m == NULL || + (m->m_pkthdr.len < + (int32_t) (off + offsetof(struct tcphdr, th_seq)))) { + return; + } + th = (struct tcphdr *) mtodo(ip6cp->ip6c_m, ip6cp->ip6c_off); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, th->th_dport, + &ip6->ip6_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); + if (inp != NULL && PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + inp = (*notify)(inp, EHOSTDOWN); + if (inp != NULL) + INP_WUNLOCK(inp); + } else if (inp != NULL) { + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = ntohl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + mtu = ntohl(icmp6->icmp6_mtu); + /* + * If no alternative MTU was + * proposed, or the proposed + * MTU was too small, set to + * the min. + */ + if (mtu < IPV6_MMTU) + mtu = IPV6_MMTU - 8; + + + bzero(&inc, sizeof(inc)); + inc.inc_fibnum = M_GETFIB(m); + inc.inc_flags |= INC_ISIPV6; + inc.inc6_faddr = *dst; + if (in6_setscope(&inc.inc6_faddr, + m->m_pkthdr.rcvif, NULL)) + goto unlock_inp; + + /* + * Only process the offered MTU if it + * is smaller than the current one. 
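The IPv6 side above needs only a single clamp: an absent or undersized proposal is raised to just below the IPv6 minimum link MTU, the 8 bytes presumably reserved for a fragment header so minimum-MTU links can still carry the segments. A minimal sketch (IPV6_MMTU as in the IPv6 headers; illustrative only, not part of this patch):

#include <stdint.h>

#define IPV6_MMTU	1280	/* IPv6 minimum link MTU */

/* Clamp an ICMPv6 packet-too-big proposal as the code above does. */
static uint32_t
clamp_mtu6(uint32_t icmp6_mtu)
{
	uint32_t mtu = icmp6_mtu;

	if (mtu < IPV6_MMTU)		/* too small or absent */
		mtu = IPV6_MMTU - 8;	/* leave room for a fragment header */
	return (mtu);
}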
+ */ + if (mtu < tp->t_maxseg + + (sizeof (*th) + sizeof (*ip6))) { + tcp_hc_updatemtu(&inc, mtu); + tcp_mtudisc(inp, mtu); + ICMP6STAT_INC(icp6s_pmtuchg); + } + } else + inp = (*notify)(inp, + inet6ctlerrmap[cmd]); + } + } +unlock_inp: + if (inp != NULL) + INP_WUNLOCK(inp); + } else { bzero(&inc, sizeof(inc)); - inc.inc_fport = th.th_dport; - inc.inc_lport = th.th_sport; - inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; - inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; - INP_INFO_WLOCK(&V_tcbinfo); - syncache_unreach(&inc, &th); - INP_INFO_WUNLOCK(&V_tcbinfo); - } else - in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, NULL, notify); + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc6_faddr = *dst; + inc.inc6_laddr = ip6->ip6_src; + syncache_unreach(&inc, th); + } + INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET6 */ @@ -1647,7 +2316,7 @@ tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -1675,10 +2344,11 @@ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { - return (tcp_mtudisc(inp, -1)); + tcp_mtudisc(inp, -1); + return (inp); } -struct inpcb * +static void tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; @@ -1687,7 +2357,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || (inp->inp_flags & INP_DROPPED)) - return (inp); + return; tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); @@ -1708,8 +2378,7 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output(tp); - return (inp); + tp->t_fb->tfb_tcp_output(tp); } #ifdef INET @@ -1722,27 +2391,20 @@ tcp_mtudisc(struct inpcb *inp, int mtuoffer) u_long tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { - struct route sro; - struct sockaddr_in *dst; + struct nhop4_extended nh4; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); - bzero(&sro, sizeof(sro)); if (inc->inc_faddr.s_addr != INADDR_ANY) { - dst = (struct sockaddr_in *)&sro.ro_dst; - dst->sin_family = AF_INET; - dst->sin_len = sizeof(*dst); - dst->sin_addr = inc->inc_faddr; - in_rtalloc_ign(&sro, 0, inc->inc_fibnum); - } - if (sro.ro_rt != NULL) { - ifp = sro.ro_rt->rt_ifp; - if (sro.ro_rt->rt_rmx.rmx_mtu == 0) - maxmtu = ifp->if_mtu; - else - maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + if (fib4_lookup_nh_ext(inc->inc_fibnum, inc->inc_faddr, + NHR_REF, 0, &nh4) != 0) + return (0); + + ifp = nh4.nh_ifp; + maxmtu = nh4.nh_mtu; /* Report additional interface capabilities. 
*/ if (cap != NULL) { @@ -1754,7 +2416,7 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } - RTFREE(sro.ro_rt); + fib4_free_nh_ext(inc->inc_fibnum, &nh4); } return (maxmtu); } @@ -1764,26 +2426,22 @@ tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) u_long tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { - struct route_in6 sro6; + struct nhop6_extended nh6; + struct in6_addr dst6; + uint32_t scopeid; struct ifnet *ifp; u_long maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); - bzero(&sro6, sizeof(sro6)); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { - sro6.ro_dst.sin6_family = AF_INET6; - sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); - sro6.ro_dst.sin6_addr = inc->inc6_faddr; - in6_rtalloc_ign(&sro6, 0, inc->inc_fibnum); - } - if (sro6.ro_rt != NULL) { - ifp = sro6.ro_rt->rt_ifp; - if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) - maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); - else - maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, - IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); + if (fib6_lookup_nh_ext(inc->inc_fibnum, &dst6, scopeid, 0, + 0, &nh6) != 0) + return (0); + + ifp = nh6.nh_ifp; + maxmtu = nh6.nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { @@ -1795,13 +2453,66 @@ tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } - RTFREE(sro6.ro_rt); + fib6_free_nh_ext(inc->inc_fibnum, &nh6); } return (maxmtu); } #endif /* INET6 */ +/* + * Calculate effective SMSS per RFC5681 definition for a given TCP + * connection at its current state, taking into account SACK, etc. + */ +u_int +tcp_maxseg(const struct tcpcb *tp) +{ + u_int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have simplified code from tcp_addoptions(), + * without a proper loop, and with most of the padding hardcoded. + * We might make mistakes with padding here in some edge cases, + * but this is harmless, since the result of tcp_maxseg() is used + * only in cwnd and ssthresh estimations. + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { + optlen += TCPOLEN_SACKHDR; + optlen += tp->rcv_numsacks * TCPOLEN_SACK; + optlen = PAD(optlen); + } + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); } + #ifdef IPSEC /* compute ESP/AH header size for TCP, including outer IP header. 
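As a worked example of tcp_maxseg() above: a typical established connection that negotiated only timestamps carries TCPOLEN_TSTAMP_APPA = 12 option bytes per segment, so a t_maxseg of 1460 yields an effective SMSS of 1448. The PAD rounding can be checked in isolation (userland restatement, constants copied from the code above; not part of this patch):

#include <assert.h>

#define TCPOLEN_TSTAMP_APPA	12	/* timestamp option incl. padding */
#define PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)

int
main(void)
{
	unsigned int t_maxseg = 1460;
	unsigned int optlen = TCPOLEN_TSTAMP_APPA;

	assert(PAD(10) == 12);			/* rounds up to 4-byte multiple */
	assert(PAD(12) == 12);			/* already aligned: unchanged */
	assert(t_maxseg - optlen == 1448);	/* effective SMSS */
	return (0);
}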
*/ size_t @@ -1816,9 +2527,10 @@ ipsec_hdrsiz_tcp(struct tcpcb *tp) #endif struct tcphdr *th; - if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) + if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL) || + (!key_havesp(IPSEC_DIR_OUTBOUND))) return (0); - MGETHDR(m, M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) return (0); @@ -1859,55 +2571,20 @@ tcp_signature_apply(void *fstate, void *data, u_int len) } /* - * Compute TCP-MD5 hash of a TCP segment. (RFC2385) - * - * Parameters: - * m pointer to head of mbuf chain - * _unused - * len length of TCP segment data, excluding options - * optlen length of TCP segment options - * buf pointer to storage for computed MD5 digest - * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) - * - * We do this over ip, tcphdr, segment data, and the key in the SADB. - * When called from tcp_input(), we can be sure that th_sum has been - * zeroed out and verified already. - * - * Return 0 if successful, otherwise return -1. - * * XXX The key is retrieved from the system's PF_KEY SADB, by keying a * search with the destination IP address, and a 'magic SPI' to be * determined by the application. This is hardcoded elsewhere to 1179 - * right now. Another branch of this code exists which uses the SPD to - * specify per-application flows but it is unstable. - */ -int -tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, - u_char *buf, u_int direction) +*/ +struct secasvar * +tcp_get_sav(struct mbuf *m, u_int direction) { union sockaddr_union dst; -#ifdef INET - struct ippseudo ippseudo; -#endif - MD5_CTX ctx; - int doff; - struct ip *ip; -#ifdef INET - struct ipovly *ipovly; -#endif struct secasvar *sav; - struct tcphdr *th; + struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; - struct in6_addr in6; char ip6buf[INET6_ADDRSTRLEN]; - uint32_t plen; - uint16_t nhdr; #endif - u_short savecsum; - - KASSERT(m != NULL, ("NULL mbuf chain")); - KASSERT(buf != NULL, ("NULL signature pointer")); /* Extract the destination from the IP header in the mbuf. */ bzero(&dst, sizeof(union sockaddr_union)); @@ -1934,7 +2611,7 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, break; #endif default: - return (EINVAL); + return (NULL); /* NOTREACHED */ break; } @@ -1949,9 +2626,61 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : #endif "(unsupported)")); - return (EINVAL); } + return (sav); +} + +/* + * Compute TCP-MD5 hash of a TCP segment. (RFC2385) + * + * Parameters: + * m pointer to head of mbuf chain + * len length of TCP segment data, excluding options + * optlen length of TCP segment options + * buf pointer to storage for computed MD5 digest + * sav pointer to security association + * + * We do this over ip, tcphdr, segment data, and the key in the SADB. + * When called from tcp_input(), we can be sure that th_sum has been + * zeroed out and verified already. + * + * Releases reference to SADB key before return. + * + * Return 0 if successful, otherwise return -1. 
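The split above separates key lookup from hashing so a caller can fetch the security association once and reuse it, which syncache_respond() does later in this patch. The retained tcp_signature_compute() wrapper is exactly this composition; a hand-rolled caller would take the same shape (kernel-context sketch using only the functions introduced above; not runnable in isolation):

/*
 * Sketch: hash a segment only when an SADB key exists, as the
 * wrapper tcp_signature_compute() does internally.
 */
static int
sign_segment(struct mbuf *m, int len, int optlen, u_char *digest)
{
	struct secasvar *sav;

	sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND);
	if (sav == NULL)
		return (-1);	/* no key: caller decides what to do */
	/* Consumes the SA reference before returning. */
	return (tcp_signature_do_compute(m, len, optlen, digest, sav));
}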
+ * + */ +int +tcp_signature_do_compute(struct mbuf *m, int len, int optlen, + u_char *buf, struct secasvar *sav) +{ +#ifdef INET + struct ippseudo ippseudo; +#endif + MD5_CTX ctx; + int doff; + struct ip *ip; +#ifdef INET + struct ipovly *ipovly; +#endif + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6; + struct in6_addr in6; + uint32_t plen; + uint16_t nhdr; +#endif + u_short savecsum; + + KASSERT(m != NULL, ("NULL mbuf chain")); + KASSERT(buf != NULL, ("NULL signature pointer")); + + /* Extract the destination from the IP header in the mbuf. */ + ip = mtod(m, struct ip *); +#ifdef INET6 + ip6 = NULL; /* Make the compiler happy. */ +#endif + MD5Init(&ctx); /* * Step 1: Update MD5 hash with IP(v6) pseudo-header. @@ -2008,7 +2737,8 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, break; #endif default: - return (EINVAL); + KEY_FREESAV(&sav); + return (-1); /* NOTREACHED */ break; } @@ -2041,6 +2771,23 @@ tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, return (0); } +/* + * Compute TCP-MD5 hash of a TCP segment. (RFC2385) + * + * Return 0 if successful, otherwise return -1. + */ +int +tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, + u_char *buf, u_int direction) +{ + struct secasvar *sav; + + if ((sav = tcp_get_sav(m, direction)) == NULL) + return (-1); + + return (tcp_signature_do_compute(m, len, optlen, buf, sav)); +} + /* * Verify the TCP-MD5 hash of a TCP segment. (RFC2385) * @@ -2170,7 +2917,7 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) default: return (EINVAL); } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -2209,12 +2956,12 @@ sysctl_drop(SYSCTL_HANDLER_ARGS) INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } -SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_DROP, drop, - CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, + CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP, NULL, 0, sysctl_drop, "", "Drop TCP connection"); /* @@ -2332,3 +3079,21 @@ tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, panic("%s: string too long", __func__); return (s); } + +/* + * A subroutine which makes it easy to track TCP state changes with DTrace. + * This function shouldn't be called for t_state initializations that don't + * correspond to actual TCP state transitions. + */ +void +tcp_state_change(struct tcpcb *tp, int newstate) +{ +#if defined(KDTRACE_HOOKS) + int pstate = tp->t_state; +#endif + + TCPSTATES_DEC(tp->t_state); + TCPSTATES_INC(newstate); + tp->t_state = newstate; + TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); +} diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c index 10bd00ae..d7da3a01 100644 --- a/freebsd/sys/netinet/tcp_syncache.c +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -2,13 +2,13 @@ /*- * Copyright (c) 2001 McAfee, Inc. - * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG + * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include #include #include #include @@ -49,7 +51,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include /* for proc0 declaration */ #include #include @@ -57,9 +58,13 @@ __FBSDID("$FreeBSD$"); #include #include +#include +#include + #include #include +#include #include #include @@ -78,6 +83,9 @@ __FBSDID("$FreeBSD$"); #include #endif #include +#ifdef TCP_RFC7413 +#include +#endif #include #include #include @@ -104,13 +112,13 @@ __FBSDID("$FreeBSD$"); static VNET_DEFINE(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); static VNET_DEFINE(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); @@ -121,20 +129,27 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); -static int syncache_respond(struct syncache *); +static int syncache_respond(struct syncache *, struct syncache_head *, int, + const struct mbuf *); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); -static int syncache_sysctl_count(SYSCTL_HANDLER_ARGS); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); -static void syncookie_generate(struct syncache_head *, struct syncache *, - u_int32_t *); + +static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, + uint8_t *, uintptr_t); +static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, - struct syncache *, struct tcpopt *, struct tcphdr *, + struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); +static void syncookie_reseed(void *); +#ifdef INVARIANTS +static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso); +#endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. 
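The sysctl changes scattered through this file are one mechanical conversion: the SYSCTL_VNET_* macro family is retired and per-VNET behavior is requested with a CTLFLAG_VNET flag on the ordinary macros. The shape of the conversion, shown on one of the knobs above (call sites only; macro internals elided, illustration rather than a new change):

/* Before: dedicated VNET macro family. */
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
    &VNET_NAME(tcp_syncookies), 0,
    "Use TCP SYN cookies if the syncache overflows");

/* After: one macro family; virtualization is just another flag. */
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_syncookies), 0,
    "Use TCP SYN cookies if the syncache overflows");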
@@ -153,54 +168,32 @@ static VNET_DEFINE(struct tcp_syncache, tcp_syncache); static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); -SYSCTL_VNET_PROC(_net_inet_tcp_syncache, OID_AUTO, count, (CTLTYPE_UINT|CTLFLAG_RD), - NULL, 0, &syncache_sysctl_count, "IU", - "Current number of entries in syncache"); +SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, + &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); -SYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, +SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; -SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, - CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, +SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); -#define SYNCACHE_HASH(inc, mask) \ - ((V_tcp_syncache.hash_secret ^ \ - (inc)->inc_faddr.s_addr ^ \ - ((inc)->inc_faddr.s_addr >> 16) ^ \ - (inc)->inc_fport ^ (inc)->inc_lport) & mask) - -#define SYNCACHE_HASH6(inc, mask) \ - ((V_tcp_syncache.hash_secret ^ \ - (inc)->inc6_faddr.s6_addr32[0] ^ \ - (inc)->inc6_faddr.s6_addr32[3] ^ \ - (inc)->inc_fport ^ (inc)->inc_lport) & mask) - -#define ENDPTS_EQ(a, b) ( \ - (a)->ie_fport == (b)->ie_fport && \ - (a)->ie_lport == (b)->ie_lport && \ - (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ - (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ -) - -#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) - #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) @@ -254,17 +247,19 @@ syncache_init(void) V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); - /* Initialize the hash buckets. */ - for (i = 0; i < V_tcp_syncache.hashsize; i++) { #ifdef VIMAGE - V_tcp_syncache.hashbase[i].sch_vnet = curvnet; + V_tcp_syncache.vnet = curvnet; #endif + + /* Initialize the hash buckets. */ + for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; + V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; } /* Create the syncache entry zone. 
*/ @@ -272,6 +267,13 @@ syncache_init(void) NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); + + /* Start the SYN cookie reseeder callout. */ + callout_init(&V_tcp_syncache.secret.reseed, 1); + arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); + arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); + callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, + syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE @@ -282,6 +284,12 @@ syncache_destroy(void) struct syncache *sc, *nsc; int i; + /* + * Stop the re-seed timer before freeing resources. No need to + * possibly schedule it another time. + */ + callout_drain(&V_tcp_syncache.secret.reseed); + /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { @@ -308,15 +316,6 @@ syncache_destroy(void) } #endif -static int -syncache_sysctl_count(SYSCTL_HANDLER_ARGS) -{ - int count; - - count = uma_zone_get_cur(V_tcp_syncache.zone); - return (sysctl_handle_int(oidp, &count, 0, req)); -} - /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. @@ -359,6 +358,7 @@ syncache_insert(struct syncache *sc, struct syncache_head *sch) SCH_UNLOCK(sch); + TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_added); } @@ -372,6 +372,7 @@ syncache_drop(struct syncache *sc, struct syncache_head *sch) SCH_LOCK_ASSERT(sch); + TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; @@ -393,7 +394,7 @@ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + - TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]); + TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; @@ -416,7 +417,7 @@ syncache_timer(void *xsch) int tick = ticks; char *s; - CURVNET_SET(sch->sch_vnet); + CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); @@ -459,7 +460,7 @@ syncache_timer(void *xsch) free(s, M_TCPLOG); } - (void) syncache_respond(sc); + syncache_respond(sc, sch, 1, NULL); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } @@ -473,46 +474,34 @@ syncache_timer(void *xsch) * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ -struct syncache * +static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; + uint32_t hash; -#ifdef INET6 - if (inc->inc_flags & INC_ISIPV6) { - sch = &V_tcp_syncache.hashbase[ - SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; - *schp = sch; - - SCH_LOCK(sch); + /* + * The hash is built on foreign port + local port + foreign address. + * We rely on the fact that struct in_conninfo starts with 16 bits + * of foreign port, then 16 bits of local port then followed by 128 + * bits of foreign address. In case of IPv4 address, the first 3 + * 32-bit words of the address always are zeroes. + */ + hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, + V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; - /* Circle through bucket row to find matching entry. 
*/ - TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { - if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) - return (sc); - } - } else -#endif - { - sch = &V_tcp_syncache.hashbase[ - SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; - *schp = sch; + sch = &V_tcp_syncache.hashbase[hash]; + *schp = sch; + SCH_LOCK(sch); - SCH_LOCK(sch); + /* Circle through bucket row to find matching entry. */ + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) + if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, + sizeof(struct in_endpoints)) == 0) + break; - /* Circle through bucket row to find matching entry. */ - TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { -#ifdef INET6 - if (sc->sc_inc.inc_flags & INC_ISIPV6) - continue; -#endif - if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) - return (sc); - } - } - SCH_LOCK_ASSERT(*schp); - return (NULL); /* always returns with locked sch */ + return (sc); /* Always returns with locked sch. */ } /* @@ -644,17 +633,20 @@ done: /* * Build a new TCP socket structure from a syncache entry. + * + * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { + struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ - so = sonewconn(lso, SS_ISCONNECTED); + so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or @@ -685,6 +677,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); + /* + * Exclusive pcbinfo lock is not required in syncache socket case even + * if two inpcb locks can be acquired simultaneously: + * - the inpcb in LISTEN state, + * - the newly created inp. + * + * In this case, an inp cannot be at the same time in LISTEN state and + * just created by an accept() call. + */ INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ @@ -701,6 +702,15 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) } #endif + /* + * If there's an mbuf and it has a flowid, then let's initialise the + * inp with that particular flowid. + */ + if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { + inp->inp_flowid = m->m_pkthdr.flowid; + inp->inp_flowtype = M_HASHTYPE_GET(m); + } + /* * Install in the reservation hash table for now, but don't yet * install a connection group since the full 4-tuple isn't yet @@ -824,11 +834,31 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) #endif /* INET */ INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); - tp->t_state = TCPS_SYN_RECEIVED; + tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); + blk = sototcpcb(lso)->t_fb; + if (blk != tp->t_fb) { + /* + * Our parent's t_fb was not the default, + * we need to release our ref on tp->t_fb and + * pick up one on the new entry. 
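The branch that continues just below swaps the new tcpcb onto the listener's TCP function block with a strict ordering: pin the replacement first, run the old block's fini hook, drop the old reference, then run the new block's init hook, so neither block can disappear while its hooks still run. A generic standalone sketch of that ordering (types and names are invented stand-ins, not the kernel's):

#include <stdatomic.h>
#include <stddef.h>

struct fblock {
	atomic_int refcnt;
	void (*init)(void *);
	void (*fini)(void *, int);
};

/*
 * Swap *slot from its current block to nblk, never letting either
 * refcount reach zero while hooks are executing.
 */
static void
fblock_swap(struct fblock **slot, struct fblock *nblk, void *obj)
{
	struct fblock *oblk = *slot;

	atomic_fetch_add(&nblk->refcnt, 1);	/* pin the new block */
	if (oblk->fini != NULL)
		oblk->fini(obj, 0);		/* tear down old state */
	atomic_fetch_sub(&oblk->refcnt, 1);	/* drop old reference */
	*slot = nblk;
	if (nblk->init != NULL)
		nblk->init(obj);		/* set up new state */
}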
+ */ + struct tcp_function_block *rblk; + + rblk = find_and_ref_tcp_fb(blk); + KASSERT(rblk != NULL, + ("cannot find blk %p out of syncache?", blk)); + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_fb = rblk; + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } + } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; @@ -898,7 +928,7 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - INP_WUNLOCK(inp); + soisconnected(so); TCPSTAT_INC(tcps_accepts); return (so); @@ -917,6 +947,9 @@ abort2: * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. + * + * On syncache_socket() success the newly created socket + * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, @@ -931,12 +964,22 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); + +#ifdef INVARIANTS + /* + * Test code for syncookies comparing the syncache stored + * values with the reconstructed values from the cookie. + */ + if (sc != NULL) + syncookie_cmp(inc, sch, sc, th, to, *lsop); +#endif + if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * goto failed; } bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); + sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) goto failed; } } else { - /* Pull out the entry to unlock the bucket row. */ + /* + * Pull out the entry to unlock the bucket row. + * + * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not + * tcp_state_change(). The tcpcb does not exist at this + * moment. A new one will be allocated via syncache_socket-> + * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then + * syncache_socket() will change it to TCPS_SYN_RECEIVED. + */ + TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD @@ -1002,12 +1054,32 @@ syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto failed; } + /* + * If timestamps were not negotiated during SYN/ACK they + * must not appear on any segment during this session. + */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } + + /* + * If timestamps were negotiated during SYN/ACK they should + * appear on every segment during this session. + * XXXAO: This is only informal as there have been unverified + * reports of non-compliant stacks. 
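The two timestamp rules above and just below are deliberately asymmetric: an unexpected timestamp is a hard reject, while a missing one on a timestamp-negotiated session is only logged, to tolerate the non-compliant stacks just mentioned. Condensed into a decision function (hypothetical names; sketch only, not part of this patch):

enum ts_verdict { TS_OK, TS_LOG_ONLY, TS_REJECT };

/*
 * negotiated: SCF_TIMESTAMP was set at SYN time;
 * present: the incoming ACK carries TOF_TS.
 */
static enum ts_verdict
check_timestamps(int negotiated, int present)
{
	if (!negotiated && present)
		return (TS_REJECT);	/* must not appear: drop segment */
	if (negotiated && !present)
		return (TS_LOG_ONLY);	/* should appear: log, no action */
	return (TS_OK);
}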
+ */ + if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Timestamp missing, " + "no action\n", s, __func__); + free(s, M_TCPLOG); + s = NULL; + } + } + /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. @@ -1040,6 +1112,39 @@ failed: return (0); } +#ifdef TCP_RFC7413 +static void +syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, + uint64_t response_cookie) +{ + struct inpcb *inp; + struct tcpcb *tp; + unsigned int *pending_counter; + + /* + * Global TCP locks are held because we manipulate the PCB lists + * and create a new socket. + */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; + *lsop = syncache_socket(sc, *lsop, m); + if (*lsop == NULL) { + TCPSTAT_INC(tcps_sc_aborted); + atomic_subtract_int(pending_counter, 1); + } else { + inp = sotoinpcb(*lsop); + tp = intotcpcb(inp); + tp->t_flags |= TF_FASTOPEN; + tp->t_tfo_cookie = response_cookie; + tp->snd_max = tp->iss; + tp->snd_nxt = tp->iss; + tp->t_tfo_pending = pending_counter; + TCPSTAT_INC(tcps_sc_completed); + } +} +#endif /* TCP_RFC7413 */ + /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: @@ -1052,9 +1157,16 @@ failed: * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. + * + * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) + * cookie is processed, V_tcp_fastopen_enabled set to true, and the + * TCP_FASTOPEN socket option is set. In this case, a new socket is created + * and returned via lsop, the mbuf is not freed so that tcp_input() can + * queue its data to the socket, and 1 is returned to indicate the + * TFO-socket-creation path was taken. */ -static void -_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, +int +syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { @@ -1063,10 +1175,10 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; - u_int32_t flowtmp; u_int ltflags; int win, sb_hiwat, ip_ttl, ip_tos; char *s; + int rv = 0; #ifdef INET6 int autoflowlabel = 0; #endif @@ -1075,8 +1187,12 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, #endif struct syncache scs; struct ucred *cred; +#ifdef TCP_RFC7413 + uint64_t tfo_response_cookie; + int tfo_cookie_valid = 0; + int tfo_response_cookie_valid = 0; +#endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); @@ -1100,6 +1216,29 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sb_hiwat = so->so_rcv.sb_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); +#ifdef TCP_RFC7413 + if (V_tcp_fastopen_enabled && (tp->t_flags & TF_FASTOPEN) && + (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { + /* + * Limit the number of pending TFO connections to + * approximately half of the queue limit. 
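The admission check described above (its code follows just below) is a reserve-then-roll-back counter: bump optimistically, and undo the bump when over the cap so the pending count never leaks. The same pattern with C11 atomics (the kernel's atomic_fetchadd_int() likewise returns the old value; names here are stand-ins):

#include <stdatomic.h>

static atomic_int tfo_pending;

/* Try to admit one more pending TFO connection under the cap. */
static int
tfo_admit(int qlimit)
{
	if (atomic_fetch_add(&tfo_pending, 1) <= qlimit / 2)
		return (1);			/* admitted; slot held */
	atomic_fetch_sub(&tfo_pending, 1);	/* over cap: roll back */
	return (0);
}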
This prevents TFO + * SYN floods from starving the service by filling the + * listen queue with bogus TFO connections. + */ + if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= + (so->so_qlimit / 2)) { + int result; + + result = tcp_fastopen_check_cookie(inc, + to->to_tfo_cookie, to->to_tfo_len, + &tfo_response_cookie); + tfo_cookie_valid = (result > 0); + tfo_response_cookie_valid = (result >= 0); + } else + atomic_subtract_int(tp->t_tfo_pending, 1); + } +#endif + /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; @@ -1107,13 +1246,14 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); goto done; } else mac_syncache_create(maclabel, inp); #endif - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); +#ifdef TCP_RFC7413 + if (!tfo_cookie_valid) +#endif + INP_WUNLOCK(inp); /* * Remember the IP options, if any. @@ -1142,6 +1282,10 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { +#ifdef TCP_RFC7413 + if (tfo_cookie_valid) + INP_WUNLOCK(inp); +#endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* @@ -1174,7 +1318,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, s, __func__); free(s, M_TCPLOG); } - if (syncache_respond(sc) == 0) { + if (syncache_respond(sc, sch, 1, m) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); @@ -1184,6 +1328,14 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, goto done; } +#ifdef TCP_RFC7413 + if (tfo_cookie_valid) { + bzero(&scs, sizeof(scs)); + sc = &scs; + goto skip_alloc; + } +#endif + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* @@ -1207,7 +1359,13 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } } } - + +#ifdef TCP_RFC7413 +skip_alloc: + if (!tfo_cookie_valid && tfo_response_cookie_valid) + sc->sc_tfo_cookie = &tfo_response_cookie; +#endif + /* * Fill in the syncache values. */ @@ -1271,7 +1429,7 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out - * for the compatiblity problems mentioned above. + * for the compatibility problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. @@ -1286,11 +1444,9 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } #ifdef TCP_SIGNATURE /* - * If listening socket requested TCP digests, and received SYN + * If listening socket requested TCP digests, OR received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. - * XXX: Currently we always record the option by default and will - * attempt to use it in syncache_respond(). 
*/ if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; @@ -1304,25 +1460,32 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; - if (V_tcp_syncookies) { - syncookie_generate(sch, sc, &flowtmp); + if (V_tcp_syncookies) + sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 - if (autoflowlabel) - sc->sc_flowlabel = flowtmp; -#endif - } else { -#ifdef INET6 - if (autoflowlabel) - sc->sc_flowlabel = - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); -#endif + if (autoflowlabel) { + if (V_tcp_syncookies) + sc->sc_flowlabel = sc->sc_iss; + else + sc->sc_flowlabel = ip6_randomflowlabel(); + sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } +#endif SCH_UNLOCK(sch); +#ifdef TCP_RFC7413 + if (tfo_cookie_valid) { + syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); + /* INP_WUNLOCK(inp) will be performed by the callee */ + rv = 1; + goto tfo_done; + } +#endif + /* * Do a standard 3-way handshake. */ - if (syncache_respond(sc) == 0) { + if (syncache_respond(sc, sch, 0, m) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) @@ -1336,21 +1499,29 @@ _syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, } done: + if (m) { + *lsop = NULL; + m_freem(m); + } +#ifdef TCP_RFC7413 +tfo_done: +#endif if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif - if (m) { - - *lsop = NULL; - m_freem(m); - } + return (rv); } +/* + * Send SYN|ACK to the peer. Either in response to the peer's SYN, + * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. + */ static int -syncache_respond(struct syncache *sc) +syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked, + const struct mbuf *m0) { struct ip *ip = NULL; struct mbuf *m; @@ -1361,6 +1532,9 @@ syncache_respond(struct syncache *sc) #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif +#ifdef TCP_SIGNATURE + struct secasvar *sav; +#endif hlen = #ifdef INET6 @@ -1379,7 +1553,7 @@ syncache_respond(struct syncache *sc) ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC @@ -1413,7 +1587,7 @@ syncache_respond(struct syncache *sc) ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; - ip->ip_len = tlen; + ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; @@ -1431,7 +1605,7 @@ syncache_respond(struct syncache *sc) * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); th = (struct tcphdr *)(ip + 1); } @@ -1471,8 +1645,39 @@ syncache_respond(struct syncache *sc) if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE - if (sc->sc_flags & SCF_SIGNATURE) - to.to_flags |= TOF_SIGNATURE; + sav = NULL; + if (sc->sc_flags & SCF_SIGNATURE) { + sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND); + if (sav != NULL) + to.to_flags |= TOF_SIGNATURE; + else { + + /* + * We've got SCF_SIGNATURE flag + * inherited from listening socket, + * but no SADB key for given source + * address. Assume signature is not + * required and remove signature flag + * instead of silently dropping + * connection. 
+ */ + if (locked == 0) + SCH_LOCK(sch); + sc->sc_flags &= ~SCF_SIGNATURE; + if (locked == 0) + SCH_UNLOCK(sch); + } + } +#endif + +#ifdef TCP_RFC7413 + if (sc->sc_tfo_cookie) { + to.to_flags |= TOF_FASTOPEN; + to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; + to.to_tfo_cookie = sc->sc_tfo_cookie; + /* don't send cookie again when retransmitting response */ + sc->sc_tfo_cookie = NULL; + } #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); @@ -1483,20 +1688,29 @@ syncache_respond(struct syncache *sc) #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) - tcp_signature_compute(m, 0, 0, optlen, - to.to_signature, IPSEC_DIR_OUTBOUND); + tcp_signature_do_compute(m, 0, optlen, + to.to_signature, sav); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif - ip->ip_len += optlen; + ip->ip_len = htons(ntohs(ip->ip_len) + optlen); } else optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + /* + * If we have peer's SYN and it has a flowid, then let's assign it to + * our SYN|ACK. ip6_output() and ip_output() will not assign flowid + * to SYN|ACK due to lack of inp here. + */ + if (m0 != NULL && M_HASHTYPE_GET(m0) != M_HASHTYPE_NONE) { + m->m_pkthdr.flowid = m0->m_pkthdr.flowid; + M_HASHTYPE_SET(m, M_HASHTYPE_GET(m0)); + } #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; @@ -1538,292 +1752,379 @@ syncache_respond(struct syncache *sc) return (error); } -void -syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m) -{ - _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); -} - -void -tcp_offload_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, void *tod, void *todctx) -{ - - _syncache_add(inc, to, th, inp, lsop, NULL, tod, todctx); -} /* - * The purpose of SYN cookies is to avoid keeping track of all SYN's we - * receive and to be able to handle SYN floods from bogus source addresses - * (where we will never receive any reply). SYN floods try to exhaust all - * our memory and available slots in the SYN cache table to cause a denial - * of service to legitimate users of the local host. + * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks + * that exceed the capacity of the syncache by avoiding the storage of any + * of the SYNs we receive. Syncookies defend against blind SYN flooding + * attacks where the attacker does not have access to our responses. + * + * Syncookies encode and include all necessary information about the + * connection setup within the SYN|ACK that we send back. That way we + * can avoid keeping any local state until the ACK to our SYN|ACK returns + * (if ever). Normally the syncache and syncookies are running in parallel + * with the latter taking over when the former is exhausted. When matching + * syncache entry is found the syncookie is ignored. * - * The idea of SYN cookies is to encode and include all necessary information - * about the connection setup state within the SYN-ACK we send back and thus - * to get along without keeping any local state until the ACK to the SYN-ACK - * arrives (if ever). Everything we need to know should be available from - * the information we encoded in the SYN-ACK. + * The only reliable information persisting the 3WHS is our initial sequence + * number ISS of 32 bits. 
Syncookies embed a cryptographically sufficiently + * strong hash (MAC) value and a few bits of TCP SYN options in the ISS + * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK + * returns and signifies a legitimate connection if it matches the ACK. * - * More information about the theory behind SYN cookies and its first - * discussion and specification can be found at: - * http://cr.yp.to/syncookies.html (overview) - * http://cr.yp.to/syncookies/archive (gory details) + * The available space of 32 bits to store the hash and to encode the SYN + * option information is very tight and we should have at least 24 bits for + * the MAC to keep the number of guesses by blind spoofing reasonably high. * - * This implementation extends the orginal idea and first implementation - * of FreeBSD by using not only the initial sequence number field to store - * information but also the timestamp field if present. This way we can - * keep track of the entire state we need to know to recreate the session in - * its original form. Almost all TCP speakers implement RFC1323 timestamps - * these days. For those that do not we still have to live with the known - * shortcomings of the ISN only SYN cookies. + * SYN option information we have to encode to fully restore a connection: + * MSS: is important to choose an optimal segment size to avoid IP level + * fragmentation along the path. The common MSS values can be encoded + * in a 3-bit table. Uncommon values are captured by the next lower value + * in the table leading to a slight increase in packetization overhead. + * WSCALE: is necessary to allow large windows to be used for high delay- + * bandwidth product links. Not scaling the window when it was initially + * negotiated is bad for performance as lack of scaling further decreases + * the apparent available send window. We only need to encode the WSCALE + * we received from the remote end. Our end can be recalculated at any + * time. The common WSCALE values can be encoded in a 3-bit table. + * Uncommon values are captured by the next lower value in the table + * making us under-estimate the available window size halving our + * theoretically possible maximum throughput for that connection. + * SACK: Greatly assists in packet loss recovery and requires 1 bit. + * TIMESTAMP and SIGNATURE are not encoded because they are permanent options + * that are included in all segments on a connection. We enable them when + * the ACK has them. * - * Cookie layers: + * Security of syncookies and attack vectors: * - * Initial sequence number we send: - * 31|................................|0 - * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP - * D = MD5 Digest (first dword) - * M = MSS index - * R = Rotation of secret - * P = Odd or Even secret + * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) + * together with the global secret to make it unique per connection attempt. + * Thus any change of any of those parameters results in a different MAC output + * in an unpredictable way unless a collision is encountered. 24 bits of the + * MAC are embedded into the ISS. * - * The MD5 Digest is computed with over following parameters: - * a) randomly rotated secret - * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) - * c) the received initial sequence number from remote host - * d) the rotation offset and odd/even bit + * To prevent replay attacks two rotating global secrets are updated with a + * new random value every 15 seconds. 
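The ISS construction shown in the schematic further below packs a truncated 24-bit MAC into the upper bits and XORs the 8 option-flag bits with the top MAC byte so they never appear raw on the wire; the receiver recovers the flags from the ACKed sequence number alone. A standalone round-trip demo (the MAC value is faked, since only the bit packing is being illustrated; not part of this patch):

#include <assert.h>
#include <stdint.h>

static uint32_t
iss_pack(uint32_t mac, uint8_t flags)
{
	/* High 24 bits: MAC. Low 8 bits: flags ^ top MAC byte. */
	return ((mac & ~0xffu) | (flags ^ (uint8_t)(mac >> 24)));
}

static uint8_t
iss_unpack_flags(uint32_t iss)
{
	/* Inverse: the top byte of iss equals the top MAC byte. */
	return ((uint8_t)((iss & 0xff) ^ (iss >> 24)));
}

int
main(void)
{
	uint32_t mac = 0xdeadbeef;	/* stand-in for syncookie_mac() */
	uint8_t flags = 0x5a;		/* WWWMMMSP cookie bits */
	uint32_t iss = iss_pack(mac, flags);

	assert(iss_unpack_flags(iss) == flags);		/* flags survive */
	assert((iss & ~0xffu) == (mac & ~0xffu));	/* MAC survives */
	return (0);
}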
The life-time of a syncookie is thus + * 15-30 seconds. * - * Timestamp we send: - * 31|................................|0 - * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 - * D = MD5 Digest (third dword) (only as filler) - * S = Requested send window scale - * R = Requested receive window scale - * A = SACK allowed - * 5 = TCP-MD5 enabled (not implemented yet) - * XORed with MD5 Digest (forth dword) + * Vector 1: Attacking the secret. This requires finding a weakness in the + * MAC itself or the way it is used here. The attacker can do a chosen plain + * text attack by varying and testing all the parameters under his control. + * The strength depends on the size and randomness of the secret, and the + * cryptographic security of the MAC function. Due to the constant updating + * of the secret the attacker has at most 29.999 seconds to find the secret + * and launch spoofed connections. After that he has to start all over again. * - * The timestamp isn't cryptographically secure and doesn't need to be. - * The double use of the MD5 digest dwords ties it to a specific remote/ - * local host/port, remote initial sequence number and our local time - * limited secret. A received timestamp is reverted (XORed) and then - * the contained MD5 dword is compared to the computed one to ensure the - * timestamp belongs to the SYN-ACK we sent. The other parameters may - * have been tampered with but this isn't different from supplying bogus - * values in the SYN in the first place. + * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC + * size an average of 4,823 attempts are required for a 50% chance of success + * to spoof a single syncookie (birthday collision paradox). However the + * attacker is blind and doesn't know if one of his attempts succeeded unless + * he has a side channel to infer success from. A single connection setup + * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. + * This many attempts are required for each blind spoofed connection. For + * every additional spoofed connection he has to launch another N attempts. + * Thus for a sustained rate of 100 spoofed connections per second approximately + * 1,800,000 packets per second would have to be sent. * - * Some problems with SYN cookies remain however: - * Consider the problem of a recreated (and retransmitted) cookie. If the - * original SYN was accepted, the connection is established. The second - * SYN is inflight, and if it arrives with an ISN that falls within the - * receive window, the connection is killed. + * NB: The MAC function should be fast so that it doesn't become a CPU + * exhaustion attack vector itself. * - * Notes: - * A heuristic to determine when to accept syn cookies is not necessary. - * An ACK flood would cause the syncookie verification to be attempted, - * but a SYN flood causes syncookies to be generated. Both are of equal - * cost, so there's no point in trying to optimize the ACK flood case. - * Also, if you don't process certain ACKs for some reason, then all someone - * would have to do is launch a SYN and ACK flood at the same time, which - * would stop cookie verification and defeat the entire purpose of syncookies. + * References: + * RFC4987 TCP SYN Flooding Attacks and Common Mitigations + * SYN cookies were first proposed by cryptographer Dan J. 
Bernstein in 1996 + * http://cr.yp.to/syncookies.html (overview) + * http://cr.yp.to/syncookies/archive (details) + * + * + * Schematic construction of a syncookie enabled Initial Sequence Number: + * 0 1 2 3 + * 12345678901234567890123456789012 + * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| + * + * x 24 MAC (truncated) + * W 3 Send Window Scale index + * M 3 MSS index + * S 1 SACK permitted + * P 1 Odd/even secret */ -static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; -static void -syncookie_generate(struct syncache_head *sch, struct syncache *sc, - u_int32_t *flowlabel) +/* + * Distribution and probability of certain MSS values. Those in between are + * rounded down to the next lower one. + * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] + * .2% .3% 5% 7% 7% 20% 15% 45% + */ +static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; + +/* + * Distribution and probability of certain WSCALE values. We have to map the + * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 + * bits based on prevalence of certain values. Where we don't have an exact + * match for are rounded down to the next lower one letting us under-estimate + * the true available window. At the moment this would happen only for the + * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer + * and window size). The absence of the WSCALE option (no scaling in either + * direction) is encoded with index zero. + * [WSCALE values histograms, Allman, 2012] + * X 10 10 35 5 6 14 10% by host + * X 11 4 5 5 18 49 3% by connections + */ +static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; + +/* + * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed + * and good cryptographic properties. + */ +static uint32_t +syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, + uint8_t *secbits, uintptr_t secmod) { - MD5_CTX ctx; - u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; - u_int32_t data; - u_int32_t *secbits; - u_int off, pmss, mss; - int i; + SIPHASH_CTX ctx; + uint32_t siphash[2]; + + SipHash24_Init(&ctx); + SipHash_SetKey(&ctx, secbits); + switch (inc->inc_flags & INC_ISIPV6) { +#ifdef INET + case 0: + SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); + SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); + break; +#endif +#ifdef INET6 + case INC_ISIPV6: + SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); + SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); + break; +#endif + } + SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); + SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); + SipHash_Update(&ctx, &irs, sizeof(irs)); + SipHash_Update(&ctx, &flags, sizeof(flags)); + SipHash_Update(&ctx, &secmod, sizeof(secmod)); + SipHash_Final((u_int8_t *)&siphash, &ctx); + + return (siphash[0] ^ siphash[1]); +} + +static tcp_seq +syncookie_generate(struct syncache_head *sch, struct syncache *sc) +{ + u_int i, mss, secbit, wscale; + uint32_t iss, hash; + uint8_t *secbits; + union syncookie cookie; SCH_LOCK_ASSERT(sch); - /* Which of the two secrets to use. */ - secbits = sch->sch_oddeven ? - sch->sch_secbits_odd : sch->sch_secbits_even; - - /* Reseed secret if too old. */ - if (sch->sch_reseed < time_uptime) { - sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ - secbits = sch->sch_oddeven ? 
- sch->sch_secbits_odd : sch->sch_secbits_even; - for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) - secbits[i] = arc4random(); - sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; - } + cookie.cookie = 0; - /* Secret rotation offset. */ - off = sc->sc_iss & 0x7; /* iss was randomized before */ + /* Map our computed MSS into the 3-bit index. */ + mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); + for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; + i--) + ; + cookie.flags.mss_idx = i; - /* Maximum segment size calculation. */ - pmss = - max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss); - for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) - if (tcp_sc_msstab[mss] <= pmss) - break; + /* + * Map the send window scale into the 3-bit index but only if + * the wscale option was received. + */ + if (sc->sc_flags & SCF_WINSCALE) { + wscale = sc->sc_requested_s_scale; + for (i = nitems(tcp_sc_wstab) - 1; + tcp_sc_wstab[i] > wscale && i > 0; + i--) + ; + cookie.flags.wscale_idx = i; + } - /* Fold parameters and MD5 digest into the ISN we will send. */ - data = sch->sch_oddeven;/* odd or even secret, 1 bit */ - data |= off << 1; /* secret offset, derived from iss, 3 bits */ - data |= mss << 4; /* mss, 3 bits */ + /* Can we do SACK? */ + if (sc->sc_flags & SCF_SACK) + cookie.flags.sack_ok = 1; - MD5Init(&ctx); - MD5Update(&ctx, ((u_int8_t *)secbits) + off, - SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); - MD5Update(&ctx, secbits, off); - MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); - MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); - MD5Update(&ctx, &data, sizeof(data)); - MD5Final((u_int8_t *)&md5_buffer, &ctx); + /* Which of the two secrets to use. */ + secbit = sch->sch_sc->secret.oddeven & 0x1; + cookie.flags.odd_even = secbit; - data |= (md5_buffer[0] << 7); - sc->sc_iss = data; + secbits = sch->sch_sc->secret.key[secbit]; + hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, + (uintptr_t)sch); -#ifdef INET6 - *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; -#endif + /* + * Put the flags into the hash and XOR them to get better ISS number + * variance. This doesn't enhance the cryptographic strength and is + * done to prevent the 8 cookie bits from showing up directly on the + * wire. + */ + iss = hash & ~0xff; + iss |= cookie.cookie ^ (hash >> 24); - /* Additional parameters are stored in the timestamp if present. */ + /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { - data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ - data |= ((sc->sc_flags & SCF_SACK) ? 
1 : 0) << 1; /* SACK, 1 bit */ - data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ - data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ - data |= md5_buffer[2] << 10; /* more digest bits */ - data ^= md5_buffer[3]; - sc->sc_ts = data; - sc->sc_tsoff = data - tcp_ts_getticks(); /* after XOR */ + sc->sc_ts = arc4random(); + sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); + return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcpopt *to, struct tcphdr *th, - struct socket *so) + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso) { - MD5_CTX ctx; - u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; - u_int32_t data = 0; - u_int32_t *secbits; + uint32_t hash; + uint8_t *secbits; tcp_seq ack, seq; - int off, mss, wnd, flags; + int wnd, wscale = 0; + union syncookie cookie; SCH_LOCK_ASSERT(sch); /* - * Pull information out of SYN-ACK/ACK and - * revert sequence number advances. + * Pull information out of SYN-ACK/ACK and revert sequence number + * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; - off = (ack >> 1) & 0x7; - mss = (ack >> 4) & 0x7; - flags = ack & 0x7f; - - /* Which of the two secrets to use. */ - secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; /* - * The secret wasn't updated for the lifetime of a syncookie, - * so this SYN-ACK/ACK is either too old (replay) or totally bogus. + * Unpack the flags containing enough information to restore the + * connection. */ - if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) { - return (NULL); - } + cookie.cookie = (ack & 0xff) ^ (ack >> 24); - /* Recompute the digest so we can compare it. */ - MD5Init(&ctx); - MD5Update(&ctx, ((u_int8_t *)secbits) + off, - SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); - MD5Update(&ctx, secbits, off); - MD5Update(&ctx, inc, sizeof(*inc)); - MD5Update(&ctx, &seq, sizeof(seq)); - MD5Update(&ctx, &flags, sizeof(flags)); - MD5Final((u_int8_t *)&md5_buffer, &ctx); - - /* Does the digest part of or ACK'ed ISS match? */ - if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) - return (NULL); + /* Which of the two secrets to use. */ + secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; - /* Does the digest part of our reflected timestamp match? */ - if (to->to_flags & TOF_TS) { - data = md5_buffer[3] ^ to->to_tsecr; - if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) - return (NULL); - } + hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); + + /* The recomputed hash matches the ACK if this was a genuine cookie. */ + if ((ack & ~0xff) != (hash & ~0xff)) + return (NULL); /* Fill in the syncache values. */ + sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; + switch (inc->inc_flags & INC_ISIPV6) { +#ifdef INET + case 0: + sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; + sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; + break; +#endif #ifdef INET6 - if (inc->inc_flags & INC_ISIPV6) { - if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL) - sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; - } else + case INC_ISIPV6: + if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) + sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; + break; #endif - { - sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; - sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; } - /* Additional parameters that were encoded in the timestamp. 
*/ - if (data) { + sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; + + /* We can simply recompute the receive window scale we sent earlier. */ + while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) + wscale++; + + /* Only use wscale if it was enabled in the original SYN. */ + if (cookie.flags.wscale_idx > 0) { + sc->sc_requested_r_scale = wscale; + sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; + sc->sc_flags |= SCF_WINSCALE; + } + + wnd = sbspace(&lso->so_rcv); + wnd = imax(wnd, 0); + wnd = imin(wnd, TCP_MAXWIN); + sc->sc_wnd = wnd; + + if (cookie.flags.sack_ok) + sc->sc_flags |= SCF_SACK; + + if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); - sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; - sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0; - sc->sc_requested_s_scale = min((data >> 2) & 0xf, - TCP_MAX_WINSHIFT); - sc->sc_requested_r_scale = min((data >> 6) & 0xf, - TCP_MAX_WINSHIFT); - if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) - sc->sc_flags |= SCF_WINSCALE; - } else - sc->sc_flags |= SCF_NOOPT; + } - wnd = sbspace(&so->so_rcv); - wnd = imax(wnd, 0); - wnd = imin(wnd, TCP_MAXWIN); - sc->sc_wnd = wnd; + if (to->to_flags & TOF_SIGNATURE) + sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; - sc->sc_peer_mss = tcp_sc_msstab[mss]; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } -/* - * Returns the current number of syncache entries. This number - * will probably change before you get around to calling - * syncache_pcblist. - */ - -int -syncache_pcbcount(void) +#ifdef INVARIANTS +static int +syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, + struct syncache *sc, struct tcphdr *th, struct tcpopt *to, + struct socket *lso) { - struct syncache_head *sch; - int count, i; + struct syncache scs, *scx; + char *s; - for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { - /* No need to lock for a read. */ - sch = &V_tcp_syncache.hashbase[i]; - count += sch->sch_length; + bzero(&scs, sizeof(scs)); + scx = syncookie_lookup(inc, sch, &scs, th, to, lso); + + if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) + return (0); + + if (scx != NULL) { + if (sc->sc_peer_mss != scx->sc_peer_mss) + log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", + s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); + + if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) + log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", + s, __func__, sc->sc_requested_r_scale, + scx->sc_requested_r_scale); + + if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) + log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", + s, __func__, sc->sc_requested_s_scale, + scx->sc_requested_s_scale); + + if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) + log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } - return count; + + if (s != NULL) + free(s, M_TCPLOG); + return (0); +} +#endif /* INVARIANTS */ + +static void +syncookie_reseed(void *arg) +{ + struct tcp_syncache *sc = arg; + uint8_t *secbits; + int secbit; + + /* + * Reseeding the secret doesn't have to be protected by a lock. + * It must only be ensured that the new random values are visible + * to all CPUs in an SMP environment. The atomic with release + * semantics ensures that. + */ + secbit = (sc->secret.oddeven & 0x1) ? 
0 : 1; + secbits = sc->secret.key[secbit]; + arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); + atomic_add_rel_int(&sc->secret.oddeven, 1); + + /* Reschedule ourself. */ + callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h index c55bfbcd..6b12c13a 100644 --- a/freebsd/sys/netinet/tcp_syncache.h +++ b/freebsd/sys/netinet/tcp_syncache.h @@ -41,13 +41,11 @@ void syncache_destroy(void); void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); -void syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); -void tcp_offload_syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, void *, void *); +int syncache_add(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); -int syncache_pcbcount(void); int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); struct syncache { @@ -75,7 +73,10 @@ struct syncache { #endif struct label *sc_label; /* MAC label reference */ struct ucred *sc_cred; /* cred cache for jail checks */ - +#ifdef TCP_RFC7413 + void *sc_tfo_cookie; /* for TCP Fast Open response */ +#endif + void *sc_pspare; /* TCP_SIGNATURE */ u_int32_t sc_spare[2]; /* UTO */ }; @@ -91,20 +92,23 @@ struct syncache { #define SCF_SACK 0x80 /* send SACK option */ #define SCF_ECN 0x100 /* send ECN setup packet */ -#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ -#define SYNCOOKIE_LIFETIME 16 /* seconds */ - struct syncache_head { - struct vnet *sch_vnet; struct mtx sch_mtx; TAILQ_HEAD(sch_head, syncache) sch_bucket; struct callout sch_timer; int sch_nextc; u_int sch_length; - u_int sch_oddeven; - u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; - u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; - u_int sch_reseed; /* time_uptime, seconds */ + struct tcp_syncache *sch_sc; +}; + +#define SYNCOOKIE_SECRET_SIZE 16 +#define SYNCOOKIE_LIFETIME 15 /* seconds */ + +struct syncookie_secret { + volatile u_int oddeven; + uint8_t key[2][SYNCOOKIE_SECRET_SIZE]; + struct callout reseed; + u_int lifetime; }; struct tcp_syncache { @@ -115,7 +119,20 @@ struct tcp_syncache { u_int bucket_limit; u_int cache_limit; u_int rexmt_limit; - u_int hash_secret; + uint32_t hash_secret; + struct vnet *vnet; + struct syncookie_secret secret; +}; + +/* Internal use for the syncookie functions. 
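The ISS encoding produced by syncookie_generate() and undone by syncookie_lookup() above can be checked in isolation. The following stand-alone sketch is illustrative only and not part of the patch (pack_iss() and unpack_flags() are invented names); it reproduces the 24-bit-MAC/8-bit-flags split and the XOR folding that keeps the raw cookie bits off the wire:

#include <assert.h>
#include <stdint.h>

/* Mirrors the packing in syncookie_generate(): the top 24 bits of the
 * ISS carry the truncated MAC, the low 8 bits carry the cookie flags
 * XORed with the top MAC byte. */
static uint32_t
pack_iss(uint32_t hash, uint8_t flags)
{
	return ((hash & ~0xffu) | (flags ^ (hash >> 24)));
}

/* Mirrors the unpacking in syncookie_lookup(). */
static uint8_t
unpack_flags(uint32_t ack)
{
	return ((ack & 0xff) ^ (ack >> 24));
}

int
main(void)
{
	uint32_t hash = 0xdeadbeef;	/* stand-in for syncookie_mac() */
	uint8_t flags = 0xa5;		/* stand-in for union syncookie */
	uint32_t iss = pack_iss(hash, flags);

	assert(unpack_flags(iss) == flags);		/* flags survive */
	assert((iss & ~0xffu) == (hash & ~0xffu));	/* MAC still comparable */
	return (0);
}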
*/ +union syncookie { + uint8_t cookie; + struct { + uint8_t odd_even:1, + sack_ok:1, + wscale_idx:3, + mss_idx:3; + } flags; }; #endif /* _KERNEL */ diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c index db952e42..edfc3829 100644 --- a/freebsd/sys/netinet/tcp_timer.c +++ b/freebsd/sys/netinet/tcp_timer.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -52,24 +53,40 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include +#include -#include #include +#include #include +#include #include #ifdef INET6 #include #endif #include +#include #include #include #include +#include +#ifdef INET6 +#include +#endif #include #ifdef TCPDEBUG #include #endif +int tcp_persmin; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, + &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); + +int tcp_persmax; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, + &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); + int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); @@ -121,17 +138,110 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, /* max idle probes */ int tcp_maxpersistidle; -static int tcp_rexmit_drop_options = 1; +static int tcp_rexmit_drop_options = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); +static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); +#define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, + CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_detect), 0, + "Path MTU Discovery Black Hole Detection Enabled"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); +#define V_tcp_pmtud_blackhole_activated \ + VNET(tcp_pmtud_blackhole_activated) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_activated), 0, + "Path MTU Discovery Black Hole Detection, Activation Count"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); +#define V_tcp_pmtud_blackhole_activated_min_mss \ + VNET(tcp_pmtud_blackhole_activated_min_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, + "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); + +static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); +#define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, + CTLFLAG_RD|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_failed), 0, + "Path MTU Discovery Black Hole Detection, Failure Count"); + +#ifdef INET +static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; +#define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, + CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_pmtud_blackhole_mss), 0, + "Path MTU Discovery Black Hole Detection lowered MSS"); +#endif + +#ifdef INET6 +static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; +#define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) +SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, + 
CTLFLAG_RW|CTLFLAG_VNET, + &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, + "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); +#endif + +#ifdef RSS +static int per_cpu_timers = 1; +#else static int per_cpu_timers = 0; +#endif SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, &per_cpu_timers , 0, "run tcp timers on all cpus"); +#if 0 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) +#endif + +/* + * Map the given inp to a CPU id. + * + * This queries RSS if it's compiled in, else it defaults to the current + * CPU ID. + */ +static inline int +inp_to_cpuid(struct inpcb *inp) +{ + u_int cpuid; + +#ifdef RSS + if (per_cpu_timers) { + cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); + if (cpuid == NETISR_CPUID_NONE) + return (curcpu); /* XXX */ + else + return (cpuid); + } +#else + /* Legacy, pre-RSS behaviour */ + if (per_cpu_timers) { + /* + * We don't have a flowid -> cpuid mapping, so cheat and + * just map unknown cpuids to curcpu. Not the best, but + * apparently better than defaulting to swi 0. + */ + cpuid = inp->inp_flowid % (mp_maxid + 1); + if (! CPU_ABSENT(cpuid)) + return (cpuid); + return (curcpu); + } +#endif + /* Default for RSS and non-RSS - cpuid 0 */ + else { + return (0); + } +} /* * Tcp protocol timeout routine called every 500 ms. @@ -146,9 +256,7 @@ tcp_slowtimo(void) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); (void) tcp_tw_2msl_scan(0); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -162,10 +270,6 @@ int tcp_backoff[TCP_MAXRXTSHIFT + 1] = static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ -static int tcp_timer_race; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, - 0, "Count of t_inpcb races on tcp_discardcb"); - /* * TCP timer processing. */ @@ -178,18 +282,7 @@ tcp_timer_delack(void *xtp) CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_delack) || !callout_active(&tp->t_timers->tt_delack)) { @@ -203,14 +296,65 @@ tcp_timer_delack(void *xtp) CURVNET_RESTORE(); return; } - tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } +/* + * When a timer wants to remove a TCB it must + * hold the INP_INFO_RLOCK(). The timer function + * should only have grabbed the INP_WLOCK() when + * it entered. To safely switch to holding both the + * INP_INFO_RLOCK() and the INP_WLOCK() we must first + * grab a reference on the inp, which will hold the inp + * so that it can't be removed. We then unlock the INP_WLOCK(), + * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() + * we proceed again to get the INP_WLOCK() (this preserves proper + * lock order). After acquiring the INP_WLOCK we must check if someone + * else deleted the pcb i.e. the inp_flags check. + * If so we return 1 otherwise we return 0. 
+ * + * No matter what the tcp_inpinfo_lock_add() function + * returns, the caller must afterwards call tcp_inpinfo_lock_del() + * to drop the locks and reference properly. + */ + +int +tcp_inpinfo_lock_add(struct inpcb *inp) +{ + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + return(1); + } + return(0); + +} + +void +tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) +{ + INP_INFO_RUNLOCK(&V_tcbinfo); + if (inp && (tp == NULL)) { + /* + * If tcp_close/drop() gets called and tp + * returns NULL, then the function dropped + * the inp lock; we hold a reference keeping + * this around, so we must re-acquire the + * INP_WLOCK() in order to proceed with + * our dropping the inp reference. + */ + INP_WLOCK(inp); + } + if (inp && in_pcbrele_wlocked(inp) == 0) + INP_WUNLOCK(inp); +} + void tcp_timer_2msl(void *xtp) { @@ -222,62 +366,66 @@ tcp_timer_2msl(void *xtp) ostate = tp->t_state; #endif - /* - * XXXRW: Does this actually happen? - */ - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); tcp_free_sackholes(tp); if (callout_pending(&tp->t_timers->tt_2msl) || !callout_active(&tp->t_timers->tt_2msl)) { INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle - * too long, or if 2MSL time is up from TIME_WAIT, delete connection - * control block. Otherwise, check again in a bit. + * too long, delete the connection control block. Otherwise, check + * again in a bit. + * + * If in TIME_WAIT state just ignore as this timeout is handled in + * tcp_tw_2msl_scan(). * * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. 
*/ + if ((inp->inp_flags & INP_TIMEWAIT) != 0) { + INP_WUNLOCK(inp); + CURVNET_RESTORE(); + return; + } if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_close(tp); + tcp_inpinfo_lock_del(inp, tp); + goto out; } else { - if (tp->t_state != TCPS_TIME_WAIT && - ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) - callout_reset_on(&tp->t_timers->tt_2msl, - TP_KEEPINTVL(tp), tcp_timer_2msl, tp, INP_CPU(inp)); - else - tp = tcp_close(tp); + if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { + callout_reset(&tp->t_timers->tt_2msl, + TP_KEEPINTVL(tp), tcp_timer_2msl, tp); + } else { + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } + tp = tcp_close(tp); + tcp_inpinfo_lock_del(inp, tp); + goto out; + } } #ifdef TCPDEBUG @@ -285,9 +433,11 @@ tcp_timer_2msl(void *xtp) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); +out: CURVNET_RESTORE(); } @@ -303,36 +453,23 @@ tcp_timer_keep(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_keep) || !callout_active(&tp->t_timers->tt_keep)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * Keep-alive timer went off; send something * or drop connection if idle for too long. 
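For a sense of the timescale the keep-alive knobs imply: with the TP_KEEPIDLE()/TP_KEEPINTVL()/TP_KEEPCNT() macros from tcp_timer.h (further below in this patch) and the stock defaults assumed here (2 hours of idle time, 75-second probe interval, 8 probes), the point at which an unresponsive peer is dropped works out as in this illustrative sketch:

#include <stdio.h>

/* Assumed stock defaults, in seconds (the kernel keeps these values
 * in ticks; hz is factored out here for readability). */
#define KEEPIDLE	(2 * 60 * 60)	/* idle time before first probe */
#define KEEPINTVL	75		/* interval between probes */
#define KEEPCNT		8		/* unanswered probes tolerated */

int
main(void)
{
	/* TP_MAXIDLE() is TP_KEEPCNT() * TP_KEEPINTVL(); a connection is
	 * dropped once it has been idle for KEEPIDLE plus that budget. */
	int maxidle = KEEPCNT * KEEPINTVL;

	printf("first probe after %d s, drop after %d s total\n",
	    KEEPIDLE, KEEPIDLE + maxidle);
	return (0);
}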
@@ -364,24 +501,29 @@ tcp_timer_keep(void *xtp) tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } - callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), - tcp_timer_keep, tp, INP_CPU(inp)); + callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), + tcp_timer_keep, tp); } else - callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), - tcp_timer_keep, tp, INP_CPU(inp)); + callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), + tcp_timer_keep, tp); #ifdef TCPDEBUG if (inp->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; dropit: TCPSTAT_INC(tcps_keepdrops); + + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -389,9 +531,9 @@ dropit: tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + tcp_inpinfo_lock_del(inp, tp); +out: CURVNET_RESTORE(); } @@ -406,38 +548,25 @@ tcp_timer_persist(void *xtp) ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_persist) || !callout_active(&tp->t_timers->tt_persist)) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* - * Persistance timer into zero window. + * Persistence timer into zero window. * Force a byte to be output, if possible. 
*/ TCPSTAT_INC(tcps_persisttimeo); @@ -452,7 +581,12 @@ tcp_timer_persist(void *xtp) (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); + tcp_inpinfo_lock_del(inp, tp); goto out; } /* @@ -462,22 +596,26 @@ tcp_timer_persist(void *xtp) if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; + } tp = tcp_drop(tp, ETIMEDOUT); + tcp_inpinfo_lock_del(inp, tp); goto out; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; -out: #ifdef TCPDEBUG if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_WUNLOCK(inp); +out: CURVNET_RESTORE(); } @@ -487,44 +625,34 @@ tcp_timer_rexmt(void * xtp) struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; - int headlocked; struct inpcb *inp; #ifdef TCPDEBUG int ostate; ostate = tp->t_state; #endif - INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_RUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); INP_WLOCK(inp); if (callout_pending(&tp->t_timers->tt_rexmt) || !callout_active(&tp->t_timers->tt_rexmt)) { INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } + KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, + ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); + if (tp->t_fb->tfb_tcp_rexmit_tmr) { + /* The stack has a timer action too. */ + (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); + } /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off @@ -533,30 +661,15 @@ tcp_timer_rexmt(void * xtp) if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - in_pcbref(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; + if (tcp_inpinfo_lock_add(inp)) { + tcp_inpinfo_lock_del(inp, tp); + goto out; } - tp = tcp_drop(tp, tp->t_softerror ? 
tp->t_softerror : ETIMEDOUT); - headlocked = 1; + tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RUNLOCK(&V_tcbinfo); - headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be @@ -589,12 +702,120 @@ tcp_timer_rexmt(void * xtp) } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); - if (tp->t_state == TCPS_SYN_SENT) + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + + /* + * We enter the PLMTUD path if the connection is established or in + * FIN_WAIT_1 state; the reason for the latter is that if the amount + * of data we send is very small, we could send it in a couple of + * packets and proceed straight to FIN. In that case we won't catch + * the ESTABLISHED state. + */ + if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) + || (tp->t_state == TCPS_FIN_WAIT_1))) { +#ifdef INET6 + int isipv6; +#endif + + /* + * The idea here is that each stage of the MTU probe (usually + * 1448 -> 1188 -> 524) should be given 2 chances to recover + * before further clamping down. 'tp->t_rxtshift % 2 == 0' + * should take care of that. + */ + if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == + (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && + (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { + /* + * Enter Path MTU Black-hole Detection mechanism: + * - Disable Path MTU Discovery (IP "DF" bit). + * - Reduce MTU to a lower value than what we + * negotiated with the peer. + */ + /* Record that we may have found a black hole. */ + tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; + + /* Keep track of previous MSS. */ + tp->t_pmtud_saved_maxseg = tp->t_maxseg; + + /* + * Reduce the MSS to blackhole value or to the default + * in an attempt to retransmit. + */ +#ifdef INET6 + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; + if (isipv6 && + tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; + V_tcp_pmtud_blackhole_activated++; + } else if (isipv6) { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_v6mssdflt; + /* + * Disable Path MTU Discovery when we switch to + * minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + V_tcp_pmtud_blackhole_activated_min_mss++; + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { + /* Use the sysctl tuneable blackhole MSS. */ + tp->t_maxseg = V_tcp_pmtud_blackhole_mss; + V_tcp_pmtud_blackhole_activated++; + } else { + /* Use the default MSS. */ + tp->t_maxseg = V_tcp_mssdflt; + /* + * Disable Path MTU Discovery when we switch to + * minmss. + */ + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + V_tcp_pmtud_blackhole_activated_min_mss++; + } +#endif + /* + * Reset the slow-start flight size + * as it may depend on the new MSS. + */ + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); + } else { + /* + * If further retransmissions are still unsuccessful + * with a lowered MTU, maybe this isn't a blackhole and + * we restore the previous MSS and blackhole detection + * flags. + * The limit '6' is determined by giving each probe + * stage (1448, 1188, 524) 2 chances to recover. 
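The clamping schedule described in the two comments above can be traced end-to-end with a toy model. This sketch is illustrative only and not part of the patch: 1448 is an arbitrary starting MSS, and 1200 and 536 are the assumed defaults of net.inet.tcp.pmtud_blackhole_mss and V_tcp_mssdflt for IPv4.

#include <stdio.h>

#define BLACKHOLE_MSS	1200	/* assumed pmtud_blackhole_mss default */
#define MSS_DFLT	536	/* assumed V_tcp_mssdflt */

int
main(void)
{
	int maxseg = 1448, saved = 0, pmtud = 1;

	for (int rxtshift = 1; rxtshift <= 7; rxtshift++) {
		if (pmtud && rxtshift >= 2 && rxtshift % 2 == 0) {
			/* Every second timeout: clamp one stage down. */
			saved = maxseg;		/* t_pmtud_saved_maxseg */
			if (maxseg > BLACKHOLE_MSS) {
				maxseg = BLACKHOLE_MSS;
			} else {
				maxseg = MSS_DFLT;
				pmtud = 0;	/* TF2_PLPMTU_PMTUD off */
			}
		} else if (rxtshift > 6 && saved != 0) {
			/* Still no progress: not a blackhole, restore. */
			maxseg = saved;
			saved = 0;
		}
		printf("rxtshift %d -> maxseg %d\n", rxtshift, maxseg);
	}
	return (0);
}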
+ */ + if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && + (tp->t_rxtshift > 6)) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; + tp->t_maxseg = tp->t_pmtud_saved_maxseg; + V_tcp_pmtud_blackhole_failed++; + /* + * Reset the slow-start flight size as it + * may depend on the new MSS. + */ + if (CC_ALGO(tp)->conn_init != NULL) + CC_ALGO(tp)->conn_init(tp->ccv); + } + } + } + /* * Disable RFC1323 and SACK if we haven't got any response to * our third SYN to work-around some broken terminal servers @@ -615,7 +836,9 @@ tcp_timer_rexmt(void * xtp) #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); + else #endif + in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } @@ -632,34 +855,35 @@ tcp_timer_rexmt(void * xtp) cc_cong_signal(tp, NULL, CC_RTO); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); -out: #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, PRU_SLOWTIMO); #endif - if (tp != NULL) - INP_WUNLOCK(inp); - if (headlocked) - INP_INFO_WUNLOCK(&V_tcbinfo); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_WUNLOCK(inp); +out: CURVNET_RESTORE(); } void -tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) +tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) { struct callout *t_callout; - void *f_callout; + timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; - int cpu = INP_CPU(inp); + int cpu = inp_to_cpuid(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif + if (tp->t_timers->tt_flags & TT_STOPPED) + return; + switch (timer_type) { case TT_DELACK: t_callout = &tp->t_timers->tt_delack; @@ -682,7 +906,11 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) f_callout = tcp_timer_2msl; break; default: - panic("bad timer_type"); + if (tp->t_fb->tfb_tcp_timer_activate) { + tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); + return; + } + panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { callout_stop(t_callout); @@ -692,7 +920,7 @@ tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) } int -tcp_timer_active(struct tcpcb *tp, int timer_type) +tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) { struct callout *t_callout; @@ -713,28 +941,79 @@ tcp_timer_active(struct tcpcb *tp, int timer_type) t_callout = &tp->t_timers->tt_2msl; break; default: - panic("bad timer_type"); + if (tp->t_fb->tfb_tcp_timer_active) { + return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); + } + panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); } +void +tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) +{ + struct callout *t_callout; + + tp->t_timers->tt_flags |= TT_STOPPED; + switch (timer_type) { + case TT_DELACK: + t_callout = &tp->t_timers->tt_delack; + break; + case TT_REXMT: + t_callout = &tp->t_timers->tt_rexmt; + break; + case TT_PERSIST: + t_callout = &tp->t_timers->tt_persist; + break; + case TT_KEEP: + t_callout = &tp->t_timers->tt_keep; + break; + case TT_2MSL: + t_callout = &tp->t_timers->tt_2msl; + break; + default: + if (tp->t_fb->tfb_tcp_timer_stop) { + /* + * XXXrrs we need to look at this with the + * stop case below (flags). 
+ */ + tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); + return; + } + panic("tp %p bad timer_type %#x", tp, timer_type); + } + + if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { + /* + * Can't stop the callout, so defer the actual tcpcb + * deletion to the last timer. We do this using the + * async drain function and incrementing the count in + * tt_draincnt. + */ + tp->t_timers->tt_draincnt++; + } +} + #define ticks_to_msecs(t) (1000*(t) / hz) void -tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, struct xtcp_timer *xtimer) +tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, + struct xtcp_timer *xtimer) { - bzero(xtimer, sizeof(struct xtcp_timer)); + sbintime_t now; + + bzero(xtimer, sizeof(*xtimer)); if (timer == NULL) return; + now = getsbinuptime(); if (callout_active(&timer->tt_delack)) - xtimer->tt_delack = ticks_to_msecs(timer->tt_delack.c_time - ticks); + xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_rexmt)) - xtimer->tt_rexmt = ticks_to_msecs(timer->tt_rexmt.c_time - ticks); + xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_persist)) - xtimer->tt_persist = ticks_to_msecs(timer->tt_persist.c_time - ticks); + xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_keep)) - xtimer->tt_keep = ticks_to_msecs(timer->tt_keep.c_time - ticks); + xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; if (callout_active(&timer->tt_2msl)) - xtimer->tt_2msl = ticks_to_msecs(timer->tt_2msl.c_time - ticks); + xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); } diff --git a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h index 0da58fd8..bb78062d 100644 --- a/freebsd/sys/netinet/tcp_timer.h +++ b/freebsd/sys/netinet/tcp_timer.h @@ -76,9 +76,8 @@ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ -#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ -#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */ +#define TCPTV_PERSMIN ( 5*hz) /* minimum persist interval */ #define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ #define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ @@ -122,7 +121,7 @@ #ifdef TCPTIMERS static const char *tcptimers[] = - { "REXMT", "PERSIST", "KEEP", "2MSL" }; + { "REXMT", "PERSIST", "KEEP", "2MSL", "DELACK" }; #endif /* @@ -146,12 +145,27 @@ struct tcp_timer { struct callout tt_keep; /* keepalive */ struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ struct callout tt_delack; /* delayed ACK timer */ + uint32_t tt_flags; /* Timers flags */ + uint32_t tt_draincnt; /* Count being drained */ }; -#define TT_DELACK 0x01 -#define TT_REXMT 0x02 -#define TT_PERSIST 0x04 -#define TT_KEEP 0x08 -#define TT_2MSL 0x10 + +/* + * Flags for the tt_flags field. + */ +#define TT_DELACK 0x0001 +#define TT_REXMT 0x0002 +#define TT_PERSIST 0x0004 +#define TT_KEEP 0x0008 +#define TT_2MSL 0x0010 +#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) + +#define TT_DELACK_RST 0x0100 +#define TT_REXMT_RST 0x0200 +#define TT_PERSIST_RST 0x0400 +#define TT_KEEP_RST 0x0800 +#define TT_2MSL_RST 0x1000 + +#define TT_STOPPED 0x00010000 #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) @@ -159,6 +173,8 @@ struct tcp_timer { #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? 
(tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) +extern int tcp_persmin; /* minimum persist interval */ +extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ @@ -170,14 +186,19 @@ extern int tcp_rexmit_slop; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; +extern int tcp_syn_backoff[]; extern int tcp_finwait2_timeout; extern int tcp_fast_finwait2_recycle; +int tcp_inpinfo_lock_add(struct inpcb *inp); +void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp); + void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); +void tcp_timer_discard(void *); struct tcptw * - tcp_tw_2msl_scan(int _reuse); /* XXX temporary */ + tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c index 9034fab4..330e842e 100644 --- a/freebsd/sys/netinet/tcp_timewait.c +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include @@ -93,20 +94,41 @@ __FBSDID("$FreeBSD$"); #include static VNET_DEFINE(uma_zone_t, tcptw_zone); -#define V_tcptw_zone VNET(tcptw_zone) +#define V_tcptw_zone VNET(tcptw_zone) static int maxtcptw; /* * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global - * tcbinfo lock, which must be held over queue iteration and modification. + * timewait lock, which must be held over queue iteration and modification. 
+ * + * Rules on tcptw usage: + * - an inpcb is always freed _after_ its tcptw + * - a tcptw relies on its inpcb reference counting for memory stability + * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); -#define V_twq_2msl VNET(twq_2msl) +#define V_twq_2msl VNET(twq_2msl) + +/* Global timewait lock */ +static VNET_DEFINE(struct rwlock, tw_lock); +#define V_tw_lock VNET(tw_lock) + +#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) +#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) +#define TW_RLOCK(tw) rw_rlock(&(tw)) +#define TW_WLOCK(tw) rw_wlock(&(tw)) +#define TW_RUNLOCK(tw) rw_runlock(&(tw)) +#define TW_WUNLOCK(tw) rw_wunlock(&(tw)) +#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) +#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) +#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) +#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) static void tcp_tw_2msl_reset(struct tcptw *, int); -static void tcp_tw_2msl_stop(struct tcptw *); +static void tcp_tw_2msl_stop(struct tcptw *, int); +static int tcp_twrespond(struct tcptw *, int); static int tcptw_auto_size(void) @@ -149,7 +171,7 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, VNET_DEFINE(int, nolocaltimewait) = 0; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), 0, "Do not create compressed TCP TIME_WAIT entries for local connections"); @@ -166,13 +188,14 @@ tcp_tw_init(void) { V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); if (maxtcptw == 0) uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); + TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE @@ -181,11 +204,12 @@ tcp_tw_destroy(void) { struct tcptw *tw; - INP_INFO_WLOCK(&V_tcbinfo); - while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + INP_INFO_RLOCK(&V_tcbinfo); + while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif @@ -206,7 +230,7 @@ tcp_twstart(struct tcpcb *tp) int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait) { @@ -229,8 +253,23 @@ tcp_twstart(struct tcpcb *tp) } } + + /* + * For use only by DTrace. We do not reference the state + * after this point so modifying it in place is not a problem. + */ + tcp_state_change(tp, TCPS_TIME_WAIT); + tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { + /* + * Reached limit on total number of TIMEWAIT connections + * allowed. Remove a connection from TIMEWAIT queue in LRU + * fashion to make room for this connection. + * + * XXX: Check if it is possible to always have enough room + * in advance based on guarantees provided by uma_zalloc(). 
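The cap that makes this LRU recycling necessary is set by uma_zone_set_max() above: the zone is bounded by the net.inet.tcp.maxtcptw tunable, or by tcptw_auto_size() when the tunable is 0. The body of tcptw_auto_size() is not part of this hunk; the sketch below is an assumed reading of such a heuristic (all constants invented), shown only to make the sizing trade-off concrete: bound TIME_WAIT entries by half the ephemeral port range and by a fraction of maxsockets.

#include <stdio.h>

/* Assumed inputs; in the kernel these come from sysctls/tunables. */
#define PORT_FIRSTAUTO	10000	/* net.inet.ip.portrange.first */
#define PORT_LASTAUTO	65535	/* net.inet.ip.portrange.last */
#define MAXSOCKETS	25600	/* kern.ipc.maxsockets */

static int
tcptw_auto_size_sketch(void)
{
	/* Half the ephemeral range, so TIME_WAIT entries cannot tie up
	 * every local port... */
	int halfrange = (PORT_LASTAUTO - PORT_FIRSTAUTO) / 2;
	/* ...and a fraction of maxsockets, so they cannot starve live
	 * connections. */
	int cap = MAXSOCKETS / 5;

	return (halfrange < cap ? halfrange : cap);
}

int
main(void)
{
	printf("tcptw zone cap: %d entries\n", tcptw_auto_size_sketch());
	return (0);
}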
+ */ tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); @@ -239,7 +278,12 @@ tcp_twstart(struct tcpcb *tp) return; } } + /* + * The tcptw will hold a reference on its inpcb until tcp_twclose + * is called + */ tw->tw_inpcb = inp; + in_pcbref(inp); /* Reference from tw */ /* * Recover last window size sent. @@ -313,53 +357,19 @@ tcp_twstart(struct tcpcb *tp) INP_WUNLOCK(inp); } -#if 0 -/* - * The appromixate rate of ISN increase of Microsoft TCP stacks; - * the actual rate is slightly higher due to the addition of - * random positive increments. - * - * Most other new OSes use semi-randomized ISN values, so we - * do not need to worry about them. - */ -#define MS_ISN_BYTES_PER_SECOND 250000 - -/* - * Determine if the ISN we will generate has advanced beyond the last - * sequence number used by the previous connection. If so, indicate - * that it is safe to recycle this tw socket by returning 1. - */ -int -tcp_twrecycleable(struct tcptw *tw) -{ - tcp_seq new_iss = tw->iss; - tcp_seq new_irs = tw->irs; - - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); - new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); - - if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) - return (1); - else - return (0); -} -#endif - /* * Returns 1 if the TIME_WAIT state was killed and we should start over, * looking for a pcb in the listen state. Returns 0 otherwise. */ int -tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, +tcp_twcheck(struct inpcb *inp, struct tcpopt *to __unused, struct tcphdr *th, struct mbuf *m, int tlen) { struct tcptw *tw; int thflags; tcp_seq seq; - /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -460,11 +470,10 @@ tcp_twclose(struct tcptw *tw, int reuse) inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); - tw->tw_inpcb = NULL; - tcp_tw_2msl_stop(tw); + tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); @@ -493,17 +502,17 @@ tcp_twclose(struct tcptw *tw, int reuse) */ INP_WUNLOCK(inp); } - } else + } else { + /* + * The socket has been already cleaned-up for us, only free the + * inpcb. + */ in_pcbfree(inp); + } TCPSTAT_INC(tcps_closed); - crfree(tw->tw_cred); - tw->tw_cred = NULL; - if (reuse) - return; - uma_zfree(V_tcptw_zone, tw); } -int +static int tcp_twrespond(struct tcptw *tw, int flags) { struct inpcb *inp = tw->tw_inpcb; @@ -525,7 +534,7 @@ tcp_twrespond(struct tcptw *tw, int flags) INP_WLOCK_ASSERT(inp); - m = m_gethdr(M_DONTWAIT, MT_DATA); + m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_data += max_linkhdr; @@ -596,9 +605,9 @@ tcp_twrespond(struct tcptw *tw, int flags) m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); - ip->ip_len = m->m_pkthdr.len; + ip->ip_len = htons(m->m_pkthdr.len); if (V_path_mtu_discovery) - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); error = ip_output(m, inp->inp_options, NULL, ((tw->tw_so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), NULL, inp); @@ -616,36 +625,114 @@ static void tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); + + TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); + TW_WUNLOCK(V_tw_lock); } static void -tcp_tw_2msl_stop(struct tcptw *tw) +tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { + struct ucred *cred; + struct inpcb *inp; + int released; + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TW_WLOCK(V_tw_lock); + inp = tw->tw_inpcb; + tw->tw_inpcb = NULL; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); + cred = tw->tw_cred; + tw->tw_cred = NULL; + TW_WUNLOCK(V_tw_lock); + + if (cred != NULL) + crfree(cred); + + released = in_pcbrele_wlocked(inp); + KASSERT(!released, ("%s: inp should not be released here", __func__)); + + if (!reuse) + uma_zfree(V_tcptw_zone, tw); + TCPSTATES_DEC(TCPS_TIME_WAIT); } struct tcptw * tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; + struct inpcb *inp; + +#ifdef INVARIANTS + if (reuse) { + /* + * Exclusive pcbinfo lock is not required in the reuse case even + * if two inpcb locks can be acquired simultaneously: + * - the inpcb transitioning to TIME_WAIT state in + * tcp_twstart(), + * - the inpcb closed by tcp_twclose(). + * + * This is because only inpcbs in the FIN_WAIT2 or CLOSING + * states can transition to TIME_WAIT state. Hence an inpcb + * cannot be on the TIME_WAIT list and transitioning to + * TIME_WAIT state at the same time. + */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + } +#endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); for (;;) { + TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) + if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { + TW_RUNLOCK(V_tw_lock); break; - INP_WLOCK(tw->tw_inpcb); - tcp_twclose(tw, reuse); - if (reuse) - return (tw); + } + KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", + __func__)); + + inp = tw->tw_inpcb; + in_pcbref(inp); + TW_RUNLOCK(V_tw_lock); + + if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { + + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + KASSERT(tw == NULL, ("%s: held last inp " + "reference but tw not NULL", __func__)); + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } + + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } + + tcp_twclose(tw, reuse); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (reuse) + return tw; + } else { + /* INP_INFO lock is busy, continue later. 
*/ + INP_WLOCK(inp); + if (!in_pcbrele_wlocked(inp)) + INP_WUNLOCK(inp); + break; + } } - return (NULL); + + return NULL; } diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c index 61711a6e..d5fa680f 100644 --- a/freebsd/sys/netinet/tcp_usrreq.c +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -66,11 +67,12 @@ __FBSDID("$FreeBSD$"); #endif #include +#include #include #include -#include #include +#include #include #include #include @@ -81,11 +83,19 @@ __FBSDID("$FreeBSD$"); #include #include #endif +#ifdef TCP_RFC7413 +#include +#endif +#include #include #include #include #include #include +#include +#ifdef TCPPCAP +#include +#endif #ifdef TCPDEBUG #include #endif @@ -147,6 +157,7 @@ tcp_usr_attach(struct socket *so, int proto, struct thread *td) tp = intotcpcb(inp); out: TCPDEBUG2(PRU_ATTACH); + TCP_PROBE2(debug__user, tp, PRU_ATTACH); return error; } @@ -164,7 +175,7 @@ tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); @@ -184,6 +195,21 @@ tcp_detach(struct socket *so, struct inpcb *inp) * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? + * + * Astute question indeed, from the tcptw perspective there are + * three cases to consider: + * + * #1 tcp_detach is called at tcptw creation time by + * tcp_twstart, then do not discard the newly created tcptw + * and leave inpcb present until timewait ends + * #2 tcp_detach is called at timewait end (or reuse) by + * tcp_twclose, then the tcptw has already been discarded + * (or reused) and inpcb is freed here + * #3 tcp_detach is called after timewait ends (or reuse) + * (e.g. by soclose), then tcptw has already been discarded + * (or reused) and inpcb is freed here + * + * In all three cases the tcptw should not be freed here. 
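Condensed into code, the three cases enumerated above collapse to one flag test, because cases #2 and #3 are indistinguishable by the time tcp_detach() runs. A runnable toy model follows (the flag values are invented for illustration; the real INP_* flags live in in_pcb.h):

#include <stdio.h>

#define INP_TIMEWAIT	0x1	/* assumed illustrative flag values */
#define INP_DROPPED	0x2

static const char *
detach_case(int inp_flags)
{
	if (!(inp_flags & INP_TIMEWAIT))
		return ("not timewait: normal detach path");
	if (inp_flags & INP_DROPPED)
		return ("#2/#3: tcptw already gone, inpcb freed here");
	return ("#1: leave the inpcb alone until timewait ends");
}

int
main(void)
{
	printf("%s\n", detach_case(INP_TIMEWAIT));
	printf("%s\n", detach_case(INP_TIMEWAIT | INP_DROPPED));
	printf("%s\n", detach_case(0));
	return (0);
}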
*/ if (inp->inp_flags & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " @@ -227,15 +253,20 @@ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; + int rlock = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + if (!INP_INFO_WLOCKED(&V_tcbinfo)) { + INP_INFO_RLOCK(&V_tcbinfo); + rlock = 1; + } INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + if (rlock) + INP_INFO_RUNLOCK(&V_tcbinfo); } #ifdef INET @@ -276,6 +307,7 @@ tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); + TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); @@ -336,6 +368,7 @@ tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); + TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } @@ -369,7 +402,7 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { - tp->t_state = TCPS_LISTEN; + tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) @@ -378,8 +411,13 @@ tcp_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) + tp->t_tfo_pending = tcp_fastopen_alloc_counter(); +#endif out: TCPDEBUG2(PRU_LISTEN); + TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } @@ -414,7 +452,7 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { - tp->t_state = TCPS_LISTEN; + tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) @@ -423,8 +461,13 @@ tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) } SOCK_UNLOCK(so); +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) + tp->t_tfo_pending = tcp_fastopen_alloc_counter(); +#endif out: TCPDEBUG2(PRU_LISTEN); + TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } @@ -462,8 +505,12 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (inp->inp_flags & INP_TIMEWAIT) { + error = EADDRINUSE; + goto out; + } + if (inp->inp_flags & INP_DROPPED) { + error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); @@ -477,9 +524,10 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); + TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } @@ -509,8 +557,12 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; + if (inp->inp_flags & INP_TIMEWAIT) { + error = EADDRINUSE; + goto out; + } + if (inp->inp_flags & INP_DROPPED) { + error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); @@ 
-543,7 +595,7 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); goto out; } #endif @@ -561,10 +613,11 @@ tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); + TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } @@ -589,11 +642,13 @@ tcp_usr_disconnect(struct socket *so) int error = 0; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + if (inp->inp_flags & INP_TIMEWAIT) + goto out; + if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } @@ -602,8 +657,9 @@ tcp_usr_disconnect(struct socket *so) tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); + TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -611,13 +667,6 @@ out: /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. - * - * The rationale for acquiring the tcbinfo lock here is somewhat complicated, - * and is described in detail in the commit log entry for r175612. Acquiring - * it delays an accept(2) racing with sonewconn(), which inserts the socket - * before the inpcb address/port fields are initialized. A better fix would - * prevent the socket from being placed in the listen queue until all fields - * are fully initialized. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) @@ -634,7 +683,6 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); - INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -653,8 +701,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); + TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) *nam = in_sockaddr(port, &addr); return error; @@ -704,6 +752,7 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) out: TCPDEBUG2(PRU_ACCEPT); + TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { @@ -727,7 +776,7 @@ tcp_usr_shutdown(struct socket *so) struct tcpcb *tp = NULL; TCPDEBUG0; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); @@ -740,12 +789,13 @@ tcp_usr_shutdown(struct socket *so) socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); + TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } @@ -770,15 +820,28 @@ tcp_usr_rcvd(struct socket *so, int flags) } tp = intotcpcb(inp); TCPDEBUG1(); +#ifdef TCP_RFC7413 + /* + * For passively-created TFO connections, don't attempt a window + * update while still in SYN_RECEIVED as this may trigger an early + * SYN|ACK. 
It is preferable to have the SYN|ACK be sent along with + * application response data, or failing that, when the DELACK timer + * expires. + */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED)) + goto out; +#endif #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); + TCP_PROBE2(debug__user, tp, PRU_RCVD); INP_WUNLOCK(inp); return (error); } @@ -807,14 +870,18 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * this call. */ if (flags & PRUS_EOF) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); - if (m) + /* + * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible + * for freeing memory. + */ + if (m && (flags & PRUS_NOTREADY) == 0) m_freem(m); error = ECONNRESET; goto out; @@ -836,13 +903,12 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { - sbappendstream(&so->so_snd, m); + sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) @@ -864,14 +930,15 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * Close the send side of the connection after * the data is sent. */ - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } - if (!(inp->inp_flags & INP_DROPPED)) { + if (!(inp->inp_flags & INP_DROPPED) && + !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -894,14 +961,13 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, * of data past the urgent section. * Otherwise, snd_up should be one lower. */ - sbappendstream_locked(&so->so_snd, m); + sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and - * initialize maxseg/maxopd using peer's cached - * MSS. + * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) @@ -918,17 +984,48 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } - tp->snd_up = tp->snd_una + so->so_snd.sb_cc; - tp->t_flags |= TF_FORCEDATA; - error = tcp_output(tp); - tp->t_flags &= ~TF_FORCEDATA; + tp->snd_up = tp->snd_una + sbavail(&so->so_snd); + if (!(flags & PRUS_NOTREADY)) { + tp->t_flags |= TF_FORCEDATA; + error = tp->t_fb->tfb_tcp_output(tp); + tp->t_flags &= ~TF_FORCEDATA; + } } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); + TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? 
PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + return (error); +} + +static int +tcp_usr_ready(struct socket *so, struct mbuf *m, int count) +{ + struct inpcb *inp; + struct tcpcb *tp; + int error; + + inp = sotoinpcb(so); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + for (int i = 0; i < count; i++) + m = m_free(m); + return (ECONNRESET); + } + tp = intotcpcb(inp); + + SOCKBUF_LOCK(&so->so_snd); + error = sbready(&so->so_snd, m, count); + SOCKBUF_UNLOCK(&so->so_snd); + if (error == 0) + error = tp->t_fb->tfb_tcp_output(tp); + INP_WUNLOCK(inp); + return (error); } @@ -945,7 +1042,7 @@ tcp_usr_abort(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -959,6 +1056,7 @@ tcp_usr_abort(struct socket *so) TCPDEBUG1(); tcp_drop(tp, ECONNABORTED); TCPDEBUG2(PRU_ABORT); + TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); @@ -967,7 +1065,7 @@ tcp_usr_abort(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -983,7 +1081,7 @@ tcp_usr_close(struct socket *so) inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -998,6 +1096,7 @@ tcp_usr_close(struct socket *so) TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); + TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); @@ -1006,7 +1105,7 @@ tcp_usr_close(struct socket *so) inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } /* @@ -1047,6 +1146,7 @@ tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) out: TCPDEBUG2(PRU_RCVOOB); + TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } @@ -1066,6 +1166,7 @@ struct pr_usrreqs tcp_usrreqs = { .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, + .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, @@ -1088,6 +1189,7 @@ struct pr_usrreqs tcp6_usrreqs = { .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, + .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, @@ -1154,7 +1256,7 @@ tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) soisconnecting(so); TCPSTAT_INC(tcps_connattempt); - tp->t_state = TCPS_SYN_SENT; + tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1170,10 +1272,7 @@ out: static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { - struct inpcb *inp = tp->t_inpcb, *oinp; - struct socket *so = inp->inp_socket; - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; - struct in6_addr addr6; + struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); @@ -1184,39 +1283,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) if (error) goto out; } - - /* - * Cannot simply call in_pcbconnect, because there 
might be an - * earlier incarnation of this same connection still in - * TIME_WAIT state, creating an ADDRINUSE error. - * in6_pcbladdr() also handles scope zone IDs. - * - * XXXRW: We wouldn't need to expose in6_pcblookup_hash_locked() - * outside of in6_pcb.c if there were an in6_pcbconnect_setup(). - */ - error = in6_pcbladdr(inp, nam, &addr6); - if (error) - goto out; - oinp = in6_pcblookup_hash_locked(inp->inp_pcbinfo, - &sin6->sin6_addr, sin6->sin6_port, - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) - ? &addr6 - : &inp->in6p_laddr, - inp->inp_lport, 0, NULL); - if (oinp) { - error = EADDRINUSE; + error = in6_pcbconnect(inp, nam, td->td_ucred); + if (error != 0) goto out; - } - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) - inp->in6p_laddr = addr6; - inp->in6p_faddr = sin6->sin6_addr; - inp->inp_fport = sin6->sin6_port; - /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ - inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; - if (inp->inp_flags & IN6P_AUTOFLOWLABEL) - inp->inp_flow |= - (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ @@ -1224,9 +1293,9 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; - soisconnecting(so); + soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); - tp->t_state = TCPS_SYN_SENT; + tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); @@ -1294,25 +1363,25 @@ tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) * has to revalidate that the connection is still valid for the socket * option. */ -#define INP_WLOCK_RECHECK(inp) do { \ +#define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ + cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) +#define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { - int error, opt, optval; - u_int ui; + int error; struct inpcb *inp; struct tcpcb *tp; - struct tcp_info ti; - char buf[TCP_CA_NAME_MAX]; - struct cc_algo *algo; + struct tcp_function_block *blk; + struct tcp_function_set fsn; error = 0; inp = sotoinpcb(so); @@ -1340,6 +1409,128 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt) INP_WUNLOCK(inp); return (ECONNRESET); } + tp = intotcpcb(inp); + /* + * Protect the TCP option TCP_FUNCTION_BLK so + * that a sub-function can *never* overwrite this. + */ + if ((sopt->sopt_dir == SOPT_SET) && + (sopt->sopt_name == TCP_FUNCTION_BLK)) { + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &fsn, sizeof fsn, + sizeof fsn); + if (error) + return (error); + INP_WLOCK_RECHECK(inp); + blk = find_and_ref_tcp_functions(&fsn); + if (blk == NULL) { + INP_WUNLOCK(inp); + return (ENOENT); + } + if (tp->t_fb == blk) { + /* You already have this */ + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return (0); + } + if (tp->t_state != TCPS_CLOSED) { + int error=EINVAL; + /* + * The user has advanced the state + * past the initial point, we may not + * be able to switch. + */ + if (blk->tfb_tcp_handoff_ok != NULL) { + /* + * Does the stack provide a + * query mechanism, if so it may + * still be possible? 
+ */ + error = (*blk->tfb_tcp_handoff_ok)(tp); + } + if (error) { + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return(error); + } + } + if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { + refcount_release(&blk->tfb_refcnt); + INP_WUNLOCK(inp); + return (ENOENT); + } + /* + * Release the old refcnt, the + * lookup acquired a ref on the + * new one already. + */ + if (tp->t_fb->tfb_tcp_fb_fini) { + /* + * Tell the stack to cleanup with 0 i.e. + * the tcb is not going away. + */ + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + } + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_fb = blk; + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + tcp_offload_ctloutput(tp, sopt->sopt_dir, + sopt->sopt_name); + } +#endif + INP_WUNLOCK(inp); + return (error); + } else if ((sopt->sopt_dir == SOPT_GET) && + (sopt->sopt_name == TCP_FUNCTION_BLK)) { + strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name); + fsn.pcbcnt = tp->t_fb->tfb_refcnt; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &fsn, sizeof fsn); + return (error); + } + /* Pass in the INP locked, called must unlock it */ + return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); +} + +int +tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +{ + int error, opt, optval; + u_int ui; + struct tcp_info ti; + struct cc_algo *algo; + char *pbuf, buf[TCP_CA_NAME_MAX]; + size_t len; + + /* + * For TCP_CCALGOOPT forward the control to CC module, for both + * SOPT_SET and SOPT_GET. + */ + switch (sopt->sopt_name) { + case TCP_CCALGOOPT: + INP_WUNLOCK(inp); + pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); + error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, + sopt->sopt_valsize); + if (error) { + free(pbuf, M_TEMP); + return (error); + } + INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); + else + error = ENOENT; + INP_WUNLOCK(inp); + if (error == 0 && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); + free(pbuf, M_TEMP); + return (error); + } switch (sopt->sopt_dir) { case SOPT_SET: @@ -1408,7 +1599,7 @@ unlock_and_done: else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); } goto unlock_and_done; @@ -1434,50 +1625,45 @@ unlock_and_done: case TCP_CONGESTION: INP_WUNLOCK(inp); - bzero(buf, sizeof(buf)); - error = sooptcopyin(sopt, &buf, sizeof(buf), 1); + error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) break; + buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); + CC_LIST_RLOCK(); + STAILQ_FOREACH(algo, &cc_list, entries) + if (strncmp(buf, algo->name, + TCP_CA_NAME_MAX) == 0) + break; + CC_LIST_RUNLOCK(); + if (algo == NULL) { + INP_WUNLOCK(inp); + error = EINVAL; + break; + } /* - * Return EINVAL if we can't find the requested cc algo. + * We hold a write lock over the tcb so it's safe to + * do these things without ordering concerns. */ - error = EINVAL; - CC_LIST_RLOCK(); - STAILQ_FOREACH(algo, &cc_list, entries) { - if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) - == 0) { - /* We've found the requested algo. */ - error = 0; - /* - * We hold a write lock over the tcb - * so it's safe to do these things - * without ordering concerns. 
- */ - if (CC_ALGO(tp)->cb_destroy != NULL) - CC_ALGO(tp)->cb_destroy(tp->ccv); - CC_ALGO(tp) = algo; - /* - * If something goes pear shaped - * initialising the new algo, - * fall back to newreno (which - * does not require initialisation). - */ - if (algo->cb_init != NULL) - if (algo->cb_init(tp->ccv) > 0) { - CC_ALGO(tp) = &newreno_cc_algo; - /* - * The only reason init - * should fail is - * because of malloc. - */ - error = ENOMEM; - } - break; /* Break the STAILQ_FOREACH. */ - } + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp->ccv); + CC_ALGO(tp) = algo; + /* + * If something goes pear shaped initialising the new + * algo, fall back to newreno (which does not + * require initialisation). + */ + if (algo->cb_init != NULL && + algo->cb_init(tp->ccv) != 0) { + CC_ALGO(tp) = &newreno_cc_algo; + /* + * The only reason init should fail is + * because of malloc. + */ + error = ENOMEM; } - CC_LIST_RUNLOCK(); - goto unlock_and_done; + INP_WUNLOCK(inp); + break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: @@ -1535,8 +1721,49 @@ unlock_and_done: (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); + goto unlock_and_done; + +#ifdef TCPPCAP + case TCP_PCAP_OUT: + case TCP_PCAP_IN: INP_WUNLOCK(inp); - break; + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval >= 0) + tcp_pcap_set_sock_max(TCP_PCAP_OUT ? + &(tp->t_outpkts) : &(tp->t_inpkts), + optval); + else + error = EINVAL; + goto unlock_and_done; +#endif + +#ifdef TCP_RFC7413 + case TCP_FASTOPEN: + INP_WUNLOCK(inp); + if (!V_tcp_fastopen_enabled) + return (EPERM); + + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval) { + tp->t_flags |= TF_FASTOPEN; + if ((tp->t_state == TCPS_LISTEN) && + (tp->t_tfo_pending == NULL)) + tp->t_tfo_pending = + tcp_fastopen_alloc_counter(); + } else + tp->t_flags &= ~TF_FASTOPEN; + goto unlock_and_done; +#endif default: INP_WUNLOCK(inp); @@ -1582,11 +1809,48 @@ unlock_and_done: error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_CONGESTION: - bzero(buf, sizeof(buf)); - strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, buf, len + 1); + break; + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPINIT: + case TCP_KEEPCNT: + switch (sopt->sopt_name) { + case TCP_KEEPIDLE: + ui = tp->t_keepidle / hz; + break; + case TCP_KEEPINTVL: + ui = tp->t_keepintvl / hz; + break; + case TCP_KEEPINIT: + ui = tp->t_keepinit / hz; + break; + case TCP_KEEPCNT: + ui = tp->t_keepcnt; + break; + } + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &ui, sizeof(ui)); + break; +#ifdef TCPPCAP + case TCP_PCAP_OUT: + case TCP_PCAP_IN: + optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? 
+ &(tp->t_outpkts) : &(tp->t_inpkts)); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; +#endif + +#ifdef TCP_RFC7413 + case TCP_FASTOPEN: + optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); - error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX); + error = sooptcopyout(sopt, &optval, sizeof optval); break; +#endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -1597,6 +1861,7 @@ unlock_and_done: return (error); } #undef INP_WLOCK_RECHECK +#undef INP_WLOCK_RECHECK_CLEANUP /* * Attach TCP protocol to socket, allocating @@ -1617,10 +1882,10 @@ tcp_attach(struct socket *so) } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } inp = sotoinpcb(so); @@ -1636,12 +1901,13 @@ tcp_attach(struct socket *so) if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); + TCPSTATES_INC(TCPS_CLOSED); return (0); } @@ -1659,7 +1925,7 @@ tcp_disconnect(struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* @@ -1679,7 +1945,7 @@ tcp_disconnect(struct tcpcb *tp) sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } } @@ -1697,7 +1963,7 @@ static void tcp_usrclosed(struct tcpcb *tp) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { @@ -1705,9 +1971,9 @@ tcp_usrclosed(struct tcpcb *tp) #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif + tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: - tp->t_state = TCPS_CLOSED; tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is @@ -1723,11 +1989,11 @@ tcp_usrclosed(struct tcpcb *tp) break; case TCPS_ESTABLISHED: - tp->t_state = TCPS_FIN_WAIT_1; + tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: - tp->t_state = TCPS_LAST_ACK; + tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { @@ -1910,6 +2176,10 @@ db_print_tflags(u_int t_flags) db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_FASTOPEN) { + db_printf("%sTF_FASTOPEN", comma ? ", " : ""); + comma = 1; + } } static void @@ -1984,8 +2254,8 @@ db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); - db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", - tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); + db_printf("t_rcvtime: %u t_startime: %u\n", + tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h index dbd9ed11..5dcd35b8 100644 --- a/freebsd/sys/netinet/tcp_var.h +++ b/freebsd/sys/netinet/tcp_var.h @@ -34,9 +34,11 @@ #define _NETINET_TCP_VAR_H_ #include +#include #ifdef _KERNEL #include +#include /* * Kernel variables for tcp. 
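Note on the TCP_FUNCTION_BLK plumbing (tcp_usrreq.c above, tcp_var.h below): tcp_ctloutput() now handles TCP_FUNCTION_BLK itself before handing every other option to the per-stack tfb_tcp_ctloutput(), so a loadable stack can never intercept its own replacement. From userland the selection looks roughly like the sketch below; struct tcp_function_set, its function_set_name/pcbcnt fields, and the ENOENT/EINVAL failure modes are taken from the hunks above, while the surrounding socket setup is assumed:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <string.h>

	static int
	select_tcp_stack(int s, const char *name)
	{
		struct tcp_function_set fsn;

		memset(&fsn, 0, sizeof(fsn));
		strlcpy(fsn.function_set_name, name,
		    sizeof(fsn.function_set_name));
		/*
		 * ENOENT: no stack by that name (or it is being removed);
		 * EINVAL: the connection has advanced past the point where
		 * the new stack's tfb_tcp_handoff_ok() will accept it.
		 */
		return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
		    &fsn, sizeof(fsn)));
	}
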
@@ -73,7 +75,12 @@ struct sackhint { tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ - uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ + int sacked_bytes; /* + * Total sacked bytes reported by the + * receiver via sack option + */ + uint32_t _pad1[1]; /* TBD */ + uint64_t _pad[1]; /* TBD */ }; struct tcptemp { @@ -83,17 +90,75 @@ struct tcptemp { #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ -/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ -#ifdef INET6 -#define ND6_HINT(tp) \ -do { \ - if ((tp) && (tp)->t_inpcb && \ - ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ - nd6_nud_hint(NULL, NULL, 0); \ -} while (0) -#else -#define ND6_HINT(tp) -#endif +/* + * TODO: We yet need to brave plowing in + * to tcp_input() and the pru_usrreq() block. + * Right now these go to the old standards which + * are somewhat ok, but in the long term may + * need to be changed. If we do tackle tcp_input() + * then we need to get rid of the tcp_do_segment() + * function below. + */ +/* Flags for tcp functions */ +#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ +struct tcpcb; +struct inpcb; +struct sockopt; +struct socket; + +/* + * If defining the optional tcp_timers, in the + * tfb_tcp_timer_stop call you must use the + * callout_async_drain() function with the + * tcp_timer_discard callback. You should check + * the return of callout_async_drain() and if 0 + * increment tt_draincnt. Since the timer sub-system + * does not know your callbacks you must provide a + * stop_all function that loops through and calls + * tcp_timer_stop() with each of your defined timers. + * Adding a tfb_tcp_handoff_ok function allows the socket + * option to change stacks to query you even if the + * connection is in a later stage. You return 0 to + * say you can take over and run your stack, you return + * non-zero (an error number) to say no you can't. + * If the function is undefined you can only change + * in the early states (before connect or listen). + * tfb_tcp_fb_fini is changed to add a flag to tell + * the old stack if the tcb is being destroyed or + * not. A one in the flag means the TCB is being + * destroyed, a zero indicates its transitioning to + * another stack (via socket option). 
+ */ +struct tcp_function_block { + char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; + int (*tfb_tcp_output)(struct tcpcb *); + void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int); + int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp); + /* Optional memory allocation/free routine */ + void (*tfb_tcp_fb_init)(struct tcpcb *); + void (*tfb_tcp_fb_fini)(struct tcpcb *, int); + /* Optional timers, must define all if you define one */ + int (*tfb_tcp_timer_stop_all)(struct tcpcb *); + void (*tfb_tcp_timer_activate)(struct tcpcb *, + uint32_t, u_int); + int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); + void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); + void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); + int (*tfb_tcp_handoff_ok)(struct tcpcb *); + volatile uint32_t tfb_refcnt; + uint32_t tfb_flags; +}; + +struct tcp_function { + TAILQ_ENTRY(tcp_function) tf_next; + struct tcp_function_block *tf_fb; +}; + +TAILQ_HEAD(tcp_funchead, tcp_function); /* * Tcp control block, one per tcp; fields: @@ -113,7 +178,7 @@ struct tcpcb { struct vnet *t_vnet; /* back pointer to parent vnet */ - tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ @@ -140,8 +205,6 @@ struct tcpcb { u_long snd_spare2; /* unused */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ - u_int t_maxopd; /* mss plus options */ - u_int t_rcvtime; /* inactivity time */ u_int t_starttime; /* time connection was established */ u_int t_rtttime; /* RTT measurement start time */ @@ -152,6 +215,7 @@ struct tcpcb { int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ + u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ @@ -208,13 +272,35 @@ struct tcpcb { u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ - u_int t_tsomax; /* tso burst length limit */ - - uint32_t t_ispare[7]; /* 5 UTO, 2 TBD */ - void *t_pspare2[4]; /* 4 TBD */ - uint64_t _pad[5]; /* 5 TBD (1-2 CC/RTT?) */ - uint32_t t_tsomaxsegcount; /* TSO maximum segment count */ - uint32_t t_tsomaxsegsize; /* TSO maximum segment size in bytes */ + u_int t_tsomax; /* TSO total burst length limit in bytes */ + u_int t_tsomaxsegcount; /* TSO maximum segment count */ + u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ + u_int t_flags2; /* More tcpcb flags storage */ +#if defined(_KERNEL) && defined(TCP_RFC7413) + uint32_t t_ispare[6]; /* 5 UTO, 1 TBD */ + uint64_t t_tfo_cookie; /* TCP Fast Open cookie */ +#else + uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */ +#endif + struct tcp_function_block *t_fb;/* TCP function call block */ + void *t_fb_ptr; /* Pointer to t_fb specific data */ +#if defined(_KERNEL) && defined(TCP_RFC7413) + unsigned int *t_tfo_pending; /* TCP Fast Open pending counter */ + void *t_pspare2[1]; /* 1 TCP_SIGNATURE */ +#else + void *t_pspare2[2]; /* 1 TCP_SIGNATURE, 1 TBD */ +#endif +#if defined(_KERNEL) && defined(TCPPCAP) + struct mbufq t_inpkts; /* List of saved input packets. */ + struct mbufq t_outpkts; /* List of saved output packets. */ +#ifdef _LP64 + uint64_t _pad[0]; /* all used! 
*/ +#else + uint64_t _pad[2]; /* 2 are available */ +#endif /* _LP64 */ +#else + uint64_t _pad[6]; +#endif /* defined(_KERNEL) && defined(TCPPCAP) */ }; /* @@ -249,6 +335,7 @@ struct tcpcb { #define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ +#define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY @@ -285,6 +372,13 @@ struct tcpcb { #define TCP_SIG_SPI 0x1000 #endif /* TCP_SIGNATURE */ +/* + * Flags for PLPMTU handling, t_flags2 + */ +#define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ +#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ +#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ + /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. @@ -294,21 +388,24 @@ struct tcpcb { * options in tcp_addoptions. */ struct tcpopt { - u_int64_t to_flags; /* which options are present */ + u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ -#define TOF_MAXOPT 0x0100 +#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ +#define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ + u_char *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ + u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; @@ -322,7 +419,6 @@ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ - u_long rmx_bandwidth; /* estimated bandwidth */ u_long rmx_cwnd; /* congestion window */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ @@ -357,6 +453,8 @@ struct tcptw { u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; + void *tw_pspare; /* TCP_SIGNATURE */ + u_int *tw_spare; /* TCP_SIGNATURE */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) @@ -404,125 +502,133 @@ struct tcptw { * but that's inconvenient at the moment. */ struct tcpstat { - u_long tcps_connattempt; /* connections initiated */ - u_long tcps_accepts; /* connections accepted */ - u_long tcps_connects; /* connections established */ - u_long tcps_drops; /* connections dropped */ - u_long tcps_conndrops; /* embryonic connections dropped */ - u_long tcps_minmssdrops; /* average minmss too low drops */ - u_long tcps_closed; /* conn. closed (includes drops) */ - u_long tcps_segstimed; /* segs where we tried to get rtt */ - u_long tcps_rttupdated; /* times we succeeded */ - u_long tcps_delack; /* delayed acks sent */ - u_long tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ - u_long tcps_rexmttimeo; /* retransmit timeouts */ - u_long tcps_persisttimeo; /* persist timeouts */ - u_long tcps_keeptimeo; /* keepalive timeouts */ - u_long tcps_keepprobe; /* keepalive probes sent */ - u_long tcps_keepdrops; /* connections dropped in keepalive */ - - u_long tcps_sndtotal; /* total packets sent */ - u_long tcps_sndpack; /* data packets sent */ - u_long tcps_sndbyte; /* data bytes sent */ - u_long tcps_sndrexmitpack; /* data packets retransmitted */ - u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ - u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */ - u_long tcps_sndacks; /* ack-only packets sent */ - u_long tcps_sndprobe; /* window probes sent */ - u_long tcps_sndurg; /* packets sent with URG only */ - u_long tcps_sndwinup; /* window update-only packets sent */ - u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ - - u_long tcps_rcvtotal; /* total packets received */ - u_long tcps_rcvpack; /* packets received in sequence */ - u_long tcps_rcvbyte; /* bytes received in sequence */ - u_long tcps_rcvbadsum; /* packets received with ccksum errs */ - u_long tcps_rcvbadoff; /* packets received with bad offset */ - u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ - u_long tcps_rcvshort; /* packets received too short */ - u_long tcps_rcvduppack; /* duplicate-only packets received */ - u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ - u_long tcps_rcvpartduppack; /* packets with some duplicate data */ - u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ - u_long tcps_rcvoopack; /* out-of-order packets received */ - u_long tcps_rcvoobyte; /* out-of-order bytes received */ - u_long tcps_rcvpackafterwin; /* packets with data after window */ - u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ - u_long tcps_rcvafterclose; /* packets rcvd after "close" */ - u_long tcps_rcvwinprobe; /* rcvd window probe packets */ - u_long tcps_rcvdupack; /* rcvd duplicate acks */ - u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ - u_long tcps_rcvackpack; /* rcvd ack packets */ - u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ - u_long tcps_rcvwinupd; /* rcvd window update packets */ - u_long tcps_pawsdrop; /* segments dropped due to PAWS */ - u_long tcps_predack; /* times hdr predict ok for acks */ - u_long tcps_preddat; /* times hdr predict ok for data pkts */ - u_long tcps_pcbcachemiss; - u_long tcps_cachedrtt; /* times cached RTT in route updated */ - u_long tcps_cachedrttvar; /* times cached rttvar updated */ - u_long tcps_cachedssthresh; /* times cached ssthresh updated */ - u_long tcps_usedrtt; /* times RTT initialized from route */ - u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ - u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ - u_long tcps_persistdrop; /* timeout in persist state */ - u_long tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ - u_long tcps_mturesent; /* resends due to MTU discovery */ - u_long tcps_listendrop; /* listen queue overflows */ - u_long tcps_badrst; /* ignored RSTs in the window */ - - u_long tcps_sc_added; /* entry added to syncache */ - u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */ - u_long tcps_sc_dupsyn; /* duplicate SYN packet */ - u_long tcps_sc_dropped; /* could not reply to packet */ - u_long tcps_sc_completed; /* successful extraction of entry */ - u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ - u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */ - u_long tcps_sc_reset; /* RST removed entry from syncache */ - u_long tcps_sc_stale; /* timed out or listen socket gone */ - u_long tcps_sc_aborted; /* syncache entry aborted */ - u_long tcps_sc_badack; /* removed due to bad ACK */ - u_long tcps_sc_unreach; /* ICMP unreachable received */ - u_long tcps_sc_zonefail; /* zalloc() failed */ - u_long tcps_sc_sendcookie; /* SYN cookie sent */ - u_long tcps_sc_recvcookie; /* SYN cookie received */ - - u_long tcps_hc_added; /* entry added to hostcache */ - u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ - - u_long tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ + uint64_t tcps_connattempt; /* connections initiated */ + uint64_t tcps_accepts; /* connections accepted */ + uint64_t tcps_connects; /* connections established */ + uint64_t tcps_drops; /* connections dropped */ + uint64_t tcps_conndrops; /* embryonic connections dropped */ + uint64_t tcps_minmssdrops; /* average minmss too low drops */ + uint64_t tcps_closed; /* conn. closed (includes drops) */ + uint64_t tcps_segstimed; /* segs where we tried to get rtt */ + uint64_t tcps_rttupdated; /* times we succeeded */ + uint64_t tcps_delack; /* delayed acks sent */ + uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ + uint64_t tcps_rexmttimeo; /* retransmit timeouts */ + uint64_t tcps_persisttimeo; /* persist timeouts */ + uint64_t tcps_keeptimeo; /* keepalive timeouts */ + uint64_t tcps_keepprobe; /* keepalive probes sent */ + uint64_t tcps_keepdrops; /* connections dropped in keepalive */ + + uint64_t tcps_sndtotal; /* total packets sent */ + uint64_t tcps_sndpack; /* data packets sent */ + uint64_t tcps_sndbyte; /* data bytes sent */ + uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ + uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ + uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ + uint64_t tcps_sndacks; /* ack-only packets sent */ + uint64_t tcps_sndprobe; /* window probes sent */ + uint64_t tcps_sndurg; /* packets sent with URG only */ + uint64_t tcps_sndwinup; /* window update-only packets sent */ + uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + uint64_t tcps_rcvtotal; /* total packets received */ + uint64_t tcps_rcvpack; /* packets received in sequence */ + uint64_t tcps_rcvbyte; /* bytes received in sequence */ + uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ + uint64_t tcps_rcvbadoff; /* packets received with bad offset */ + uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ + uint64_t tcps_rcvshort; /* packets received too short */ + uint64_t tcps_rcvduppack; /* duplicate-only packets received */ + uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ + uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ + uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ + uint64_t tcps_rcvoopack; /* out-of-order packets received */ + uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ + uint64_t tcps_rcvpackafterwin; /* packets with data after window */ + uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ + uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ + uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ + uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ + uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + uint64_t tcps_rcvackpack; /* rcvd ack packets */ + uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ + uint64_t tcps_rcvwinupd; /* rcvd window update packets */ + uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ + uint64_t tcps_predack; /* times hdr predict ok for acks */ + uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ + uint64_t tcps_pcbcachemiss; + uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ + uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ + uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ + uint64_t tcps_usedrtt; /* times RTT initialized from route */ + uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ + uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ + uint64_t tcps_persistdrop; /* timeout in persist state */ + uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ + uint64_t tcps_mturesent; /* resends due to MTU discovery */ + uint64_t tcps_listendrop; /* listen queue overflows */ + uint64_t tcps_badrst; /* ignored RSTs in the window */ + + uint64_t tcps_sc_added; /* entry added to syncache */ + uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ + uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ + uint64_t tcps_sc_dropped; /* could not reply to packet */ + uint64_t tcps_sc_completed; /* successful extraction of entry */ + uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ + uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ + uint64_t tcps_sc_reset; /* RST removed entry from syncache */ + uint64_t tcps_sc_stale; /* timed out or listen socket gone */ + uint64_t tcps_sc_aborted; /* syncache entry aborted */ + uint64_t tcps_sc_badack; /* removed due to bad ACK */ + uint64_t tcps_sc_unreach; /* ICMP unreachable received */ + uint64_t tcps_sc_zonefail; /* zalloc() failed */ + uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ + uint64_t tcps_sc_recvcookie; /* SYN cookie received */ + + uint64_t tcps_hc_added; /* entry added to hostcache */ + uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ + + uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ - u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ - u_long tcps_sack_rexmits; /* SACK rexmit segments */ - u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ - u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ - u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ - u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ + uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ + uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ + uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ + uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ - u_long tcps_ecn_ce; /* 
ECN Congestion Experienced */ - u_long tcps_ecn_ect0; /* ECN Capable Transport */ - u_long tcps_ecn_ect1; /* ECN Capable Transport */ - u_long tcps_ecn_shs; /* ECN successful handshakes */ - u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ + uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ + uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ + uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ + uint64_t tcps_ecn_shs; /* ECN successful handshakes */ + uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ - u_long tcps_sig_rcvgoodsig; /* Total matching signature received */ - u_long tcps_sig_rcvbadsig; /* Total bad signature received */ - u_long tcps_sig_err_buildsig; /* Mismatching signature received */ - u_long tcps_sig_err_sigopt; /* No signature expected by socket */ - u_long tcps_sig_err_nosigopt; /* No signature provided by segment */ + uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ + uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ + uint64_t tcps_sig_err_buildsig; /* Mismatching signature received */ + uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ + uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ - u_long _pad[12]; /* 6 UTO, 6 TBD */ + uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; +#define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ + #ifdef _KERNEL +#define TI_UNLOCKED 1 +#define TI_RLOCKED 2 +#include + +VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define TCPSTAT_ADD(name, val) V_tcpstat.name += (val) +#define TCPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* @@ -530,7 +636,15 @@ struct tcpstat { */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ - kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long)) + kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) + +/* + * Running TCP connection count by state. + */ +VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); +#define V_tcps_states VNET(tcps_states) +#define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) +#define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. 
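Note on the per-state counters above: the tcp_usrreq.c hunks replace direct "tp->t_state = ..." stores with tcp_state_change() so that V_tcps_states[] stays balanced on every transition. A minimal sketch of the pattern those call sites imply is shown below; the DTrace probe is an assumption, not shown in these hunks:

	void
	tcp_state_change(struct tcpcb *tp, int newstate)
	{
		int pstate = tp->t_state;

		TCPSTATES_DEC(pstate);
		TCPSTATES_INC(newstate);
		tp->t_state = newstate;
		/* Assumed probe; mirrors the TCP_PROBE2() style above. */
		TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
	}

tcp_attach() above is the one place that still stores TCPS_CLOSED directly and pairs it with an explicit TCPSTATES_INC(TCPS_CLOSED), since a freshly allocated tcb has no previous state to decrement.
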
@@ -574,11 +688,11 @@ struct xtcpcb { #endif /* - * Names for TCP sysctl objects + * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ -#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ @@ -590,26 +704,7 @@ struct xtcpcb { #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ -#define TCPCTL_MAXID 16 -#define TCPCTL_FINWAIT2_TIMEOUT 17 - -#define TCPCTL_NAMES { \ - { 0, 0 }, \ - { "rfc1323", CTLTYPE_INT }, \ - { "mssdflt", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ - { "rttdflt", CTLTYPE_INT }, \ - { "keepidle", CTLTYPE_INT }, \ - { "keepintvl", CTLTYPE_INT }, \ - { "sendspace", CTLTYPE_INT }, \ - { "recvspace", CTLTYPE_INT }, \ - { "keepinit", CTLTYPE_INT }, \ - { "pcblist", CTLTYPE_STRUCT }, \ - { "delacktime", CTLTYPE_INT }, \ - { "v6mssdflt", CTLTYPE_INT }, \ - { "maxid", CTLTYPE_INT }, \ -} - +#define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL @@ -620,13 +715,12 @@ MALLOC_DECLARE(M_TCPLOG); VNET_DECLARE(struct inpcbhead, tcb); /* queue of active tcpcb's */ VNET_DECLARE(struct inpcbinfo, tcbinfo); -VNET_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ extern int tcp_log_in_vain; VNET_DECLARE(int, tcp_mssdflt); /* XXX */ VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_rfc3390); -VNET_DECLARE(int, tcp_do_initcwnd10); +VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, path_mtu_discovery); @@ -634,12 +728,11 @@ VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_abc_l_var); #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) -#define V_tcpstat VNET(tcpstat) #define V_tcp_mssdflt VNET(tcp_mssdflt) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) -#define V_tcp_do_initcwnd10 VNET(tcp_do_initcwnd10) +#define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_recvspace VNET(tcp_recvspace) #define V_path_mtu_discovery VNET(path_mtu_discovery) @@ -659,50 +752,69 @@ VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) +VNET_DECLARE(int, tcp_do_rfc6675_pipe); +#define V_tcp_do_rfc6675_pipe VNET(tcp_do_rfc6675_pipe) + int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); void tcp_twstart(struct tcpcb *); -#if 0 -int tcp_twrecycleable(struct tcptw *tw); -#endif -void tcp_twclose(struct tcptw *_tw, int _reuse); +void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_init(void); -#ifdef VIMAGE -void tcp_destroy(void); -#endif void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const 
void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); -void tcp_reass_init(void); +void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); -#ifdef VIMAGE -void tcp_reass_destroy(void); -#endif -void tcp_input(struct mbuf *, int); +void tcp_dooptions(struct tcpopt *, u_char *, int, int); +void tcp_dropwithreset(struct mbuf *, struct tcphdr *, + struct tcpcb *, int, int); +void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +void tcp_xmit_timer(struct tcpcb *, int); +void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); +void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, + uint16_t type); +void cc_conn_init(struct tcpcb *tp); +void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); +void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); +void hhook_run_tcp_est_in(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to); + +int tcp_input(struct mbuf **, int *, int); +void tcp_do_segment(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, int, int, uint8_t, + int); + +int register_tcp_functions(struct tcp_function_block *blk, int wait); +int deregister_tcp_functions(struct tcp_function_block *blk); +struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); +struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk); +int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); + u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); +u_int tcp_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); -struct inpcb * - tcp_mtudisc(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_output(struct tcpcb *); +void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); @@ -712,19 +824,25 @@ void tcp_tw_destroy(void); void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); -int tcp_twrespond(struct tcptw *, int); void tcp_setpersist(struct tcpcb *); #ifdef TCP_SIGNATURE +struct secasvar; +struct secasvar *tcp_get_sav(struct mbuf *, u_int); +int tcp_signature_do_compute(struct mbuf *, int, int, u_char *, + struct secasvar *); int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); int tcp_signature_verify(struct mbuf *, int, int, int, struct tcpopt *, struct tcphdr *, u_int); +int tcp_signature_check(struct mbuf *m, int off0, int tlen, int optlen, + struct tcpopt *to, struct tcphdr *th, u_int tcpbflag); #endif void tcp_slowtimo(void); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); -void tcp_timer_activate(struct tcpcb *, int, u_int); -int tcp_timer_active(struct tcpcb *, int); +void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); +int tcp_timer_active(struct tcpcb *, uint32_t); +void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) @@ -741,7 +859,7 @@ void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct 
pr_usrreqs tcp_usrreqs; tcp_seq tcp_new_isn(struct tcpcb *); -void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); +int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); @@ -750,9 +868,29 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long ); +int tcp_compute_pipe(struct tcpcb *); -void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); +static inline void +tcp_fields_to_host(struct tcphdr *th) +{ + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); +} + +#ifdef TCP_SIGNATURE +static inline void +tcp_fields_to_net(struct tcphdr *th) +{ + + th->th_seq = htonl(th->th_seq); + th->th_ack = htonl(th->th_ack); + th->th_win = htons(th->th_win); + th->th_urp = htons(th->th_urp); +} +#endif #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c index bf95e954..7eb11648 100644 --- a/freebsd/sys/netinet/udp_usrreq.c +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -5,6 +5,7 @@ * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. + * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under @@ -44,10 +45,10 @@ #include __FBSDID("$FreeBSD$"); -#include #include #include #include +#include #include #include @@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -71,9 +73,12 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include +#include #include +#include #include #include #include @@ -90,6 +95,8 @@ __FBSDID("$FreeBSD$"); #endif #include #include +#include +#include #ifdef IPSEC #include @@ -101,8 +108,9 @@ __FBSDID("$FreeBSD$"); #include /* - * UDP protocol implementation. + * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. + * Per RFC 3828, July, 2004. */ /* @@ -112,7 +120,7 @@ __FBSDID("$FreeBSD$"); * cause problems (especially for NFS data blocks). 
*/ VNET_DEFINE(int, udp_cksum) = 1; -SYSCTL_VNET_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); int udp_log_in_vain = 0; @@ -120,12 +128,17 @@ SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, &udp_log_in_vain, 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; -SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); +static VNET_DEFINE(int, udp_require_l2_bcast) = 0; +#define V_udp_require_l2_bcast VNET(udp_require_l2_bcast) +SYSCTL_INT(_net_inet_udp, OID_AUTO, require_l2_bcast, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(udp_require_l2_bcast), 0, + "Only treat packets sent to an L2 broadcast address as broadcast packets"); + u_long udp_sendspace = 9216; /* really max datagram size */ - /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); @@ -135,13 +148,15 @@ u_long udp_recvspace = 40 * (1024 + #else sizeof(struct sockaddr_in) #endif - ); + ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); +VNET_DEFINE(struct inpcbhead, ulitecb); +VNET_DEFINE(struct inpcbinfo, ulitecbinfo); static VNET_DEFINE(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) @@ -149,11 +164,14 @@ static VNET_DEFINE(uma_zone_t, udpcb_zone); #define UDBHASHSIZE 128 #endif -VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ -SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, - &VNET_NAME(udpstat), udpstat, - "UDP statistics (struct udpstat, netinet/udp_var.h)"); +VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ +VNET_PCPUSTAT_SYSINIT(udpstat); +SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, + udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +#ifdef VIMAGE +VNET_PCPUSTAT_SYSUNINIT(udpstat); +#endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, @@ -187,20 +205,47 @@ udp_inpcb_init(void *mem, int size, int flags) return (0); } +static int +udplite_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp; + + inp = mem; + INP_LOCK_INIT(inp, "inp", "udpliteinp"); + return (0); +} + void udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. 
+ */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, + "udp_inpcb", udp_inpcb_init, NULL, 0, IPI_HASHFIELDS_2TUPLE); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); + uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached"); EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, EVENTHANDLER_PRI_ANY); } +void +udplite_init(void) +{ + + in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, + UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, NULL, + 0, IPI_HASHFIELDS_2TUPLE); +} + /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the @@ -212,7 +257,7 @@ void kmod_udpstat_inc(int statnum) { - (*((u_long *)&V_udpstat + statnum))++; + counter_u64_add(VNET(udpstat)[statnum], 1); } int @@ -235,13 +280,23 @@ udp_discardcb(struct udpcb *up) } #ifdef VIMAGE -void -udp_destroy(void) +static void +udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); uma_zdestroy(V_udpcb_zone); } +VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); + +static void +udplite_destroy(void *unused __unused) +{ + + in_pcbinfo_destroy(&V_ulitecbinfo); +} +VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy, + NULL); #endif #ifdef INET @@ -251,14 +306,23 @@ udp_destroy(void) * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. + * + * In the normal case udp_append() will return 0, indicating that you + * must unlock the inp. However if a tunneling protocol is in place we increment + * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we + * then decrement the reference count. If the inp_rele returns 1, indicating the + * inp is gone, we return that to the caller to tell them *not* to unlock + * the inp. In the case of multi-cast this will cause the distribution + * to stop (though most tunneling protocols known currently do *not* use + * multicast). */ -static void +static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; - struct mbuf *opts = 0; + struct mbuf *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif @@ -271,21 +335,21 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { - (*up->u_tun_func)(n, off, inp); - return; + in_pcbref(inp); + INP_RUNLOCK(inp); + (*up->u_tun_func)(n, off, inp, (struct sockaddr *)udp_in, + up->u_tun_ctx); + INP_RLOCK(inp); + return (in_pcbrele_rlocked(inp)); } - if (n == NULL) - return; - off += sizeof(struct udphdr); #ifdef IPSEC /* Check AH/ESP integrity. */ if (ipsec4_in_reject(n, inp)) { m_freem(n); - IPSECSTAT_INC(in_polvio); - return; + return (0); } #ifdef IPSEC_NAT_T up = intoudpcb(inp); @@ -293,14 +357,14 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ n = udp4_espdecap(inp, n, off); if (n == NULL) /* Consumed. 
*/ - return; + return (0); } #endif /* IPSEC_NAT_T */ #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); - return; + return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || @@ -334,22 +398,28 @@ udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); + return (0); } -void -udp_input(struct mbuf *m, int off) +int +udp_input(struct mbuf **mp, int *offp, int proto) { - int iphlen = off; struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; - int len; + uint16_t len, ip_len; + struct inpcbinfo *pcbinfo; struct ip save_ip; struct sockaddr_in udp_in; + struct mbuf *m; struct m_tag *fwd_tag; + int cscov_partial, iphlen; + m = *mp; + iphlen = *offp; ifp = m->m_pkthdr.rcvif; + *mp = NULL; UDPSTAT_INC(udps_ipackets); /* @@ -358,7 +428,7 @@ udp_input(struct mbuf *m, int off) * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { - ip_stripoptions(m, (struct mbuf *)0); + ip_stripoptions(m); iphlen = sizeof(struct ip); } @@ -367,13 +437,14 @@ udp_input(struct mbuf *m, int off) */ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct udphdr)) { - if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); - return; + return (IPPROTO_DONE); } ip = mtod(m, struct ip *); } uh = (struct udphdr *)((caddr_t)ip + iphlen); + cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. @@ -396,13 +467,20 @@ udp_input(struct mbuf *m, int off) * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); - if (ip->ip_len != len) { - if (len > ip->ip_len || len < sizeof(struct udphdr)) { + ip_len = ntohs(ip->ip_len) - iphlen; + if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { + /* Zero means checksum over the complete packet. */ + if (len == 0) + len = ip_len; + cscov_partial = 0; + } + if (ip_len != len) { + if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } - m_adj(m, len - ip->ip_len); - /* ip->ip_len = len; */ + if (proto == IPPROTO_UDP) + m_adj(m, len - ip_len); } /* @@ -420,39 +498,53 @@ udp_input(struct mbuf *m, int off) if (uh->uh_sum) { u_short uh_sum; - if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && + !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + - m->m_pkthdr.csum_data + IPPROTO_UDP)); + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[9]; bcopy(((struct ipovly *)ip)->ih_x1, b, 9); bzero(((struct ipovly *)ip)->ih_x1, 9); - ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + ((struct ipovly *)ip)->ih_len = (proto == IPPROTO_UDP) ? + uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); - return; + return (IPPROTO_DONE); } - } else - UDPSTAT_INC(udps_nosum); + } else { + if (proto == IPPROTO_UDP) { + UDPSTAT_INC(udps_nosum); + } else { + /* UDPLite requires a checksum */ + /* XXX: What is the right UDPLite MIB counter here? 
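
[Illustrative aside, not part of the patch.] The verification above either trusts a hardware-validated CSUM_DATA_VALID result or computes the RFC 768 checksum over a pseudo-header plus the datagram. A self-contained restatement of that arithmetic in portable C (a sketch; the kernel's in_pseudo()/in_cksum() work over mbuf chains instead):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Fold the 32-bit accumulator and complement (one's-complement sum). */
static uint16_t
cksum_finish(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);
    return ((uint16_t)~sum);
}

static uint32_t
cksum_add(uint32_t sum, const uint8_t *p, size_t len)
{
    while (len > 1) {
        sum += (uint32_t)p[0] << 8 | p[1];
        p += 2;
        len -= 2;
    }
    if (len)                        /* odd trailing byte, zero-padded */
        sum += (uint32_t)p[0] << 8;
    return (sum);
}

/*
 * RFC 768 checksum: pseudo-header (src, dst, zero, proto, length) plus
 * the datagram. As in the hunk above, for UDP-Lite the pseudo-header
 * length is the full IP payload length even when only part of the
 * datagram is covered. A computed 0 goes on the wire as 0xffff, since
 * 0 means "no checksum" for plain UDP.
 */
static uint16_t
udp_cksum(uint32_t src, uint32_t dst, uint8_t proto, const uint8_t *udp,
    uint16_t udplen)
{
    uint8_t ph[12];

    ph[0] = src >> 24; ph[1] = src >> 16; ph[2] = src >> 8; ph[3] = src;
    ph[4] = dst >> 24; ph[5] = dst >> 16; ph[6] = dst >> 8; ph[7] = dst;
    ph[8] = 0;         ph[9] = proto;
    ph[10] = udplen >> 8; ph[11] = udplen & 0xff;
    return (cksum_finish(cksum_add(cksum_add(0, ph, 12), udp, udplen)));
}

int
main(void)
{
    /* 8-byte UDP header (sport 68, dport 67, len 9, sum 0) + 'a' */
    uint8_t d[9] = { 0x00, 0x44, 0x00, 0x43, 0x00, 0x09, 0x00, 0x00, 0x61 };

    printf("checksum: 0x%04x\n",
        udp_cksum(0xc0a80001, 0xc0a80002, 17, d, sizeof(d)));
    return (0);
}
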
*/ + m_freem(m); + return (IPPROTO_DONE); + } + } + pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) { + ((!V_udp_require_l2_bcast || m->m_flags & M_BCAST) && + in_broadcast(ip->ip_dst, ifp))) { struct inpcb *last; + struct inpcbhead *pcblist; struct ip_moptions *imo; - INP_INFO_RLOCK(&V_udbinfo); + INP_INFO_RLOCK(pcbinfo); + pcblist = udp_get_pcblist(proto); last = NULL; - LIST_FOREACH(inp, &V_udb, inp_list) { + LIST_FOREACH(inp, pcblist, inp_list) { if (inp->inp_lport != uh->uh_dport) continue; #ifdef INET6 @@ -511,8 +603,14 @@ udp_input(struct mbuf *m, int off) if (last != NULL) { struct mbuf *n; - n = m_copy(m, 0, M_COPYALL); - udp_append(last, ip, n, iphlen, &udp_in); + if ((n = m_copy(m, 0, M_COPYALL)) != NULL) { + UDP_PROBE(receive, NULL, last, ip, + last, uh); + if (udp_append(last, ip, n, iphlen, + &udp_in)) { + goto inp_lost; + } + } INP_RUNLOCK(last); } last = inp; @@ -538,13 +636,15 @@ udp_input(struct mbuf *m, int off) UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); - INP_INFO_RUNLOCK(&V_udbinfo); + INP_INFO_RUNLOCK(pcbinfo); goto badunlocked; } - udp_append(last, ip, m, iphlen, &udp_in); - INP_RUNLOCK(last); - INP_INFO_RUNLOCK(&V_udbinfo); - return; + UDP_PROBE(receive, NULL, last, ip, last, uh); + if (udp_append(last, ip, m, iphlen, &udp_in) == 0) + INP_RUNLOCK(last); + inp_lost: + INP_INFO_RUNLOCK(pcbinfo); + return (IPPROTO_DONE); } /* @@ -564,7 +664,7 @@ udp_input(struct mbuf *m, int off) * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ - inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m); if (!inp) { /* @@ -572,7 +672,7 @@ udp_input(struct mbuf *m, int off) * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ - inp = in_pcblookup(&V_udbinfo, ip->ip_src, + inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | @@ -582,7 +682,7 @@ udp_input(struct mbuf *m, int off) m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else - inp = in_pcblookup_mbuf(&V_udbinfo, ip->ip_src, uh->uh_sport, + inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, ifp, m); if (inp == NULL) { @@ -605,9 +705,8 @@ udp_input(struct mbuf *m, int off) if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; *ip = save_ip; - ip->ip_len += iphlen; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); - return; + return (IPPROTO_DONE); } /* @@ -617,14 +716,27 @@ udp_input(struct mbuf *m, int off) if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { INP_RUNLOCK(inp); m_freem(m); - return; + return (IPPROTO_DONE); + } + if (cscov_partial) { + struct udpcb *up; + + up = intoudpcb(inp); + if (up->u_rxcslen == 0 || up->u_rxcslen > len) { + INP_RUNLOCK(inp); + m_freem(m); + return (IPPROTO_DONE); + } } - udp_append(inp, ip, m, iphlen, &udp_in); - INP_RUNLOCK(inp); - return; + + UDP_PROBE(receive, NULL, inp, ip, inp, uh); + if (udp_append(inp, ip, m, iphlen, &udp_in) == 0) + INP_RUNLOCK(inp); + return (IPPROTO_DONE); badunlocked: m_freem(m); + return (IPPROTO_DONE); } #endif /* INET */ @@ -643,6 +755,11 @@ udp_notify(struct inpcb *inp, int errno) * or a write lock, but a read lock is sufficient. 
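
[Illustrative aside, not part of the patch.] The receive path above accepts a UDP-Lite uh_ulen of zero (full coverage), otherwise insists on 8 <= uh_ulen <= IP payload length, and later filters partially covered datagrams against the socket's u_rxcslen. A standalone sketch of the same checks (function names are invented for the example):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define UDP_HDR_LEN 8

/* Sender-declared length/coverage check, mirroring the input hunk:
 * uh_ulen == 0 or == the IP payload length means full coverage. */
static bool
udplite_len_check(uint16_t uh_ulen, uint16_t ip_payload, uint16_t *cscov)
{
    if (uh_ulen == 0 || uh_ulen == ip_payload) {
        *cscov = 0;                 /* checksum covers the whole datagram */
        return (true);
    }
    if (uh_ulen < UDP_HDR_LEN || uh_ulen > ip_payload)
        return (false);             /* counted as udps_badlen */
    *cscov = uh_ulen;               /* partial coverage */
    return (true);
}

/* Receive-side policy: partially covered datagrams are delivered only
 * if the application opted in through UDPLITE_RECV_CSCOV (u_rxcslen). */
static bool
udplite_rx_ok(uint16_t cscov, uint16_t rxcslen, uint16_t len)
{
    if (cscov == 0)
        return (true);              /* fully covered, always fine */
    return (rxcslen != 0 && rxcslen <= len);
}

int
main(void)
{
    uint16_t cov;

    printf("%d\n", udplite_len_check(0, 100, &cov));   /* full coverage */
    printf("%d\n", udplite_len_check(20, 100, &cov));  /* covers 20 bytes */
    printf("%d\n", udplite_len_check(200, 100, &cov)); /* bad length */
    printf("%d\n", udplite_rx_ok(20, 0, 100));         /* app did not opt in */
    return (0);
}
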
*/ INP_LOCK_ASSERT(inp); + if ((errno == EHOSTUNREACH || errno == ENETUNREACH || + errno == EHOSTDOWN) && inp->inp_route.ro_rt) { + RTFREE(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = (struct rtentry *)NULL; + } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); @@ -651,8 +768,9 @@ udp_notify(struct inpcb *inp, int errno) } #ifdef INET -void -udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +static void +udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip, + struct inpcbinfo *pcbinfo) { struct ip *ip = vip; struct udphdr *uh; @@ -663,11 +781,11 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; - /* - * Redirects don't need to be handled up here. - */ - if (PRC_IS_REDIRECT(cmd)) + if (PRC_IS_REDIRECT(cmd)) { + /* signal EHOSTDOWN, as it flushes the cached route */ + in_pcbnotifyall(&V_udbinfo, faddr, EHOSTDOWN, udp_notify); return; + } /* * Hostdead is ugly because it goes linearly through all PCBs. @@ -681,7 +799,7 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) return; if (ip != NULL) { uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - inp = in_pcblookup(&V_udbinfo, faddr, uh->uh_dport, + inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { INP_RLOCK_ASSERT(inp); @@ -689,11 +807,39 @@ udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) udp_notify(inp, inetctlerrmap[cmd]); } INP_RUNLOCK(inp); + } else { + inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, + INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); + if (inp != NULL) { + struct udpcb *up; + + up = intoudpcb(inp); + if (up->u_icmp_func != NULL) { + INP_RUNLOCK(inp); + (*up->u_icmp_func)(cmd, sa, vip, up->u_tun_ctx); + } else { + INP_RUNLOCK(inp); + } + } } } else - in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], + in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd], udp_notify); } +void +udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + + return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo)); +} + +void +udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + + return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo)); +} #endif /* INET */ static int @@ -740,7 +886,7 @@ udp_pcblist(SYSCTL_HANDLER_ARGS) return (error); inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == 0) + if (inp_list == NULL) return (ENOMEM); INP_INFO_RLOCK(&V_udbinfo); @@ -849,16 +995,16 @@ SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, int udp_ctloutput(struct socket *so, struct sockopt *sopt) { - int error = 0, optval; struct inpcb *inp; -#ifdef IPSEC_NAT_T struct udpcb *up; -#endif + int isudplite, error, optval; + error = 0; + isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 
1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); - if (sopt->sopt_level != IPPROTO_UDP) { + if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); @@ -916,6 +1062,34 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) } INP_WUNLOCK(inp); break; + case UDPLITE_SEND_CSCOV: + case UDPLITE_RECV_CSCOV: + if (!isudplite) { + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + break; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + INP_WLOCK(inp); + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + if ((optval != 0 && optval < 8) || (optval > 65535)) { + INP_WUNLOCK(inp); + error = EINVAL; + break; + } + if (sopt->sopt_name == UDPLITE_SEND_CSCOV) + up->u_txcslen = optval; + else + up->u_rxcslen = optval; + INP_WUNLOCK(inp); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -933,6 +1107,22 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif + case UDPLITE_SEND_CSCOV: + case UDPLITE_RECV_CSCOV: + if (!isudplite) { + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + if (sopt->sopt_name == UDPLITE_SEND_CSCOV) + optval = up->u_txcslen; + else + optval = up->u_rxcslen; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; @@ -955,12 +1145,18 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, int len = m->m_pkthdr.len; struct in_addr faddr, laddr; struct cmsghdr *cm; + struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; + int cscov_partial = 0; int error = 0; int ipflags; u_short fport, lport; - int unlock_udbinfo; + int unlock_udbinfo, unlock_inp; u_char tos; + uint8_t pr; + uint16_t cscov = 0; + uint32_t flowid = 0; + uint8_t flowtype = M_HASHTYPE_NONE; /* * udp_output() may need to temporarily bind or connect the current @@ -976,7 +1172,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, } src.sin_family = 0; - INP_RLOCK(inp); + sin = (struct sockaddr_in *)addr; + if (sin == NULL || + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + INP_WLOCK(inp); + unlock_inp = UH_WLOCKED; + } else { + INP_RLOCK(inp); + unlock_inp = UH_RLOCKED; + } tos = inp->inp_ip_tos; if (control != NULL) { /* @@ -984,7 +1188,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * stored in a single mbuf. 
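
[Illustrative aside, not part of the patch.] The new UDPLITE_SEND_CSCOV/UDPLITE_RECV_CSCOV handling above is driven from userland roughly as follows. A sketch assuming a kernel with this change; note that coverage counts from the start of the UDP-Lite header, so nonzero values below 8 are rejected with EINVAL:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udplite.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    int s, cov;
    socklen_t optlen = sizeof(cov);

    if ((s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDPLITE)) < 0) {
        perror("socket");
        return (1);
    }
    /* Checksum the 8-byte header plus the first 12 payload bytes. */
    cov = 20;
    if (setsockopt(s, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
        &cov, sizeof(cov)) < 0)
        perror("UDPLITE_SEND_CSCOV");
    if (getsockopt(s, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
        &cov, &optlen) == 0)
        printf("receive coverage: %d\n", cov);
    close(s);
    return (0);
}

Note the option level is the socket's own protocol (IPPROTO_UDPLITE), matching the relaxed sopt_level check in the hunk above.
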
*/ if (control->m_next) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(control); m_freem(m); return (EINVAL); @@ -1024,6 +1231,31 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, tos = *(u_char *)CMSG_DATA(cm); break; + case IP_FLOWID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid = *(uint32_t *) CMSG_DATA(cm); + break; + + case IP_FLOWTYPE: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowtype = *(uint32_t *) CMSG_DATA(cm); + break; + +#ifdef RSS + case IP_RSSBUCKETID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + /* This is just a placeholder for now */ + break; +#endif /* RSS */ default: error = ENOPROTOOPT; break; @@ -1034,7 +1266,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, m_freem(control); } if (error) { - INP_RUNLOCK(inp); + if (unlock_inp == UH_WLOCKED) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); m_freem(m); return (error); } @@ -1055,12 +1290,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * * XXXRW: Check that hash locking update here is correct. */ + pr = inp->inp_socket->so_proto->pr_protocol; + pcbinfo = udp_get_inpcbinfo(pr); sin = (struct sockaddr_in *)addr; if (sin != NULL && (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { - INP_RUNLOCK(inp); - INP_WLOCK(inp); - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); unlock_udbinfo = UH_WLOCKED; } else if ((sin != NULL && ( (sin->sin_addr.s_addr == INADDR_ANY) || @@ -1068,7 +1303,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - INP_HASH_RLOCK(&V_udbinfo); + INP_HASH_RLOCK(pcbinfo); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; @@ -1081,7 +1316,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_HASH_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { @@ -1132,7 +1367,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_HASH_LOCK_ASSERT(&V_udbinfo); + INP_HASH_LOCK_ASSERT(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); @@ -1147,7 +1382,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK_ASSERT(&V_udbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. @@ -1181,7 +1416,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ - M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); + M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; @@ -1196,12 +1431,30 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, */ ui = mtod(m, struct udpiphdr *); bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? 
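
[Illustrative aside, not part of the patch.] udp_output() above walks a control buffer that must fit in a single mbuf, one cmsghdr at a time. A userland sketch of handing such ancillary data to sendmsg(2), using the long-standing IP_TOS option rather than the new IP_FLOWID/IP_FLOWTYPE (whose availability depends on this change):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
    struct sockaddr_in dst;
    struct msghdr msg;
    struct cmsghdr *cm;
    struct iovec iov;
    unsigned char tos = 0x10;       /* IPTOS_LOWDELAY */
    char payload[] = "hello";
    char cbuf[CMSG_SPACE(sizeof(tos))];
    int s;

    if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
        perror("socket");
        return (1);
    }
    memset(&dst, 0, sizeof(dst));
    dst.sin_len = sizeof(dst);      /* FreeBSD sockaddrs carry a length */
    dst.sin_family = AF_INET;
    dst.sin_port = htons(9);        /* discard service */
    dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

    iov.iov_base = payload;
    iov.iov_len = sizeof(payload) - 1;
    memset(&msg, 0, sizeof(msg));
    msg.msg_name = &dst;
    msg.msg_namelen = sizeof(dst);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cbuf;         /* one contiguous control buffer */
    msg.msg_controllen = sizeof(cbuf);

    cm = CMSG_FIRSTHDR(&msg);
    cm->cmsg_level = IPPROTO_IP;
    cm->cmsg_type = IP_TOS;
    cm->cmsg_len = CMSG_LEN(sizeof(tos));
    memcpy(CMSG_DATA(cm), &tos, sizeof(tos));

    if (sendmsg(s, &msg, 0) < 0)
        perror("sendmsg");
    close(s);
    return (0);
}
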
*/ - ui->ui_pr = IPPROTO_UDP; + ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + if (pr == IPPROTO_UDPLITE) { + struct udpcb *up; + uint16_t plen; + + up = intoudpcb(inp); + cscov = up->u_txcslen; + plen = (u_short)len + sizeof(struct udphdr); + if (cscov >= plen) + cscov = 0; + ui->ui_len = htons(plen); + ui->ui_ulen = htons(cscov); + /* + * For UDP-Lite, checksum coverage length of zero means + * the entire UDPLite packet is covered by the checksum. + */ + cscov_partial = (cscov == 0) ? 0 : 1; + } else + ui->ui_v = IPVERSION << 4; /* * Set the Don't Fragment bit in the IP header. @@ -1210,7 +1463,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct ip *ip; ip = (struct ip *)&ui->ui_i; - ip->ip_off |= IP_DF; + ip->ip_off |= htons(IP_DF); } ipflags = 0; @@ -1228,27 +1481,90 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, /* * Set up checksum and output datagram. */ - if (V_udp_cksum) { + ui->ui_sum = 0; + if (pr == IPPROTO_UDPLITE) { + if (inp->inp_flags & INP_ONESBCAST) + faddr.s_addr = INADDR_BROADCAST; + if (cscov_partial) { + if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) + ui->ui_sum = 0xffff; + } else { + if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) + ui->ui_sum = 0xffff; + } + } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, - htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); + htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - } else - ui->ui_sum = 0; - ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + } + ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); + /* + * Set up flowid / RSS information for the outbound socket. + * + * Once the UDP code decides to set a flowid some other way, + * this allows the flowid to be overridden by userland. + */ + if (flowtype != M_HASHTYPE_NONE) { + m->m_pkthdr.flowid = flowid; + M_HASHTYPE_SET(m, flowtype); +#ifdef RSS + } else { + uint32_t hash_val, hash_type; + /* + * Calculate an appropriate RSS hash for UDP and + * UDP Lite. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later on, connected socket values should be + * cached in the inpcb and reused, rather than constantly + * recalculated. + * + * UDP Lite is a different protocol number and will + * likely end up being hashed as a 2-tuple until + * RSS / NICs grow UDP Lite protocol awareness. + */ + if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, + pr, &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); + } +#endif + } + +#ifdef RSS + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software-generated + * hash value based on the packet contents.
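
[Illustrative aside, not part of the patch.] As the hunk above shows, UDP-Lite reuses the UDP length field on the wire as the checksum-coverage field and collapses a coverage that reaches the full datagram length down to zero. A small illustration of filling such a header (struct udplitehdr and udplite_fill() are invented for the example):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Wire format shared by UDP and UDP-Lite; in UDP-Lite the length slot
 * carries the checksum coverage instead (RFC 3828). */
struct udplitehdr {
    uint16_t src_port;
    uint16_t dst_port;
    uint16_t cscov;     /* 0 = checksum covers the whole datagram */
    uint16_t checksum;  /* mandatory, unlike UDP */
};

static void
udplite_fill(struct udplitehdr *h, uint16_t sport, uint16_t dport,
    uint16_t cscov, uint16_t datagram_len)
{
    /* Mirror the hunk above: coverage >= full length collapses to 0. */
    if (cscov >= datagram_len)
        cscov = 0;
    h->src_port = htons(sport);
    h->dst_port = htons(dport);
    h->cscov = htons(cscov);
    h->checksum = 0;    /* computed over pseudo-header + covered bytes */
}

int
main(void)
{
    struct udplitehdr h;

    udplite_fill(&h, 5000, 5001, 20, 100); /* cover the first 20 bytes */
    printf("wire coverage field: %u\n", ntohs(h.cscov));
    return (0);
}
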
+ */ + ipflags |= IP_NODEFAULTFLOWID; +#endif /* RSS */ + if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK(&V_udbinfo); - error = ip_output(m, inp->inp_options, NULL, ipflags, + INP_HASH_RUNLOCK(pcbinfo); + UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); + error = ip_output(m, inp->inp_options, + (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, inp->inp_moptions, inp); - if (unlock_udbinfo == UH_WLOCKED) + if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); @@ -1256,10 +1572,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, release: if (unlock_udbinfo == UH_WLOCKED) { - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); } else if (unlock_udbinfo == UH_RLOCKED) { - INP_HASH_RUNLOCK(&V_udbinfo); + INP_HASH_RUNLOCK(pcbinfo); INP_RUNLOCK(inp); } else INP_RUNLOCK(inp); @@ -1297,7 +1613,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) if (minlen > m->m_pkthdr.len) minlen = m->m_pkthdr.len; if ((m = m_pullup(m, minlen)) == NULL) { - IPSECSTAT_INC(in_inval); + IPSECSTAT_INC(ips_in_inval); return (NULL); /* Bypass caller processing. */ } data = mtod(m, caddr_t); /* Points to ip header. */ @@ -1337,7 +1653,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) uint32_t spi; if (payload <= sizeof(struct esp)) { - IPSECSTAT_INC(in_inval); + IPSECSTAT_INC(ips_in_inval); m_freem(m); return (NULL); /* Discard. */ } @@ -1358,7 +1674,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, 2 * sizeof(uint16_t), M_NOWAIT); if (tag == NULL) { - IPSECSTAT_INC(in_nomem); + IPSECSTAT_INC(ips_in_nomem); m_freem(m); return (NULL); /* Discard. */ } @@ -1387,7 +1703,7 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) m_adj(m, skip); ip = mtod(m, struct ip *); - ip->ip_len -= skip; + ip->ip_len = htons(ntohs(ip->ip_len) - skip); ip->ip_p = IPPROTO_ESP; /* @@ -1397,7 +1713,8 @@ udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); - (void) ipsec4_common_input(m, iphlen, ip->ip_p); + (void) ipsec_common_input(m, iphlen, offsetof(struct ip, ip_p), + AF_INET, ip->ip_p); return (NULL); /* NB: consumed, bypass processing. 
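
[Illustrative aside, not part of the patch.] udp4_espdecap() above demultiplexes UDP-encapsulated IPsec traffic in the manner of RFC 3948: a one-byte 0xff payload is a NAT keepalive, a leading four-zero-byte non-ESP marker means IKE, and anything else starts with a nonzero SPI and is ESP. A sketch of that classification (my reading of the RFC, not the kernel code verbatim):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum natt_kind { NATT_KEEPALIVE, NATT_IKE, NATT_ESP, NATT_RUNT };

/* Classify a UDP/4500 payload before handing ESP to IPsec input. */
static enum natt_kind
natt_classify(const uint8_t *p, size_t len)
{
    if (len == 1 && p[0] == 0xff)
        return (NATT_KEEPALIVE);    /* NAT keepalive, discard */
    if (len >= 4 && p[0] == 0 && p[1] == 0 && p[2] == 0 && p[3] == 0)
        return (NATT_IKE);          /* non-ESP marker: IKE traffic */
    if (len > 0)
        return (NATT_ESP);          /* nonzero SPI: ESP-in-UDP */
    return (NATT_RUNT);
}

int
main(void)
{
    const uint8_t ka[] = { 0xff };
    const uint8_t ike[] = { 0, 0, 0, 0, 0x2a };
    const uint8_t esp[] = { 0x00, 0x00, 0x01, 0x02, 0x2a };

    printf("%d %d %d\n", natt_classify(ka, sizeof(ka)),
        natt_classify(ike, sizeof(ike)), natt_classify(esp, sizeof(esp)));
    return (0);
}
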
*/ } #endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ @@ -1406,15 +1723,17 @@ static void udp_abort(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1424,17 +1743,19 @@ static int udp_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); - INP_INFO_WLOCK(&V_udbinfo); - error = in_pcballoc(so, &V_udbinfo); + INP_INFO_WLOCK(pcbinfo); + error = in_pcballoc(so, pcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (error); } @@ -1446,18 +1767,18 @@ udp_attach(struct socket *so, int proto, struct thread *td) if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); return (0); } #endif /* INET */ int -udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) +udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; @@ -1468,11 +1789,14 @@ udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); - if (up->u_tun_func != NULL) { + if ((up->u_tun_func != NULL) || + (up->u_icmp_func != NULL)) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; + up->u_icmp_func = i; + up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } @@ -1482,14 +1806,16 @@ static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); INP_WLOCK(inp); - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } @@ -1498,15 +1824,17 @@ static void udp_close(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); @@ -1516,9 +1844,11 @@ static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; - int error; + struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; + int error; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); INP_WLOCK(inp); @@ -1532,9 +1862,9 @@ udp_connect(struct socket *so, struct 
sockaddr *nam, struct thread *td) INP_WUNLOCK(inp); return (error); } - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, nam, td->td_ucred); - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); @@ -1545,20 +1875,22 @@ static void udp_detach(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; struct udpcb *up; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); - INP_INFO_WLOCK(&V_udbinfo); + INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_udbinfo); + INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } @@ -1566,7 +1898,9 @@ static int udp_disconnect(struct socket *so) { struct inpcb *inp; + struct inpcbinfo *pcbinfo; + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); @@ -1574,10 +1908,10 @@ udp_disconnect(struct socket *so) INP_WUNLOCK(inp); return (ENOTCONN); } - INP_HASH_WLOCK(&V_udbinfo); + INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; - INP_HASH_WUNLOCK(&V_udbinfo); + INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h index 6b9b5362..172d969d 100644 --- a/freebsd/sys/netinet/udp_var.h +++ b/freebsd/sys/netinet/udp_var.h @@ -42,6 +42,7 @@ struct udpiphdr { struct udphdr ui_u; /* udp header */ }; #define ui_x1 ui_i.ih_x1 +#define ui_v ui_i.ih_x1[0] #define ui_pr ui_i.ih_pr #define ui_len ui_i.ih_len #define ui_src ui_i.ih_src @@ -51,14 +52,23 @@ struct udpiphdr { #define ui_ulen ui_u.uh_ulen #define ui_sum ui_u.uh_sum -typedef void(*udp_tun_func_t)(struct mbuf *, int off, struct inpcb *); +struct inpcb; +struct mbuf; +typedef void(*udp_tun_func_t)(struct mbuf *, int, struct inpcb *, + const struct sockaddr *, void *); +typedef void(*udp_tun_icmp_t)(int, struct sockaddr *, void *, void *); + /* * UDP control block; one per udp. */ struct udpcb { udp_tun_func_t u_tun_func; /* UDP kernel tunneling callback. */ + udp_tun_icmp_t u_icmp_func; /* UDP kernel tunneling icmp callback */ u_int u_flags; /* Generic UDP flags. */ + uint16_t u_rxcslen; /* Coverage for incoming datagrams. */ + uint16_t u_txcslen; /* Coverage for outgoing datagrams. */ + void *u_tun_ctx; /* Tunneling callback context. 
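
[Illustrative aside, not part of the patch.] udp_set_kernel_tunneling() above claims a socket for exactly one in-kernel consumer and fails with EBUSY on any later claim. A compilable model of that once-only registration contract (struct tuncb, set_tunneling() and the typedefs are stand-ins, not kernel API):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

typedef void (*tun_func_t)(void *pkt, int off, void *ctx);
typedef void (*tun_icmp_t)(int cmd, void *ctx);

struct tuncb {
    tun_func_t u_tun_func;
    tun_icmp_t u_icmp_func;
    void *u_tun_ctx;
};

/* Same contract as udp_set_kernel_tunneling(): one consumer only. */
static int
set_tunneling(struct tuncb *up, tun_func_t f, tun_icmp_t i, void *ctx)
{
    if (up->u_tun_func != NULL || up->u_icmp_func != NULL)
        return (EBUSY);
    up->u_tun_func = f;
    up->u_icmp_func = i;
    up->u_tun_ctx = ctx;
    return (0);
}

static void
dummy_tun(void *pkt, int off, void *ctx)
{
    (void)pkt; (void)off; (void)ctx;
}

int
main(void)
{
    struct tuncb up = { NULL, NULL, NULL };

    printf("first:  %d\n", set_tunneling(&up, dummy_tun, NULL, NULL));
    printf("second: %d\n", set_tunneling(&up, dummy_tun, NULL, NULL));
    return (0);
}
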
*/ }; #define intoudpcb(ip) ((struct udpcb *)(ip)->inp_ppcb) @@ -72,96 +82,107 @@ struct udpcb { struct udpstat { /* input statistics: */ - u_long udps_ipackets; /* total input packets */ - u_long udps_hdrops; /* packet shorter than header */ - u_long udps_badsum; /* checksum error */ - u_long udps_nosum; /* no checksum */ - u_long udps_badlen; /* data length larger than packet */ - u_long udps_noport; /* no socket on port */ - u_long udps_noportbcast; /* of above, arrived as broadcast */ - u_long udps_fullsock; /* not delivered, input socket full */ - u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ - u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + uint64_t udps_ipackets; /* total input packets */ + uint64_t udps_hdrops; /* packet shorter than header */ + uint64_t udps_badsum; /* checksum error */ + uint64_t udps_nosum; /* no checksum */ + uint64_t udps_badlen; /* data length larger than packet */ + uint64_t udps_noport; /* no socket on port */ + uint64_t udps_noportbcast; /* of above, arrived as broadcast */ + uint64_t udps_fullsock; /* not delivered, input socket full */ + uint64_t udpps_pcbcachemiss; /* input packets missing pcb cache */ + uint64_t udpps_pcbhashmiss; /* input packets not for hashed pcb */ /* output statistics: */ - u_long udps_opackets; /* total output packets */ - u_long udps_fastout; /* output packets on fast path */ + uint64_t udps_opackets; /* total output packets */ + uint64_t udps_fastout; /* output packets on fast path */ /* of no socket on port, arrived as multicast */ - u_long udps_noportmcast; - u_long udps_filtermcast; /* blocked by multicast filter */ + uint64_t udps_noportmcast; + uint64_t udps_filtermcast; /* blocked by multicast filter */ }; #ifdef _KERNEL +#include #ifdef __rtems__ #include #undef errno #endif /* __rtems__ */ + +VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat); /* * In-kernel consumers can use these accessor macros directly to update * stats. */ -#define UDPSTAT_ADD(name, val) V_udpstat.name += (val) +#define UDPSTAT_ADD(name, val) \ + VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val)) #define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_udpstat_inc(int statnum); -#define KMOD_UDPSTAT_INC(name) \ - kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long)) +#define KMOD_UDPSTAT_INC(name) \ + kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t)) #endif /* - * Names for UDP sysctl objects. + * Identifiers for UDP sysctl nodes. 
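
[Illustrative aside, not part of the patch.] KMOD_UDPSTAT_INC() above maps a field name to an array index with offsetof(); converting every counter to uint64_t is what keeps that division exact. A tiny demonstration (struct udpstat_model is a cut-down stand-in for struct udpstat):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct udpstat_model {          /* stand-in for struct udpstat */
    uint64_t udps_ipackets;
    uint64_t udps_hdrops;
    uint64_t udps_badsum;
};

/* Turn a field name into an index; valid only while every member has
 * the same width, which the uint64_t conversion above guarantees. */
#define STAT_INDEX(field) \
    (offsetof(struct udpstat_model, field) / sizeof(uint64_t))

int
main(void)
{
    printf("udps_ipackets -> %zu\n", STAT_INDEX(udps_ipackets));
    printf("udps_badsum   -> %zu\n", STAT_INDEX(udps_badsum));
    return (0);
}
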
*/ #define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ #define UDPCTL_STATS 2 /* statistics (read-only) */ #define UDPCTL_MAXDGRAM 3 /* max datagram size */ #define UDPCTL_RECVSPACE 4 /* default receive buffer space */ #define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ -#define UDPCTL_MAXID 6 - -#define UDPCTL_NAMES { \ - { 0, 0 }, \ - { "checksum", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ - { "maxdgram", CTLTYPE_INT }, \ - { "recvspace", CTLTYPE_INT }, \ - { "pcblist", CTLTYPE_STRUCT }, \ -} #ifdef _KERNEL +#include SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; VNET_DECLARE(struct inpcbhead, udb); VNET_DECLARE(struct inpcbinfo, udbinfo); +VNET_DECLARE(struct inpcbhead, ulitecb); +VNET_DECLARE(struct inpcbinfo, ulitecbinfo); #define V_udb VNET(udb) #define V_udbinfo VNET(udbinfo) +#define V_ulitecb VNET(ulitecb) +#define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; extern u_long udp_recvspace; VNET_DECLARE(int, udp_cksum); -VNET_DECLARE(struct udpstat, udpstat); VNET_DECLARE(int, udp_blackhole); #define V_udp_cksum VNET(udp_cksum) -#define V_udpstat VNET(udpstat) #define V_udp_blackhole VNET(udp_blackhole) extern int udp_log_in_vain; -int udp_newudpcb(struct inpcb *); -void udp_discardcb(struct udpcb *); +static __inline struct inpcbinfo * +udp_get_inpcbinfo(int protocol) +{ + return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; +} -void udp_ctlinput(int, struct sockaddr *, void *); -int udp_ctloutput(struct socket *, struct sockopt *); -void udp_init(void); -#ifdef VIMAGE -void udp_destroy(void); -#endif -void udp_input(struct mbuf *, int); +static __inline struct inpcbhead * +udp_get_pcblist(int protocol) +{ + return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb; +} + +int udp_newudpcb(struct inpcb *); +void udp_discardcb(struct udpcb *); + +void udp_ctlinput(int, struct sockaddr *, void *); +void udplite_ctlinput(int, struct sockaddr *, void *); +int udp_ctloutput(struct socket *, struct sockopt *); +void udp_init(void); +void udplite_init(void); +int udp_input(struct mbuf **, int *, int); +void udplite_input(struct mbuf *, int); struct inpcb *udp_notify(struct inpcb *inp, int errno); -int udp_shutdown(struct socket *so); +int udp_shutdown(struct socket *so); -int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f); -#endif +int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, + udp_tun_icmp_t i, void *ctx); -#endif +#endif /* _KERNEL */ + +#endif /* _NETINET_UDP_VAR_H_ */ diff --git a/freebsd/sys/netinet/udplite.h b/freebsd/sys/netinet/udplite.h new file mode 100644 index 00000000..0e23cd70 --- /dev/null +++ b/freebsd/sys/netinet/udplite.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2014, Kevin Lo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_UDPLITE_H_ +#define _NETINET_UDPLITE_H_ + +/* + * User-settable options (used with setsockopt). + */ +#define UDPLITE_SEND_CSCOV 2 /* Sender checksum coverage. */ +#define UDPLITE_RECV_CSCOV 4 /* Receiver checksum coverage. */ + +#endif /* !_NETINET_UDPLITE_H_ */