From bceabc95c1c85d793200446fa85f1ddc6313ea29 Mon Sep 17 00:00:00 2001
From: Sebastian Huber <sebastian.huber@embedded-brains.de>
Date: Wed, 9 Oct 2013 22:42:09 +0200
Subject: Move files to match FreeBSD layout

---
 freebsd/sys/netinet/accf_data.c | 68 +
 freebsd/sys/netinet/accf_dns.c | 134 +
 freebsd/sys/netinet/accf_http.c | 351 +
 freebsd/sys/netinet/icmp6.h | 2 +
 freebsd/sys/netinet/icmp_var.h | 108 +
 freebsd/sys/netinet/if_atm.c | 366 +
 freebsd/sys/netinet/if_atm.h | 47 +
 freebsd/sys/netinet/if_ether.c | 859 ++
 freebsd/sys/netinet/if_ether.h | 2 +
 freebsd/sys/netinet/igmp.c | 3655 +++++++
 freebsd/sys/netinet/igmp.h | 2 +
 freebsd/sys/netinet/igmp_var.h | 225 +
 freebsd/sys/netinet/in.c | 1601 +++
 freebsd/sys/netinet/in.h | 2 +
 freebsd/sys/netinet/in_gif.c | 469 +
 freebsd/sys/netinet/in_gif.h | 45 +
 freebsd/sys/netinet/in_mcast.c | 2902 ++++++
 freebsd/sys/netinet/in_pcb.c | 1958 ++++
 freebsd/sys/netinet/in_pcb.h | 525 +
 freebsd/sys/netinet/in_proto.c | 400 +
 freebsd/sys/netinet/in_rmx.c | 516 +
 freebsd/sys/netinet/in_systm.h | 2 +
 freebsd/sys/netinet/in_var.h | 475 +
 freebsd/sys/netinet/ip.h | 2 +
 freebsd/sys/netinet/ip6.h | 2 +
 freebsd/sys/netinet/ip_carp.c | 2427 +++++
 freebsd/sys/netinet/ip_carp.h | 191 +
 freebsd/sys/netinet/ip_divert.c | 818 ++
 freebsd/sys/netinet/ip_divert.h | 55 +
 freebsd/sys/netinet/ip_dummynet.h | 263 +
 freebsd/sys/netinet/ip_ecn.c | 194 +
 freebsd/sys/netinet/ip_ecn.h | 53 +
 freebsd/sys/netinet/ip_encap.c | 465 +
 freebsd/sys/netinet/ip_encap.h | 64 +
 freebsd/sys/netinet/ip_fastfwd.c | 619 ++
 freebsd/sys/netinet/ip_fw.h | 579 ++
 freebsd/sys/netinet/ip_gre.c | 336 +
 freebsd/sys/netinet/ip_gre.h | 43 +
 freebsd/sys/netinet/ip_icmp.c | 986 ++
 freebsd/sys/netinet/ip_icmp.h | 2 +
 freebsd/sys/netinet/ip_id.c | 211 +
 freebsd/sys/netinet/ip_input.c | 1794 ++++
 freebsd/sys/netinet/ip_ipsec.c | 424 +
 freebsd/sys/netinet/ip_ipsec.h | 41 +
 freebsd/sys/netinet/ip_mroute.c | 2952 ++++++
 freebsd/sys/netinet/ip_mroute.h | 359 +
 freebsd/sys/netinet/ip_options.c | 747 ++
 freebsd/sys/netinet/ip_options.h | 60 +
 freebsd/sys/netinet/ip_output.c | 1284 +++
 freebsd/sys/netinet/ip_var.h | 315 +
 freebsd/sys/netinet/ipfw/dn_heap.c | 552 +
 freebsd/sys/netinet/ipfw/dn_heap.h | 191 +
 freebsd/sys/netinet/ipfw/dn_sched.h | 189 +
 freebsd/sys/netinet/ipfw/dn_sched_fifo.c | 122 +
 freebsd/sys/netinet/ipfw/dn_sched_prio.c | 231 +
 freebsd/sys/netinet/ipfw/dn_sched_qfq.c | 866 ++
 freebsd/sys/netinet/ipfw/dn_sched_rr.c | 309 +
 freebsd/sys/netinet/ipfw/dn_sched_wf2q.c | 375 +
 freebsd/sys/netinet/ipfw/ip_dn_glue.c | 847 ++
 freebsd/sys/netinet/ipfw/ip_dn_io.c | 796 ++
 freebsd/sys/netinet/ipfw/ip_dn_private.h | 402 +
 freebsd/sys/netinet/ipfw/ip_dummynet.c | 2297 +++++
 freebsd/sys/netinet/ipfw/ip_fw2.c | 2495 +++++
 freebsd/sys/netinet/ipfw/ip_fw_log.c | 451 +
 freebsd/sys/netinet/ipfw/ip_fw_nat.c | 606 ++
 freebsd/sys/netinet/ipfw/ip_fw_pfil.c | 417 +
 freebsd/sys/netinet/ipfw/ip_fw_private.h | 301 +
 freebsd/sys/netinet/ipfw/ip_fw_sockopt.c | 1345 +++
 freebsd/sys/netinet/ipfw/ip_fw_table.c | 288 +
 freebsd/sys/netinet/libalias/alias.c | 1793 ++++
 freebsd/sys/netinet/libalias/alias.h | 232 +
 freebsd/sys/netinet/libalias/alias_cuseeme.c | 230 +
 freebsd/sys/netinet/libalias/alias_db.c | 2940 ++++++
 freebsd/sys/netinet/libalias/alias_dummy.c | 155 +
 freebsd/sys/netinet/libalias/alias_ftp.c | 696 ++
 freebsd/sys/netinet/libalias/alias_irc.c | 490 +
 freebsd/sys/netinet/libalias/alias_local.h | 397 +
 freebsd/sys/netinet/libalias/alias_mod.c | 292 +
 freebsd/sys/netinet/libalias/alias_mod.h | 163 +
 freebsd/sys/netinet/libalias/alias_nbt.c | 855 ++
freebsd/sys/netinet/libalias/alias_pptp.c | 525 + freebsd/sys/netinet/libalias/alias_proxy.c | 870 ++ freebsd/sys/netinet/libalias/alias_sctp.c | 2700 +++++ freebsd/sys/netinet/libalias/alias_sctp.h | 201 + freebsd/sys/netinet/libalias/alias_skinny.c | 449 + freebsd/sys/netinet/libalias/alias_smedia.c | 551 + freebsd/sys/netinet/libalias/alias_util.c | 178 + freebsd/sys/netinet/pim.h | 119 + freebsd/sys/netinet/pim_var.h | 84 + freebsd/sys/netinet/raw_ip.c | 1116 ++ freebsd/sys/netinet/sctp.h | 549 + freebsd/sys/netinet/sctp_asconf.c | 3397 +++++++ freebsd/sys/netinet/sctp_asconf.h | 96 + freebsd/sys/netinet/sctp_auth.c | 2128 ++++ freebsd/sys/netinet/sctp_auth.h | 235 + freebsd/sys/netinet/sctp_bsd_addr.c | 562 + freebsd/sys/netinet/sctp_bsd_addr.h | 63 + freebsd/sys/netinet/sctp_cc_functions.c | 1565 +++ freebsd/sys/netinet/sctp_cc_functions.h | 116 + freebsd/sys/netinet/sctp_constants.h | 1051 ++ freebsd/sys/netinet/sctp_crc32.c | 148 + freebsd/sys/netinet/sctp_crc32.h | 47 + freebsd/sys/netinet/sctp_header.h | 624 ++ freebsd/sys/netinet/sctp_indata.c | 5800 +++++++++++ freebsd/sys/netinet/sctp_indata.h | 129 + freebsd/sys/netinet/sctp_input.c | 5965 +++++++++++ freebsd/sys/netinet/sctp_input.h | 57 + freebsd/sys/netinet/sctp_lock_bsd.h | 430 + freebsd/sys/netinet/sctp_os.h | 72 + freebsd/sys/netinet/sctp_os_bsd.h | 503 + freebsd/sys/netinet/sctp_output.c | 13539 +++++++++++++++++++++++++ freebsd/sys/netinet/sctp_output.h | 229 + freebsd/sys/netinet/sctp_pcb.c | 6810 +++++++++++++ freebsd/sys/netinet/sctp_pcb.h | 632 ++ freebsd/sys/netinet/sctp_peeloff.c | 240 + freebsd/sys/netinet/sctp_peeloff.h | 52 + freebsd/sys/netinet/sctp_structs.h | 1094 ++ freebsd/sys/netinet/sctp_sysctl.c | 1108 ++ freebsd/sys/netinet/sctp_sysctl.h | 532 + freebsd/sys/netinet/sctp_timer.c | 1804 ++++ freebsd/sys/netinet/sctp_timer.h | 101 + freebsd/sys/netinet/sctp_uio.h | 1166 +++ freebsd/sys/netinet/sctp_usrreq.c | 4918 +++++++++ freebsd/sys/netinet/sctp_var.h | 336 + freebsd/sys/netinet/sctputil.c | 6977 +++++++++++++ freebsd/sys/netinet/sctputil.h | 392 + freebsd/sys/netinet/tcp.h | 2 + freebsd/sys/netinet/tcp_debug.c | 226 + freebsd/sys/netinet/tcp_debug.h | 80 + freebsd/sys/netinet/tcp_fsm.h | 112 + freebsd/sys/netinet/tcp_hostcache.c | 693 ++ freebsd/sys/netinet/tcp_hostcache.h | 82 + freebsd/sys/netinet/tcp_input.c | 3453 +++++++ freebsd/sys/netinet/tcp_lro.c | 389 + freebsd/sys/netinet/tcp_lro.h | 85 + freebsd/sys/netinet/tcp_offload.c | 147 + freebsd/sys/netinet/tcp_offload.h | 354 + freebsd/sys/netinet/tcp_output.c | 1485 +++ freebsd/sys/netinet/tcp_reass.c | 335 + freebsd/sys/netinet/tcp_sack.c | 687 ++ freebsd/sys/netinet/tcp_seq.h | 68 + freebsd/sys/netinet/tcp_subr.c | 2315 +++++ freebsd/sys/netinet/tcp_syncache.c | 1823 ++++ freebsd/sys/netinet/tcp_syncache.h | 127 + freebsd/sys/netinet/tcp_timer.c | 660 ++ freebsd/sys/netinet/tcp_timer.h | 183 + freebsd/sys/netinet/tcp_timewait.c | 618 ++ freebsd/sys/netinet/tcp_usrreq.c | 1886 ++++ freebsd/sys/netinet/tcp_var.h | 687 ++ freebsd/sys/netinet/tcpip.h | 59 + freebsd/sys/netinet/toedev.h | 162 + freebsd/sys/netinet/udp.h | 2 + freebsd/sys/netinet/udp_usrreq.c | 1633 +++ freebsd/sys/netinet/udp_var.h | 161 + 154 files changed, 140097 insertions(+) create mode 100644 freebsd/sys/netinet/accf_data.c create mode 100644 freebsd/sys/netinet/accf_dns.c create mode 100644 freebsd/sys/netinet/accf_http.c create mode 100644 freebsd/sys/netinet/icmp6.h create mode 100644 freebsd/sys/netinet/icmp_var.h create mode 100644 freebsd/sys/netinet/if_atm.c create 
mode 100644 freebsd/sys/netinet/if_atm.h create mode 100644 freebsd/sys/netinet/if_ether.c create mode 100644 freebsd/sys/netinet/if_ether.h create mode 100644 freebsd/sys/netinet/igmp.c create mode 100644 freebsd/sys/netinet/igmp.h create mode 100644 freebsd/sys/netinet/igmp_var.h create mode 100644 freebsd/sys/netinet/in.c create mode 100644 freebsd/sys/netinet/in.h create mode 100644 freebsd/sys/netinet/in_gif.c create mode 100644 freebsd/sys/netinet/in_gif.h create mode 100644 freebsd/sys/netinet/in_mcast.c create mode 100644 freebsd/sys/netinet/in_pcb.c create mode 100644 freebsd/sys/netinet/in_pcb.h create mode 100644 freebsd/sys/netinet/in_proto.c create mode 100644 freebsd/sys/netinet/in_rmx.c create mode 100644 freebsd/sys/netinet/in_systm.h create mode 100644 freebsd/sys/netinet/in_var.h create mode 100644 freebsd/sys/netinet/ip.h create mode 100644 freebsd/sys/netinet/ip6.h create mode 100644 freebsd/sys/netinet/ip_carp.c create mode 100644 freebsd/sys/netinet/ip_carp.h create mode 100644 freebsd/sys/netinet/ip_divert.c create mode 100644 freebsd/sys/netinet/ip_divert.h create mode 100644 freebsd/sys/netinet/ip_dummynet.h create mode 100644 freebsd/sys/netinet/ip_ecn.c create mode 100644 freebsd/sys/netinet/ip_ecn.h create mode 100644 freebsd/sys/netinet/ip_encap.c create mode 100644 freebsd/sys/netinet/ip_encap.h create mode 100644 freebsd/sys/netinet/ip_fastfwd.c create mode 100644 freebsd/sys/netinet/ip_fw.h create mode 100644 freebsd/sys/netinet/ip_gre.c create mode 100644 freebsd/sys/netinet/ip_gre.h create mode 100644 freebsd/sys/netinet/ip_icmp.c create mode 100644 freebsd/sys/netinet/ip_icmp.h create mode 100644 freebsd/sys/netinet/ip_id.c create mode 100644 freebsd/sys/netinet/ip_input.c create mode 100644 freebsd/sys/netinet/ip_ipsec.c create mode 100644 freebsd/sys/netinet/ip_ipsec.h create mode 100644 freebsd/sys/netinet/ip_mroute.c create mode 100644 freebsd/sys/netinet/ip_mroute.h create mode 100644 freebsd/sys/netinet/ip_options.c create mode 100644 freebsd/sys/netinet/ip_options.h create mode 100644 freebsd/sys/netinet/ip_output.c create mode 100644 freebsd/sys/netinet/ip_var.h create mode 100644 freebsd/sys/netinet/ipfw/dn_heap.c create mode 100644 freebsd/sys/netinet/ipfw/dn_heap.h create mode 100644 freebsd/sys/netinet/ipfw/dn_sched.h create mode 100644 freebsd/sys/netinet/ipfw/dn_sched_fifo.c create mode 100644 freebsd/sys/netinet/ipfw/dn_sched_prio.c create mode 100644 freebsd/sys/netinet/ipfw/dn_sched_qfq.c create mode 100644 freebsd/sys/netinet/ipfw/dn_sched_rr.c create mode 100644 freebsd/sys/netinet/ipfw/dn_sched_wf2q.c create mode 100644 freebsd/sys/netinet/ipfw/ip_dn_glue.c create mode 100644 freebsd/sys/netinet/ipfw/ip_dn_io.c create mode 100644 freebsd/sys/netinet/ipfw/ip_dn_private.h create mode 100644 freebsd/sys/netinet/ipfw/ip_dummynet.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw2.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_log.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_nat.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_pfil.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_private.h create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_sockopt.c create mode 100644 freebsd/sys/netinet/ipfw/ip_fw_table.c create mode 100644 freebsd/sys/netinet/libalias/alias.c create mode 100644 freebsd/sys/netinet/libalias/alias.h create mode 100644 freebsd/sys/netinet/libalias/alias_cuseeme.c create mode 100644 freebsd/sys/netinet/libalias/alias_db.c create mode 100644 freebsd/sys/netinet/libalias/alias_dummy.c create mode 100644 
freebsd/sys/netinet/libalias/alias_ftp.c create mode 100644 freebsd/sys/netinet/libalias/alias_irc.c create mode 100644 freebsd/sys/netinet/libalias/alias_local.h create mode 100644 freebsd/sys/netinet/libalias/alias_mod.c create mode 100644 freebsd/sys/netinet/libalias/alias_mod.h create mode 100644 freebsd/sys/netinet/libalias/alias_nbt.c create mode 100644 freebsd/sys/netinet/libalias/alias_pptp.c create mode 100644 freebsd/sys/netinet/libalias/alias_proxy.c create mode 100644 freebsd/sys/netinet/libalias/alias_sctp.c create mode 100644 freebsd/sys/netinet/libalias/alias_sctp.h create mode 100644 freebsd/sys/netinet/libalias/alias_skinny.c create mode 100644 freebsd/sys/netinet/libalias/alias_smedia.c create mode 100644 freebsd/sys/netinet/libalias/alias_util.c create mode 100644 freebsd/sys/netinet/pim.h create mode 100644 freebsd/sys/netinet/pim_var.h create mode 100644 freebsd/sys/netinet/raw_ip.c create mode 100644 freebsd/sys/netinet/sctp.h create mode 100644 freebsd/sys/netinet/sctp_asconf.c create mode 100644 freebsd/sys/netinet/sctp_asconf.h create mode 100644 freebsd/sys/netinet/sctp_auth.c create mode 100644 freebsd/sys/netinet/sctp_auth.h create mode 100644 freebsd/sys/netinet/sctp_bsd_addr.c create mode 100644 freebsd/sys/netinet/sctp_bsd_addr.h create mode 100644 freebsd/sys/netinet/sctp_cc_functions.c create mode 100644 freebsd/sys/netinet/sctp_cc_functions.h create mode 100644 freebsd/sys/netinet/sctp_constants.h create mode 100644 freebsd/sys/netinet/sctp_crc32.c create mode 100644 freebsd/sys/netinet/sctp_crc32.h create mode 100644 freebsd/sys/netinet/sctp_header.h create mode 100644 freebsd/sys/netinet/sctp_indata.c create mode 100644 freebsd/sys/netinet/sctp_indata.h create mode 100644 freebsd/sys/netinet/sctp_input.c create mode 100644 freebsd/sys/netinet/sctp_input.h create mode 100644 freebsd/sys/netinet/sctp_lock_bsd.h create mode 100644 freebsd/sys/netinet/sctp_os.h create mode 100644 freebsd/sys/netinet/sctp_os_bsd.h create mode 100644 freebsd/sys/netinet/sctp_output.c create mode 100644 freebsd/sys/netinet/sctp_output.h create mode 100644 freebsd/sys/netinet/sctp_pcb.c create mode 100644 freebsd/sys/netinet/sctp_pcb.h create mode 100644 freebsd/sys/netinet/sctp_peeloff.c create mode 100644 freebsd/sys/netinet/sctp_peeloff.h create mode 100644 freebsd/sys/netinet/sctp_structs.h create mode 100644 freebsd/sys/netinet/sctp_sysctl.c create mode 100644 freebsd/sys/netinet/sctp_sysctl.h create mode 100644 freebsd/sys/netinet/sctp_timer.c create mode 100644 freebsd/sys/netinet/sctp_timer.h create mode 100644 freebsd/sys/netinet/sctp_uio.h create mode 100644 freebsd/sys/netinet/sctp_usrreq.c create mode 100644 freebsd/sys/netinet/sctp_var.h create mode 100644 freebsd/sys/netinet/sctputil.c create mode 100644 freebsd/sys/netinet/sctputil.h create mode 100644 freebsd/sys/netinet/tcp.h create mode 100644 freebsd/sys/netinet/tcp_debug.c create mode 100644 freebsd/sys/netinet/tcp_debug.h create mode 100644 freebsd/sys/netinet/tcp_fsm.h create mode 100644 freebsd/sys/netinet/tcp_hostcache.c create mode 100644 freebsd/sys/netinet/tcp_hostcache.h create mode 100644 freebsd/sys/netinet/tcp_input.c create mode 100644 freebsd/sys/netinet/tcp_lro.c create mode 100644 freebsd/sys/netinet/tcp_lro.h create mode 100644 freebsd/sys/netinet/tcp_offload.c create mode 100644 freebsd/sys/netinet/tcp_offload.h create mode 100644 freebsd/sys/netinet/tcp_output.c create mode 100644 freebsd/sys/netinet/tcp_reass.c create mode 100644 freebsd/sys/netinet/tcp_sack.c create mode 100644 
freebsd/sys/netinet/tcp_seq.h
create mode 100644 freebsd/sys/netinet/tcp_subr.c
create mode 100644 freebsd/sys/netinet/tcp_syncache.c
create mode 100644 freebsd/sys/netinet/tcp_syncache.h
create mode 100644 freebsd/sys/netinet/tcp_timer.c
create mode 100644 freebsd/sys/netinet/tcp_timer.h
create mode 100644 freebsd/sys/netinet/tcp_timewait.c
create mode 100644 freebsd/sys/netinet/tcp_usrreq.c
create mode 100644 freebsd/sys/netinet/tcp_var.h
create mode 100644 freebsd/sys/netinet/tcpip.h
create mode 100644 freebsd/sys/netinet/toedev.h
create mode 100644 freebsd/sys/netinet/udp.h
create mode 100644 freebsd/sys/netinet/udp_usrreq.c
create mode 100644 freebsd/sys/netinet/udp_var.h

(limited to 'freebsd/sys/netinet')

diff --git a/freebsd/sys/netinet/accf_data.c b/freebsd/sys/netinet/accf_data.c
new file mode 100644
index 00000000..15696daf
--- /dev/null
+++ b/freebsd/sys/netinet/accf_data.c
@@ -0,0 +1,68 @@
+#include <machine/rtems-bsd-config.h>
+
+/*-
+ * Copyright (c) 2000 Alfred Perlstein <alfred@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define ACCEPT_FILTER_MOD
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/socketvar.h>
+
+/* accept filter that holds a socket until data arrives */
+
+static int	sohasdata(struct socket *so, void *arg, int waitflag);
+
+static struct accept_filter accf_data_filter = {
+	"dataready",
+	sohasdata,
+	NULL,
+	NULL
+};
+
+static moduledata_t accf_data_mod = {
+	"accf_data",
+	accept_filt_generic_mod_event,
+	&accf_data_filter
+};
+
+DECLARE_MODULE(accf_data, accf_data_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+
+static int
+sohasdata(struct socket *so, void *arg, int waitflag)
+{
+
+	if (!soreadable(so))
+		return (SU_OK);
+
+	return (SU_ISCONNECTED);
+}
diff --git a/freebsd/sys/netinet/accf_dns.c b/freebsd/sys/netinet/accf_dns.c
new file mode 100644
index 00000000..f91cbb08
--- /dev/null
+++ b/freebsd/sys/netinet/accf_dns.c
@@ -0,0 +1,134 @@
+#include <machine/rtems-bsd-config.h>
+
+/*
+ * Copyright (C) 2007 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#define ACCEPT_FILTER_MOD + +#include +#include +#include +#include +#include +#include +#include + +/* check for full DNS request */ +static int sohasdns(struct socket *so, void *arg, int waitflag); + +struct packet { + struct mbuf *m; /* Current mbuf. */ + struct mbuf *n; /* nextpkt mbuf. */ + unsigned long moff; /* Offset of the beginning of m. */ + unsigned long offset; /* Which offset we are working at. */ + unsigned long len; /* The number of bytes we have to play with. */ +}; + +#define DNS_OK 0 +#define DNS_WAIT -1 +#define DNS_RUN -2 + +/* check we can skip over various parts of DNS request */ +static int skippacket(struct sockbuf *sb); + +static struct accept_filter accf_dns_filter = { + "dnsready", + sohasdns, + NULL, + NULL +}; + +static moduledata_t accf_dns_mod = { + "accf_dns", + accept_filt_generic_mod_event, + &accf_dns_filter +}; + +DECLARE_MODULE(accf_dns, accf_dns_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static int +sohasdns(struct socket *so, void *arg, int waitflag) +{ + struct sockbuf *sb = &so->so_rcv; + + /* If the socket is full, we're ready. */ + if (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax) + goto ready; + + /* Check to see if we have a request. 
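+	 * (A DNS request over TCP is preceded by a two-octet length
+	 * field, RFC 1035 section 4.2.2; skippacket() reports DNS_WAIT
+	 * until that many octets have been queued.)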
*/ + if (skippacket(sb) == DNS_WAIT) + return (SU_OK); + +ready: + return (SU_ISCONNECTED); +} + +#define GET8(p, val) do { \ + if (p->offset < p->moff) \ + return DNS_RUN; \ + while (p->offset >= p->moff + p->m->m_len) { \ + p->moff += p->m->m_len; \ + p->m = p->m->m_next; \ + if (p->m == NULL) { \ + p->m = p->n; \ + p->n = p->m->m_nextpkt; \ + } \ + if (p->m == NULL) \ + return DNS_WAIT; \ + } \ + val = *(mtod(p->m, unsigned char *) + (p->offset - p->moff)); \ + p->offset++; \ + } while (0) + +#define GET16(p, val) do { \ + unsigned int v0, v1; \ + GET8(p, v0); \ + GET8(p, v1); \ + val = v0 * 0x100 + v1; \ + } while (0) + +static int +skippacket(struct sockbuf *sb) { + unsigned long packlen; + struct packet q, *p = &q; + + if (sb->sb_cc < 2) + return DNS_WAIT; + + q.m = sb->sb_mb; + q.n = q.m->m_nextpkt; + q.moff = 0; + q.offset = 0; + q.len = sb->sb_cc; + + GET16(p, packlen); + if (packlen + 2 > q.len) + return DNS_WAIT; + + return DNS_OK; +} diff --git a/freebsd/sys/netinet/accf_http.c b/freebsd/sys/netinet/accf_http.c new file mode 100644 index 00000000..ce21b1d1 --- /dev/null +++ b/freebsd/sys/netinet/accf_http.c @@ -0,0 +1,351 @@ +#include + +/*- + * Copyright (c) 2000 Paycounter, Inc. + * Author: Alfred Perlstein , + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#define ACCEPT_FILTER_MOD + +#include +#include +#include +#include +#include +#include +#include + +/* check for GET/HEAD */ +static int sohashttpget(struct socket *so, void *arg, int waitflag); +/* check for HTTP/1.0 or HTTP/1.1 */ +static int soparsehttpvers(struct socket *so, void *arg, int waitflag); +/* check for end of HTTP/1.x request */ +static int soishttpconnected(struct socket *so, void *arg, int waitflag); +/* strcmp on an mbuf chain */ +static int mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp); +/* strncmp on an mbuf chain */ +static int mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, + int max, char *cmp); +/* socketbuffer is full */ +static int sbfull(struct sockbuf *sb); + +static struct accept_filter accf_http_filter = { + "httpready", + sohashttpget, + NULL, + NULL +}; + +static moduledata_t accf_http_mod = { + "accf_http", + accept_filt_generic_mod_event, + &accf_http_filter +}; + +DECLARE_MODULE(accf_http, accf_http_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); + +static int parse_http_version = 1; + +SYSCTL_NODE(_net_inet_accf, OID_AUTO, http, CTLFLAG_RW, 0, +"HTTP accept filter"); +SYSCTL_INT(_net_inet_accf_http, OID_AUTO, parsehttpversion, CTLFLAG_RW, +&parse_http_version, 1, +"Parse http version so that non 1.x requests work"); + +#ifdef ACCF_HTTP_DEBUG +#define DPRINT(fmt, args...) \ + do { \ + printf("%s:%d: " fmt "\n", __func__, __LINE__, ##args); \ + } while (0) +#else +#define DPRINT(fmt, args...) +#endif + +static int +sbfull(struct sockbuf *sb) +{ + + DPRINT("sbfull, cc(%ld) >= hiwat(%ld): %d, " + "mbcnt(%ld) >= mbmax(%ld): %d", + sb->sb_cc, sb->sb_hiwat, sb->sb_cc >= sb->sb_hiwat, + sb->sb_mbcnt, sb->sb_mbmax, sb->sb_mbcnt >= sb->sb_mbmax); + return (sb->sb_cc >= sb->sb_hiwat || sb->sb_mbcnt >= sb->sb_mbmax); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + */ +static int +mbufstrcmp(struct mbuf *m, struct mbuf *npkt, int offset, char *cmp) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++) { + if (*cmp == '\0') + return (1); + else if (*cmp != *(mtod(m, char *) + offset)) + return (0); + } + if (*cmp == '\0') + return (1); + offset = 0; + } + } + return (0); +} + +/* + * start at mbuf m, (must provide npkt if exists) + * starting at offset in m compare characters in mbuf chain for 'cmp' + * stop at 'max' characters + */ +static int +mbufstrncmp(struct mbuf *m, struct mbuf *npkt, int offset, int max, char *cmp) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + n = npkt; + if (npkt) + npkt = npkt->m_nextpkt; + for (; m; m = m->m_next) { + for (; offset < m->m_len; offset++, cmp++, max--) { + if (max == 0 || *cmp == '\0') + return (1); + else if (*cmp != *(mtod(m, char *) + offset)) + return (0); + } + if (max == 0 || *cmp == '\0') + return (1); + offset = 0; + } + } + return (0); +} + +#define STRSETUP(sptr, slen, str) \ + do { \ + sptr = str; \ + slen = sizeof(str) - 1; \ + } while(0) + +static int +sohashttpget(struct socket *so, void *arg, int waitflag) +{ + + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 && !sbfull(&so->so_rcv)) { + struct mbuf *m; + char *cmp; + int cmplen, cc; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc - 1; + if (cc < 1) + return (SU_OK); + switch (*mtod(m, char *)) { + case 'G': + STRSETUP(cmp, cmplen, "ET "); + break; + case 'H': + STRSETUP(cmp, 
cmplen, "EAD "); + break; + default: + goto fallout; + } + if (cc < cmplen) { + if (mbufstrncmp(m, m->m_nextpkt, 1, cc, cmp) == 1) { + DPRINT("short cc (%d) but mbufstrncmp ok", cc); + return (SU_OK); + } else { + DPRINT("short cc (%d) mbufstrncmp failed", cc); + goto fallout; + } + } + if (mbufstrcmp(m, m->m_nextpkt, 1, cmp) == 1) { + DPRINT("mbufstrcmp ok"); + if (parse_http_version == 0) + return (soishttpconnected(so, arg, waitflag)); + else + return (soparsehttpvers(so, arg, waitflag)); + } + DPRINT("mbufstrcmp bad"); + } + +fallout: + DPRINT("fallout"); + return (SU_ISCONNECTED); +} + +static int +soparsehttpvers(struct socket *so, void *arg, int waitflag) +{ + struct mbuf *m, *n; + int i, cc, spaces, inspaces; + + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto fallout; + + m = so->so_rcv.sb_mb; + cc = so->so_rcv.sb_cc; + inspaces = spaces = 0; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + for (i = 0; i < m->m_len; i++, cc--) { + switch (*(mtod(m, char *) + i)) { + case ' ': + /* tabs? '\t' */ + if (!inspaces) { + spaces++; + inspaces = 1; + } + break; + case '\r': + case '\n': + DPRINT("newline"); + goto fallout; + default: + if (spaces != 2) { + inspaces = 0; + break; + } + + /* + * if we don't have enough characters + * left (cc < sizeof("HTTP/1.0") - 1) + * then see if the remaining ones + * are a request we can parse. + */ + if (cc < sizeof("HTTP/1.0") - 1) { + if (mbufstrncmp(m, n, i, cc, + "HTTP/1.") == 1) { + DPRINT("ok"); + goto readmore; + } else { + DPRINT("bad"); + goto fallout; + } + } else if ( + mbufstrcmp(m, n, i, "HTTP/1.0") || + mbufstrcmp(m, n, i, "HTTP/1.1")) { + DPRINT("ok"); + return (soishttpconnected(so, + arg, waitflag)); + } else { + DPRINT("bad"); + goto fallout; + } + } + } + } + } +readmore: + DPRINT("readmore"); + /* + * if we hit here we haven't hit something + * we don't understand or a newline, so try again + */ + soupcall_set(so, SO_RCV, soparsehttpvers, arg); + return (SU_OK); + +fallout: + DPRINT("fallout"); + return (SU_ISCONNECTED); +} + + +#define NCHRS 3 + +static int +soishttpconnected(struct socket *so, void *arg, int waitflag) +{ + char a, b, c; + struct mbuf *m, *n; + int ccleft, copied; + + DPRINT("start"); + if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0 || sbfull(&so->so_rcv)) + goto gotit; + + /* + * Walk the socketbuffer and copy the last NCHRS (3) into a, b, and c + * copied - how much we've copied so far + * ccleft - how many bytes remaining in the socketbuffer + * just loop over the mbufs subtracting from 'ccleft' until we only + * have NCHRS left + */ + copied = 0; + ccleft = so->so_rcv.sb_cc; + if (ccleft < NCHRS) + goto readmore; + a = b = c = '\0'; + for (m = so->so_rcv.sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + ccleft -= m->m_len; + if (ccleft <= NCHRS) { + char *src; + int tocopy; + + tocopy = (NCHRS - ccleft) - copied; + src = mtod(m, char *) + (m->m_len - tocopy); + + while (tocopy--) { + switch (copied++) { + case 0: + a = *src++; + break; + case 1: + b = *src++; + break; + case 2: + c = *src++; + break; + } + } + } + } + } + if (c == '\n' && (b == '\n' || (b == '\r' && a == '\n'))) { + /* we have all request headers */ + goto gotit; + } + +readmore: + soupcall_set(so, SO_RCV, soishttpconnected, arg); + return (SU_OK); + +gotit: + return (SU_ISCONNECTED); +} diff --git a/freebsd/sys/netinet/icmp6.h b/freebsd/sys/netinet/icmp6.h new file mode 100644 index 00000000..bf61ac5b --- /dev/null +++ 
b/freebsd/sys/netinet/icmp6.h
@@ -0,0 +1,2 @@
+#include
+#include
diff --git a/freebsd/sys/netinet/icmp_var.h b/freebsd/sys/netinet/icmp_var.h
new file mode 100644
index 00000000..d55fc4d3
--- /dev/null
+++ b/freebsd/sys/netinet/icmp_var.h
@@ -0,0 +1,108 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)icmp_var.h	8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_ICMP_VAR_HH_
+#define _NETINET_ICMP_VAR_HH_
+
+
+/*
+ * Variables related to this implementation
+ * of the internet control message protocol.
+ */
+struct icmpstat {
+/* statistics related to icmp packets generated */
+	u_long	icps_error;		/* # of calls to icmp_error */
+	u_long	icps_oldshort;		/* no error 'cuz old ip too short */
+	u_long	icps_oldicmp;		/* no error 'cuz old was icmp */
+	u_long	icps_outhist[ICMP_MAXTYPE + 1];
+/* statistics related to input messages processed */
+	u_long	icps_badcode;		/* icmp_code out of range */
+	u_long	icps_tooshort;		/* packet < ICMP_MINLEN */
+	u_long	icps_checksum;		/* bad checksum */
+	u_long	icps_badlen;		/* calculated bound mismatch */
+	u_long	icps_reflect;		/* number of responses */
+	u_long	icps_inhist[ICMP_MAXTYPE + 1];
+	u_long	icps_bmcastecho;	/* b/mcast echo requests dropped */
+	u_long	icps_bmcasttstamp;	/* b/mcast tstamp requests dropped */
+	u_long	icps_badaddr;		/* bad return address */
+	u_long	icps_noroute;		/* no route back */
+};
+
+#ifdef _KERNEL
+/*
+ * In-kernel consumers can use these accessor macros directly to update
+ * stats.
+ */
+#define	ICMPSTAT_ADD(name, val)	V_icmpstat.name += (val)
+#define	ICMPSTAT_INC(name)	ICMPSTAT_ADD(name, 1)
+
+/*
+ * Kernel module consumers must use this accessor macro.
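+ * For example, a (hypothetical) module that noticed a bad checksum
+ * would write
+ *
+ *	KMOD_ICMPSTAT_INC(icps_checksum);
+ *
+ * which passes the field's u_long slot index to kmod_icmpstat_inc().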
+ */ +void kmod_icmpstat_inc(int statnum); +#define KMOD_ICMPSTAT_INC(name) \ + kmod_icmpstat_inc(offsetof(struct icmpstat, name) / sizeof(u_long)) +#endif + +/* + * Names for ICMP sysctl objects + */ +#define ICMPCTL_MASKREPL 1 /* allow replies to netmask requests */ +#define ICMPCTL_STATS 2 /* statistics (read-only) */ +#define ICMPCTL_ICMPLIM 3 +#define ICMPCTL_MAXID 4 + +#define ICMPCTL_NAMES { \ + { 0, 0 }, \ + { "maskrepl", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "icmplim", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_icmp); + +VNET_DECLARE(struct icmpstat, icmpstat); /* icmp statistics. */ +#define V_icmpstat VNET(icmpstat) + +extern int badport_bandlim(int); +#define BANDLIM_UNLIMITED -1 +#define BANDLIM_ICMP_UNREACH 0 +#define BANDLIM_ICMP_ECHO 1 +#define BANDLIM_ICMP_TSTAMP 2 +#define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */ +#define BANDLIM_RST_OPENPORT 4 /* No connection, listener */ +#define BANDLIM_ICMP6_UNREACH 5 +#define BANDLIM_MAX 5 +#endif + +#endif diff --git a/freebsd/sys/netinet/if_atm.c b/freebsd/sys/netinet/if_atm.c new file mode 100644 index 00000000..ea6c567d --- /dev/null +++ b/freebsd/sys/netinet/if_atm.c @@ -0,0 +1,366 @@ +#include + +/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ + +/*- + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); + +/* + * IP <=> ATM address resolution. 
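+ *
+ * For PVCs the ATM parameters are carried in the route's sockaddr_dl
+ * gateway; atm_rtrequest() below decodes flags, VPI, VCI and optional
+ * traffic parameters from that address via the GET{1,2,3}BYTE macros.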
+ */ +#include +#include +#include + +#if defined(INET) || defined(INET6) + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#ifdef NATM +#include +#endif + +#define SDL(s) ((struct sockaddr_dl *)s) + +#define GET3BYTE(V, A, L) do { \ + (V) = ((A)[0] << 16) | ((A)[1] << 8) | (A)[2]; \ + (A) += 3; \ + (L) -= 3; \ + } while (0) + +#define GET2BYTE(V, A, L) do { \ + (V) = ((A)[0] << 8) | (A)[1]; \ + (A) += 2; \ + (L) -= 2; \ + } while (0) + +#define GET1BYTE(V, A, L) do { \ + (V) = *(A)++; \ + (L)--; \ + } while (0) + + +/* + * atm_rtrequest: handle ATM rt request (in support of generic code) + * inputs: "req" = request code + * "rt" = route entry + * "info" = rt_addrinfo + */ +void +atm_rtrequest(int req, struct rtentry *rt, struct rt_addrinfo *info) +{ + struct sockaddr *gate = rt->rt_gateway; + struct atmio_openvcc op; + struct atmio_closevcc cl; + u_char *addr; + u_int alen; +#ifdef NATM + struct sockaddr_in *sin; + struct natmpcb *npcb = NULL; +#endif + static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; + + if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ + return; + + switch (req) { + + case RTM_RESOLVE: /* resolve: only happens when cloning */ + printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); + break; + + case RTM_ADD: + /* + * route added by a command (e.g. ifconfig, route, arp...). + * + * first check to see if this is not a host route, in which + * case we are being called via "ifconfig" to set the address. + */ + if ((rt->rt_flags & RTF_HOST) == 0) { + rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); + gate = rt->rt_gateway; + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + break; + } + + if (gate->sa_family != AF_LINK || + gate->sa_len < sizeof(null_sdl)) { + log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); + break; + } + + KASSERT(rt->rt_ifp->if_ioctl != NULL, + ("atm_rtrequest: null ioctl")); + + /* + * Parse and verify the link level address as + * an open request + */ +#ifdef NATM + NATM_LOCK(); +#endif + bzero(&op, sizeof(op)); + addr = LLADDR(SDL(gate)); + alen = SDL(gate)->sdl_alen; + if (alen < 4) { + printf("%s: bad link-level address\n", __func__); + goto failed; + } + + if (alen == 4) { + /* old type address */ + GET1BYTE(op.param.flags, addr, alen); + GET1BYTE(op.param.vpi, addr, alen); + GET2BYTE(op.param.vci, addr, alen); + op.param.traffic = ATMIO_TRAFFIC_UBR; + op.param.aal = (op.param.flags & ATM_PH_AAL5) ? 
+ ATMIO_AAL_5 : ATMIO_AAL_0; + } else { + /* new address */ + op.param.aal = ATMIO_AAL_5; + + GET1BYTE(op.param.flags, addr, alen); + op.param.flags &= ATM_PH_LLCSNAP; + + GET1BYTE(op.param.vpi, addr, alen); + GET2BYTE(op.param.vci, addr, alen); + + GET1BYTE(op.param.traffic, addr, alen); + + switch (op.param.traffic) { + + case ATMIO_TRAFFIC_UBR: + if (alen >= 3) + GET3BYTE(op.param.tparam.pcr, + addr, alen); + break; + + case ATMIO_TRAFFIC_CBR: + if (alen < 3) + goto bad_param; + GET3BYTE(op.param.tparam.pcr, addr, alen); + break; + + case ATMIO_TRAFFIC_VBR: + if (alen < 3 * 3) + goto bad_param; + GET3BYTE(op.param.tparam.pcr, addr, alen); + GET3BYTE(op.param.tparam.scr, addr, alen); + GET3BYTE(op.param.tparam.mbs, addr, alen); + break; + + case ATMIO_TRAFFIC_ABR: + if (alen < 4 * 3 + 2 + 1 * 2 + 3) + goto bad_param; + GET3BYTE(op.param.tparam.pcr, addr, alen); + GET3BYTE(op.param.tparam.mcr, addr, alen); + GET3BYTE(op.param.tparam.icr, addr, alen); + GET3BYTE(op.param.tparam.tbe, addr, alen); + GET1BYTE(op.param.tparam.nrm, addr, alen); + GET1BYTE(op.param.tparam.trm, addr, alen); + GET2BYTE(op.param.tparam.adtf, addr, alen); + GET1BYTE(op.param.tparam.rif, addr, alen); + GET1BYTE(op.param.tparam.rdf, addr, alen); + GET1BYTE(op.param.tparam.cdf, addr, alen); + break; + + default: + bad_param: + printf("%s: bad traffic params\n", __func__); + goto failed; + } + } + op.param.rmtu = op.param.tmtu = rt->rt_ifp->if_mtu; +#ifdef NATM + /* + * let native ATM know we are using this VCI/VPI + * (i.e. reserve it) + */ + sin = (struct sockaddr_in *) rt_key(rt); + if (sin->sin_family != AF_INET) + goto failed; + npcb = npcb_add(NULL, rt->rt_ifp, op.param.vci, op.param.vpi); + if (npcb == NULL) + goto failed; + npcb->npcb_flags |= NPCB_IP; + npcb->ipaddr.s_addr = sin->sin_addr.s_addr; + /* XXX: move npcb to llinfo when ATM ARP is ready */ + rt->rt_llinfo = (caddr_t) npcb; + rt->rt_flags |= RTF_LLINFO; +#endif + /* + * let the lower level know this circuit is active + */ + op.rxhand = NULL; + op.param.flags |= ATMIO_FLAG_ASYNC; + if (rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMOPENVCC, + (caddr_t)&op) != 0) { + printf("atm: couldn't add VC\n"); + goto failed; + } + + SDL(gate)->sdl_type = rt->rt_ifp->if_type; + SDL(gate)->sdl_index = rt->rt_ifp->if_index; + +#ifdef NATM + NATM_UNLOCK(); +#endif + break; + +failed: +#ifdef NATM + if (npcb) { + npcb_free(npcb, NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + } + NATM_UNLOCK(); +#endif + /* mark as invalid. We cannot RTM_DELETE the route from + * here, because the recursive call to rtrequest1 does + * not really work. */ + rt->rt_flags |= RTF_REJECT; + break; + + case RTM_DELETE: +#ifdef NATM + /* + * tell native ATM we are done with this VC + */ + if (rt->rt_flags & RTF_LLINFO) { + NATM_LOCK(); + npcb_free((struct natmpcb *)rt->rt_llinfo, + NPCB_DESTROY); + rt->rt_llinfo = NULL; + rt->rt_flags &= ~RTF_LLINFO; + NATM_UNLOCK(); + } +#endif + /* + * tell the lower layer to disable this circuit + */ + bzero(&op, sizeof(op)); + addr = LLADDR(SDL(gate)); + addr++; + cl.vpi = *addr++; + cl.vci = *addr++ << 8; + cl.vci |= *addr++; + (void)rt->rt_ifp->if_ioctl(rt->rt_ifp, SIOCATMCLOSEVCC, + (caddr_t)&cl); + break; + } +} + +/* + * atmresolve: + * inputs: + * [1] "rt" = the link level route to use (or null if need to look one up) + * [2] "m" = mbuf containing the data to be sent + * [3] "dst" = sockaddr_in (IP) address of dest. 
+ * output: + * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info + * return: + * 0 == resolve FAILED; note that "m" gets m_freem'd in this case + * 1 == resolve OK; desten contains result + * + * XXX: will need more work if we wish to support ATMARP in the kernel, + * but this is enough for PVCs entered via the "route" command. + */ +int +atmresolve(struct rtentry *rt, struct mbuf *m, struct sockaddr *dst, + struct atm_pseudohdr *desten) +{ + struct sockaddr_dl *sdl; + + if (m->m_flags & (M_BCAST | M_MCAST)) { + log(LOG_INFO, + "atmresolve: BCAST/MCAST packet detected/dumped\n"); + goto bad; + } + + if (rt == NULL) { + rt = RTALLOC1(dst, 0); /* link level on table 0 XXX MRT */ + if (rt == NULL) + goto bad; /* failed */ + RT_REMREF(rt); /* don't keep LL references */ + if ((rt->rt_flags & RTF_GATEWAY) != 0 || + rt->rt_gateway->sa_family != AF_LINK) { + RT_UNLOCK(rt); + goto bad; + } + RT_UNLOCK(rt); + } + + /* + * note that rt_gateway is a sockaddr_dl which contains the + * atm_pseudohdr data structure for this route. we currently + * don't need any rt_llinfo info (but will if we want to support + * ATM ARP [c.f. if_ether.c]). + */ + sdl = SDL(rt->rt_gateway); + + /* + * Check the address family and length is valid, the address + * is resolved; otherwise, try to resolve. + */ + if (sdl->sdl_family == AF_LINK && sdl->sdl_alen >= sizeof(*desten)) { + bcopy(LLADDR(sdl), desten, sizeof(*desten)); + return (1); /* ok, go for it! */ + } + + /* + * we got an entry, but it doesn't have valid link address + * info in it (it is prob. the interface route, which has + * sdl_alen == 0). dump packet. (fall through to "bad"). + */ +bad: + m_freem(m); + return (0); +} +#endif /* INET */ diff --git a/freebsd/sys/netinet/if_atm.h b/freebsd/sys/netinet/if_atm.h new file mode 100644 index 00000000..bd8b5143 --- /dev/null +++ b/freebsd/sys/netinet/if_atm.h @@ -0,0 +1,47 @@ +/* $FreeBSD$ */ +/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ + +/*- + * + * Copyright (c) 1996 Charles D. Cranor and Washington University. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Charles D. Cranor and + * Washington University. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * if_atm.h + */ + +struct atm_pseudohdr; +struct mbuf; +struct rtentry; +struct sockaddr; + +void atm_rtrequest(int, struct rtentry *, struct rt_addrinfo *); +int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, + struct atm_pseudohdr *); diff --git a/freebsd/sys/netinet/if_ether.c b/freebsd/sys/netinet/if_ether.c new file mode 100644 index 00000000..2e40c0d2 --- /dev/null +++ b/freebsd/sys/netinet/if_ether.c @@ -0,0 +1,859 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)if_ether.c 8.1 (Berkeley) 6/10/93 + */ + +/* + * Ethernet address resolution protocol. + * TODO: + * add "inuse/lock" bit (or ref. 
count) along with valid bit + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#if defined(INET) || defined(INET6) +#include +#endif + +#include +#include + +#include + +#define SIN(s) ((struct sockaddr_in *)s) +#define SDL(s) ((struct sockaddr_dl *)s) + +SYSCTL_DECL(_net_link_ether); +SYSCTL_NODE(_net_link_ether, PF_INET, inet, CTLFLAG_RW, 0, ""); +SYSCTL_NODE(_net_link_ether, PF_ARP, arp, CTLFLAG_RW, 0, ""); + +/* timer values */ +static VNET_DEFINE(int, arpt_keep) = (20*60); /* once resolved, good for 20 + * minutes */ +static VNET_DEFINE(int, arp_maxtries) = 5; +VNET_DEFINE(int, useloopback) = 1; /* use loopback interface for + * local traffic */ +static VNET_DEFINE(int, arp_proxyall) = 0; +static VNET_DEFINE(int, arpt_down) = 20; /* keep incomplete entries for + * 20 seconds */ +static VNET_DEFINE(struct arpstat, arpstat); /* ARP statistics, see if_arp.h */ + +#define V_arpt_keep VNET(arpt_keep) +#define V_arpt_down VNET(arpt_down) +#define V_arp_maxtries VNET(arp_maxtries) +#define V_arp_proxyall VNET(arp_proxyall) +#define V_arpstat VNET(arpstat) + +SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, + &VNET_NAME(arpt_keep), 0, + "ARP entry lifetime in seconds"); +SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, + &VNET_NAME(arp_maxtries), 0, + "ARP resolution attempts before returning error"); +SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, + &VNET_NAME(useloopback), 0, + "Use the loopback interface for local traffic"); +SYSCTL_VNET_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW, + &VNET_NAME(arp_proxyall), 0, + "Enable proxy ARP for all suitable requests"); +SYSCTL_VNET_STRUCT(_net_link_ether_arp, OID_AUTO, stats, CTLFLAG_RW, + &VNET_NAME(arpstat), arpstat, + "ARP statistics (struct arpstat, net/if_arp.h)"); + +static void arp_init(void); +void arprequest(struct ifnet *, + struct in_addr *, struct in_addr *, u_char *); +static void arpintr(struct mbuf *); +static void arptimer(void *); +#ifdef INET +static void in_arpinput(struct mbuf *); +#endif + +static const struct netisr_handler arp_nh = { + .nh_name = "arp", + .nh_handler = arpintr, + .nh_proto = NETISR_ARP, + .nh_policy = NETISR_POLICY_SOURCE, +}; + +#ifdef AF_INET +void arp_ifscrub(struct ifnet *ifp, uint32_t addr); + +/* + * called by in_ifscrub to remove entry from the table when + * the interface goes away + */ +void +arp_ifscrub(struct ifnet *ifp, uint32_t addr) +{ + struct sockaddr_in addr4; + + bzero((void *)&addr4, sizeof(addr4)); + addr4.sin_len = sizeof(addr4); + addr4.sin_family = AF_INET; + addr4.sin_addr.s_addr = addr; + IF_AFDATA_LOCK(ifp); + lla_lookup(LLTABLE(ifp), (LLE_DELETE | LLE_IFADDR), + (struct sockaddr *)&addr4); + IF_AFDATA_UNLOCK(ifp); +} +#endif + +/* + * Timeout routine. Age arp_tab entries periodically. 
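+ * (Armed per llentry from arpresolve() via callout_reset(); an expired
+ * entry that is not LLE_STATIC is unlinked and freed here.)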
+ */ +static void +arptimer(void *arg) +{ + struct ifnet *ifp; + struct llentry *lle; + + KASSERT(arg != NULL, ("%s: arg NULL", __func__)); + lle = (struct llentry *)arg; + ifp = lle->lle_tbl->llt_ifp; + CURVNET_SET(ifp->if_vnet); + IF_AFDATA_LOCK(ifp); + LLE_WLOCK(lle); + if (lle->la_flags & LLE_STATIC) + LLE_WUNLOCK(lle); + else { + if (!callout_pending(&lle->la_timer) && + callout_active(&lle->la_timer)) { + callout_stop(&lle->la_timer); + LLE_REMREF(lle); + (void) llentry_free(lle); + ARPSTAT_INC(timeouts); + } +#ifdef DIAGNOSTIC + else { + struct sockaddr *l3addr = L3_ADDR(lle); + log(LOG_INFO, + "arptimer issue: %p, IPv4 address: \"%s\"\n", lle, + inet_ntoa( + ((const struct sockaddr_in *)l3addr)->sin_addr)); + } +#endif + } + IF_AFDATA_UNLOCK(ifp); + CURVNET_RESTORE(); +} + +/* + * Broadcast an ARP request. Caller specifies: + * - arp header source ip address + * - arp header target ip address + * - arp header source ethernet address + */ +void +arprequest(struct ifnet *ifp, struct in_addr *sip, struct in_addr *tip, + u_char *enaddr) +{ + struct mbuf *m; + struct arphdr *ah; + struct sockaddr sa; + + if (sip == NULL) { + /* XXX don't believe this can happen (or explain why) */ + /* + * The caller did not supply a source address, try to find + * a compatible one among those assigned to this interface. + */ + struct ifaddr *ifa; + + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (!ifa->ifa_addr || + ifa->ifa_addr->sa_family != AF_INET) + continue; + sip = &SIN(ifa->ifa_addr)->sin_addr; + if (0 == ((sip->s_addr ^ tip->s_addr) & + SIN(ifa->ifa_netmask)->sin_addr.s_addr) ) + break; /* found it. */ + } + if (sip == NULL) { + printf("%s: cannot find matching address\n", __func__); + return; + } + } + + if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + return; + m->m_len = sizeof(*ah) + 2*sizeof(struct in_addr) + + 2*ifp->if_data.ifi_addrlen; + m->m_pkthdr.len = m->m_len; + MH_ALIGN(m, m->m_len); + ah = mtod(m, struct arphdr *); + bzero((caddr_t)ah, m->m_len); +#ifdef MAC + mac_netinet_arp_send(ifp, m); +#endif + ah->ar_pro = htons(ETHERTYPE_IP); + ah->ar_hln = ifp->if_addrlen; /* hardware address length */ + ah->ar_pln = sizeof(struct in_addr); /* protocol address length */ + ah->ar_op = htons(ARPOP_REQUEST); + bcopy((caddr_t)enaddr, (caddr_t)ar_sha(ah), ah->ar_hln); + bcopy((caddr_t)sip, (caddr_t)ar_spa(ah), ah->ar_pln); + bcopy((caddr_t)tip, (caddr_t)ar_tpa(ah), ah->ar_pln); + sa.sa_family = AF_ARP; + sa.sa_len = 2; + m->m_flags |= M_BCAST; + (*ifp->if_output)(ifp, m, &sa, NULL); + ARPSTAT_INC(txrequests); +} + +/* + * Resolve an IP address into an ethernet address. + * On input: + * ifp is the interface we use + * rt0 is the route to the final destination (possibly useless) + * m is the mbuf. May be NULL if we don't have a packet. + * dst is the next hop, + * desten is where we want the address. + * + * On success, desten is filled in and the function returns 0; + * If the packet must be held pending resolution, we return EWOULDBLOCK + * On other errors, we return the corresponding error code. + * Note that m_freem() handles NULL. 
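+ *
+ * A typical caller (ether_output(), in outline) treats EWOULDBLOCK as
+ * "packet queued, resolution in progress":
+ *
+ *	error = arpresolve(ifp, rt0, m, dst, desten, &lle);
+ *	if (error == EWOULDBLOCK)
+ *		return (0);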
+ */ +int +arpresolve(struct ifnet *ifp, struct rtentry *rt0, struct mbuf *m, + struct sockaddr *dst, u_char *desten, struct llentry **lle) +{ + struct llentry *la = 0; + u_int flags = 0; + int error, renew; + + *lle = NULL; + if (m != NULL) { + if (m->m_flags & M_BCAST) { + /* broadcast */ + (void)memcpy(desten, + ifp->if_broadcastaddr, ifp->if_addrlen); + return (0); + } + if (m->m_flags & M_MCAST && ifp->if_type != IFT_ARCNET) { + /* multicast */ + ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten); + return (0); + } + } + /* XXXXX + */ +retry: + IF_AFDATA_RLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), flags, dst); + IF_AFDATA_RUNLOCK(ifp); + if ((la == NULL) && ((flags & LLE_EXCLUSIVE) == 0) + && ((ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0)) { + flags |= (LLE_CREATE | LLE_EXCLUSIVE); + IF_AFDATA_WLOCK(ifp); + la = lla_lookup(LLTABLE(ifp), flags, dst); + IF_AFDATA_WUNLOCK(ifp); + } + if (la == NULL) { + if (flags & LLE_CREATE) + log(LOG_DEBUG, + "arpresolve: can't allocate llinfo for %s\n", + inet_ntoa(SIN(dst)->sin_addr)); + m_freem(m); + return (EINVAL); + } + + if ((la->la_flags & LLE_VALID) && + ((la->la_flags & LLE_STATIC) || la->la_expire > time_second)) { + bcopy(&la->ll_addr, desten, ifp->if_addrlen); + /* + * If entry has an expiry time and it is approaching, + * see if we need to send an ARP request within this + * arpt_down interval. + */ + if (!(la->la_flags & LLE_STATIC) && + time_second + la->la_preempt > la->la_expire) { + arprequest(ifp, NULL, + &SIN(dst)->sin_addr, IF_LLADDR(ifp)); + + la->la_preempt--; + } + + *lle = la; + error = 0; + goto done; + } + + if (la->la_flags & LLE_STATIC) { /* should not happen! */ + log(LOG_DEBUG, "arpresolve: ouch, empty static llinfo for %s\n", + inet_ntoa(SIN(dst)->sin_addr)); + m_freem(m); + error = EINVAL; + goto done; + } + + renew = (la->la_asked == 0 || la->la_expire != time_second); + if ((renew || m != NULL) && (flags & LLE_EXCLUSIVE) == 0) { + flags |= LLE_EXCLUSIVE; + LLE_RUNLOCK(la); + goto retry; + } + /* + * There is an arptab entry, but no ethernet address + * response yet. Replace the held mbuf with this + * latest one. + */ + if (m != NULL) { + if (la->la_hold != NULL) { + m_freem(la->la_hold); + ARPSTAT_INC(dropped); + } + la->la_hold = m; + if (renew == 0 && (flags & LLE_EXCLUSIVE)) { + flags &= ~LLE_EXCLUSIVE; + LLE_DOWNGRADE(la); + } + + } + /* + * Return EWOULDBLOCK if we have tried less than arp_maxtries. It + * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH + * if we have already sent arp_maxtries ARP requests. Retransmit the + * ARP request, but not faster than one request per second. + */ + if (la->la_asked < V_arp_maxtries) + error = EWOULDBLOCK; /* First request. */ + else + error = rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) ? + EHOSTUNREACH : EHOSTDOWN; + + if (renew) { + int canceled; + + LLE_ADDREF(la); + la->la_expire = time_second; + canceled = callout_reset(&la->la_timer, hz * V_arpt_down, + arptimer, la); + if (canceled) + LLE_REMREF(la); + la->la_asked++; + LLE_WUNLOCK(la); + arprequest(ifp, NULL, &SIN(dst)->sin_addr, + IF_LLADDR(ifp)); + return (error); + } +done: + if (flags & LLE_EXCLUSIVE) + LLE_WUNLOCK(la); + else + LLE_RUNLOCK(la); + return (error); +} + +/* + * Common length and type checks are done here, + * then the protocol-specific routine is called. 
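+ * (arpintr() validates ar_hrd, pulls the complete header into the
+ * first mbuf, and hands ETHERTYPE_IP requests to in_arpinput().)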
+ */ +static void +arpintr(struct mbuf *m) +{ + struct arphdr *ar; + + if (m->m_len < sizeof(struct arphdr) && + ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) { + log(LOG_ERR, "arp: runt packet -- m_pullup failed\n"); + return; + } + ar = mtod(m, struct arphdr *); + + if (ntohs(ar->ar_hrd) != ARPHRD_ETHER && + ntohs(ar->ar_hrd) != ARPHRD_IEEE802 && + ntohs(ar->ar_hrd) != ARPHRD_ARCNET && + ntohs(ar->ar_hrd) != ARPHRD_IEEE1394) { + log(LOG_ERR, "arp: unknown hardware address format (0x%2D)\n", + (unsigned char *)&ar->ar_hrd, ""); + m_freem(m); + return; + } + + if (m->m_len < arphdr_len(ar)) { + if ((m = m_pullup(m, arphdr_len(ar))) == NULL) { + log(LOG_ERR, "arp: runt packet\n"); + m_freem(m); + return; + } + ar = mtod(m, struct arphdr *); + } + + ARPSTAT_INC(received); + switch (ntohs(ar->ar_pro)) { +#ifdef INET + case ETHERTYPE_IP: + in_arpinput(m); + return; +#endif + } + m_freem(m); +} + +#ifdef INET +/* + * ARP for Internet protocols on 10 Mb/s Ethernet. + * Algorithm is that given in RFC 826. + * In addition, a sanity check is performed on the sender + * protocol address, to catch impersonators. + * We no longer handle negotiations for use of trailer protocol: + * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent + * along with IP replies if we wanted trailers sent to us, + * and also sent them in response to IP replies. + * This allowed either end to announce the desire to receive + * trailer packets. + * We no longer reply to requests for ETHERTYPE_TRAIL protocol either, + * but formerly didn't normally send requests. + */ +static int log_arp_wrong_iface = 1; +static int log_arp_movements = 1; +static int log_arp_permanent_modify = 1; + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW, + &log_arp_wrong_iface, 0, + "log arp packets arriving on the wrong interface"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW, + &log_arp_movements, 0, + "log arp replies from MACs different than the one in the cache"); +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW, + &log_arp_permanent_modify, 0, + "log arp replies from MACs different than the one in the permanent arp entry"); + + +static void +in_arpinput(struct mbuf *m) +{ + struct arphdr *ah; + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct llentry *la = NULL; + struct rtentry *rt; + struct ifaddr *ifa; + struct in_ifaddr *ia; + struct mbuf *hold; + struct sockaddr sa; + struct in_addr isaddr, itaddr, myaddr; + u_int8_t *enaddr = NULL; + int op, flags; + int req_len; + int bridged = 0, is_bridge = 0; + int carp_match = 0; + struct sockaddr_in sin; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = 0; + + if (ifp->if_bridge) + bridged = 1; + if (ifp->if_type == IFT_BRIDGE) + is_bridge = 1; + + req_len = arphdr_len2(ifp->if_addrlen, sizeof(struct in_addr)); + if (m->m_len < req_len && (m = m_pullup(m, req_len)) == NULL) { + log(LOG_ERR, "in_arp: runt packet -- m_pullup failed\n"); + return; + } + + ah = mtod(m, struct arphdr *); + op = ntohs(ah->ar_op); + (void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr)); + (void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr)); + + if (op == ARPOP_REPLY) + ARPSTAT_INC(rxreplies); + + /* + * For a bridge, we want to check the address irrespective + * of the receive interface. (This will change slightly + * when we have clusters of interfaces). 
+ * If the interface does not match, but the receiving interface
+ * is part of carp, we call carp_iamatch to see if this is a
+ * request for the virtual host ip.
+ * XXX: This is really ugly!
+ */
+ IN_IFADDR_RLOCK();
+ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ ia->ia_ifp == ifp) &&
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ if (ifp->if_carp != NULL &&
+ (*carp_iamatch_p)(ifp, ia, &isaddr, &enaddr) &&
+ itaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ carp_match = 1;
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ }
+ LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
+ if (((bridged && ia->ia_ifp->if_bridge != NULL) ||
+ ia->ia_ifp == ifp) &&
+ isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+
+#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \
+ (ia->ia_ifp->if_bridge == ifp->if_softc && \
+ !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) && \
+ addr == ia->ia_addr.sin_addr.s_addr)
+ /*
+ * Check the case when bridge shares its MAC address with
+ * some of its children, so packets are claimed by bridge
+ * itself (bridge_input() does it first), but they are really
+ * meant to be destined to the bridge member.
+ */
+ if (is_bridge) {
+ LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
+ if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
+ ifa_ref(&ia->ia_ifa);
+ ifp = ia->ia_ifp;
+ IN_IFADDR_RUNLOCK();
+ goto match;
+ }
+ }
+ }
+#undef BDG_MEMBER_MATCHES_ARP
+ IN_IFADDR_RUNLOCK();
+
+ /*
+ * No match, use the first inet address on the receive interface
+ * as a dummy address for the rest of the function.
+ */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+ if (ifa->ifa_addr->sa_family == AF_INET) {
+ ia = ifatoia(ifa);
+ ifa_ref(ifa);
+ IF_ADDR_UNLOCK(ifp);
+ goto match;
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+ /*
+ * If bridging, fall back to using any inet address.
+ */
+ IN_IFADDR_RLOCK();
+ if (!bridged || (ia = TAILQ_FIRST(&V_in_ifaddrhead)) == NULL) {
+ IN_IFADDR_RUNLOCK();
+ goto drop;
+ }
+ ifa_ref(&ia->ia_ifa);
+ IN_IFADDR_RUNLOCK();
+match:
+ if (!enaddr)
+ enaddr = (u_int8_t *)IF_LLADDR(ifp);
+ myaddr = ia->ia_addr.sin_addr;
+ ifa_free(&ia->ia_ifa);
+ if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
+ goto drop; /* it's from me, ignore it. */
+ if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
+ log(LOG_ERR,
+ "arp: link address is broadcast for IP address %s!\n",
+ inet_ntoa(isaddr));
+ goto drop;
+ }
+ /*
+ * Warn if another host is using the same IP address, but only if the
+ * IP address isn't 0.0.0.0, which is used for DHCP only, in which
+ * case we suppress the warning to avoid false positive complaints of
+ * potential misconfiguration.
+ */
+ if (!bridged && isaddr.s_addr == myaddr.s_addr && myaddr.s_addr != 0) {
+ log(LOG_ERR,
+ "arp: %*D is using my IP address %s on %s!\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ itaddr = myaddr;
+ ARPSTAT_INC(dupips);
+ goto reply;
+ }
+ if (ifp->if_flags & IFF_STATICARP)
+ goto reply;
+
+ bzero(&sin, sizeof(sin));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_family = AF_INET;
+ sin.sin_addr = isaddr;
+ flags = (itaddr.s_addr == myaddr.s_addr) ? LLE_CREATE : 0;
+ flags |= LLE_EXCLUSIVE;
+ IF_AFDATA_LOCK(ifp);
+ la = lla_lookup(LLTABLE(ifp), flags, (struct sockaddr *)&sin);
+ IF_AFDATA_UNLOCK(ifp);
+ if (la != NULL) {
+ /* the following is not an error when doing bridging */
+ if (!bridged && la->lle_tbl->llt_ifp != ifp && !carp_match) {
+ if (log_arp_wrong_iface)
+ log(LOG_ERR, "arp: %s is on %s "
+ "but got reply from %*D on %s\n",
+ inet_ntoa(isaddr),
+ la->lle_tbl->llt_ifp->if_xname,
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ LLE_WUNLOCK(la);
+ goto reply;
+ }
+ if ((la->la_flags & LLE_VALID) &&
+ bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) {
+ if (la->la_flags & LLE_STATIC) {
+ LLE_WUNLOCK(la);
+ log(LOG_ERR,
+ "arp: %*D attempts to modify permanent "
+ "entry for %s on %s\n",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ inet_ntoa(isaddr), ifp->if_xname);
+ goto reply;
+ }
+ if (log_arp_movements) {
+ log(LOG_INFO, "arp: %s moved from %*D "
+ "to %*D on %s\n",
+ inet_ntoa(isaddr),
+ ifp->if_addrlen,
+ (u_char *)&la->ll_addr, ":",
+ ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
+ ifp->if_xname);
+ }
+ }
+
+ if (ifp->if_addrlen != ah->ar_hln) {
+ LLE_WUNLOCK(la);
+ log(LOG_WARNING,
+ "arp from %*D: addr len: new %d, i/f %d (ignored)",
+ ifp->if_addrlen, (u_char *) ar_sha(ah), ":",
+ ah->ar_hln, ifp->if_addrlen);
+ goto reply;
+ }
+ (void)memcpy(&la->ll_addr, ar_sha(ah), ifp->if_addrlen);
+ la->la_flags |= LLE_VALID;
+
+ if (!(la->la_flags & LLE_STATIC)) {
+ int canceled;
+
+ LLE_ADDREF(la);
+ la->la_expire = time_second + V_arpt_keep;
+ canceled = callout_reset(&la->la_timer,
+ hz * V_arpt_keep, arptimer, la);
+ if (canceled)
+ LLE_REMREF(la);
+ }
+ la->la_asked = 0;
+ la->la_preempt = V_arp_maxtries;
+ hold = la->la_hold;
+ if (hold != NULL) {
+ la->la_hold = NULL;
+ memcpy(&sa, L3_ADDR(la), sizeof(sa));
+ }
+ LLE_WUNLOCK(la);
+ if (hold != NULL)
+ (*ifp->if_output)(ifp, hold, &sa, NULL);
+ }
+reply:
+ if (op != ARPOP_REQUEST)
+ goto drop;
+ ARPSTAT_INC(rxrequests);
+
+ if (itaddr.s_addr == myaddr.s_addr) {
+ /* Shortcut.. the receiving interface is the target. */
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
+ } else {
+ struct llentry *lle = NULL;
+
+ sin.sin_addr = itaddr;
+ IF_AFDATA_LOCK(ifp);
+ lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
+ IF_AFDATA_UNLOCK(ifp);
+
+ if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln);
+ LLE_RUNLOCK(lle);
+ } else {
+
+ if (lle != NULL)
+ LLE_RUNLOCK(lle);
+
+ if (!V_arp_proxyall)
+ goto drop;
+
+ sin.sin_addr = itaddr;
+ /* XXX MRT use table 0 for arp reply */
+ rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0);
+ if (!rt)
+ goto drop;
+
+ /*
+ * Don't send proxies for nodes on the same interface
+ * as this one came out of, or we'll get into a fight
+ * over who claims what Ether address.
+ */
+ if (!rt->rt_ifp || rt->rt_ifp == ifp) {
+ RTFREE_LOCKED(rt);
+ goto drop;
+ }
+ RTFREE_LOCKED(rt);
+
+ (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
+ (void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
+
+ /*
+ * Also check that the node which sent the ARP packet
+ * is on the interface we expect it to be on. This
+ * avoids ARP chaos if an interface is connected to the
+ * wrong network.
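+ * For example (illustrative): if the route back to the claimed
+ * sender resolves to a different interface than the one this
+ * request arrived on, the request is logged and dropped below
+ * rather than proxied.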
+ */ + sin.sin_addr = isaddr; + + /* XXX MRT use table 0 for arp checks */ + rt = in_rtalloc1((struct sockaddr *)&sin, 0, 0UL, 0); + if (!rt) + goto drop; + if (rt->rt_ifp != ifp) { + log(LOG_INFO, "arp_proxy: ignoring request" + " from %s via %s, expecting %s\n", + inet_ntoa(isaddr), ifp->if_xname, + rt->rt_ifp->if_xname); + RTFREE_LOCKED(rt); + goto drop; + } + RTFREE_LOCKED(rt); + +#ifdef DEBUG_PROXY + printf("arp: proxying for %s\n", + inet_ntoa(itaddr)); +#endif + } + } + + if (itaddr.s_addr == myaddr.s_addr && + IN_LINKLOCAL(ntohl(itaddr.s_addr))) { + /* RFC 3927 link-local IPv4; always reply by broadcast. */ +#ifdef DEBUG_LINKLOCAL + printf("arp: sending reply for link-local addr %s\n", + inet_ntoa(itaddr)); +#endif + m->m_flags |= M_BCAST; + m->m_flags &= ~M_MCAST; + } else { + /* default behaviour; never reply by broadcast. */ + m->m_flags &= ~(M_BCAST|M_MCAST); + } + (void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln); + (void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln); + ah->ar_op = htons(ARPOP_REPLY); + ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */ + m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln); + m->m_pkthdr.len = m->m_len; + sa.sa_family = AF_ARP; + sa.sa_len = 2; + (*ifp->if_output)(ifp, m, &sa, NULL); + ARPSTAT_INC(txreplies); + return; + +drop: + m_freem(m); +} +#endif + +void +arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) +{ + struct llentry *lle; + + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) { + arprequest(ifp, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, IF_LLADDR(ifp)); + /* + * interface address is considered static entry + * because the output of the arp utility shows + * that L2 entry as permanent + */ + IF_AFDATA_LOCK(ifp); + lle = lla_lookup(LLTABLE(ifp), (LLE_CREATE | LLE_IFADDR | LLE_STATIC), + (struct sockaddr *)IA_SIN(ifa)); + IF_AFDATA_UNLOCK(ifp); + if (lle == NULL) + log(LOG_INFO, "arp_ifinit: cannot create arp " + "entry for interface address\n"); + else + LLE_RUNLOCK(lle); + } + ifa->ifa_rtrequest = NULL; +} + +void +arp_ifinit2(struct ifnet *ifp, struct ifaddr *ifa, u_char *enaddr) +{ + if (ntohl(IA_SIN(ifa)->sin_addr.s_addr) != INADDR_ANY) + arprequest(ifp, &IA_SIN(ifa)->sin_addr, + &IA_SIN(ifa)->sin_addr, enaddr); + ifa->ifa_rtrequest = NULL; +} + +static void +arp_init(void) +{ + + netisr_register(&arp_nh); +} +SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); diff --git a/freebsd/sys/netinet/if_ether.h b/freebsd/sys/netinet/if_ether.h new file mode 100644 index 00000000..e3c8d009 --- /dev/null +++ b/freebsd/sys/netinet/if_ether.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/igmp.c b/freebsd/sys/netinet/igmp.c new file mode 100644 index 00000000..5f8893d7 --- /dev/null +++ b/freebsd/sys/netinet/igmp.c @@ -0,0 +1,3655 @@ +#include + +/*- + * Copyright (c) 2007-2009 Bruce Simpson. + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)igmp.c 8.1 (Berkeley) 7/19/93 + */ + +/* + * Internet Group Management Protocol (IGMP) routines. + * [RFC1112, RFC2236, RFC3376] + * + * Written by Steve Deering, Stanford, May 1988. + * Modified by Rosen Sharma, Stanford, Aug 1994. + * Modified by Bill Fenner, Xerox PARC, Feb 1995. + * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. + * + * MULTICAST Revision: 3.5.1.4 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifndef KTR_IGMPV3 +#define KTR_IGMPV3 KTR_INET +#endif + +static struct igmp_ifinfo * + igi_alloc_locked(struct ifnet *); +static void igi_delete_locked(const struct ifnet *); +static void igmp_dispatch_queue(struct ifqueue *, int, const int); +static void igmp_fasttimo_vnet(void); +static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *); +static int igmp_handle_state_change(struct in_multi *, + struct igmp_ifinfo *); +static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); +static int igmp_input_v1_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v2_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v3_query(struct ifnet *, const struct ip *, + /*const*/ struct igmpv3 *); +static int igmp_input_v3_group_query(struct in_multi *, + struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *); +static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static void igmp_intr(struct mbuf *); +static int igmp_isgroupreported(const struct in_addr); +static struct mbuf * + igmp_ra_alloc(void); +#ifdef KTR +static char * igmp_rec_type_to_str(const int); +#endif +static void igmp_set_version(struct igmp_ifinfo *, const int); +static void igmp_slowtimo_vnet(void); +static int igmp_v1v2_queue_report(struct in_multi *, const int); +static void igmp_v1v2_process_group_timer(struct in_multi *, const int); +static void 
igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); +static void igmp_v2_update_group(struct in_multi *, const int); +static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); +static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); +static struct mbuf * + igmp_v3_encap_report(struct ifnet *, struct mbuf *); +static int igmp_v3_enqueue_group_record(struct ifqueue *, + struct in_multi *, const int, const int, const int); +static int igmp_v3_enqueue_filter_change(struct ifqueue *, + struct in_multi *); +static void igmp_v3_process_group_timers(struct igmp_ifinfo *, + struct ifqueue *, struct ifqueue *, struct in_multi *, + const int); +static int igmp_v3_merge_state_changes(struct in_multi *, + struct ifqueue *); +static void igmp_v3_suppress_group_record(struct in_multi *); +static int sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS); +static int sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS); +static int sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS); + +static const struct netisr_handler igmp_nh = { + .nh_name = "igmp", + .nh_handler = igmp_intr, + .nh_proto = NETISR_IGMP, + .nh_policy = NETISR_POLICY_SOURCE, +}; + +/* + * System-wide globals. + * + * Unlocked access to these is OK, except for the global IGMP output + * queue. The IGMP subsystem lock ends up being system-wide for the moment, + * because all VIMAGEs have to share a global output queue, as netisrs + * themselves are not virtualized. + * + * Locking: + * * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * Any may be taken independently; if any are held at the same + * time, the above lock order must be followed. + * * All output is delegated to the netisr. + * Now that Giant has been eliminated, the netisr may be inlined. + * * IN_MULTI_LOCK covers in_multi. + * * IGMP_LOCK covers igmp_ifinfo and any global variables in this file, + * including the output queue. + * * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of + * per-link state iterators. + * * igmp_ifinfo is valid as long as PF_INET is attached to the interface, + * therefore it is not refcounted. + * We allow unlocked reads of igmp_ifinfo when accessed via in_multi. + * + * Reference counting + * * IGMP acquires its own reference every time an in_multi is passed to + * it and the group is being joined for the first time. + * * IGMP releases its reference(s) on in_multi in a deferred way, + * because the operations which process the release run as part of + * a loop whose control variables are directly affected by the release + * (that, and not recursing on the IF_ADDR_LOCK). + * + * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds + * to a vnet in ifp->if_vnet. + * + * SMPng: XXX We may potentially race operations on ifma_protospec. + * The problem is that we currently lack a clean way of taking the + * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing, + * as anything which modifies ifma needs to be covered by that lock. + * So check for ifma_protospec being NULL before proceeding. + */ +struct mtx igmp_mtx; + +struct mbuf *m_raopt; /* Router Alert option */ +MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); + +/* + * VIMAGE-wide globals. + * + * The IGMPv3 timers themselves need to run per-image, however, + * protosw timers run globally (see tcp). + * An ifnet can only be in one vimage at a time, and the loopback + * ifnet, loif, is itself virtualized. 
+ * It would otherwise be possible to seriously hose IGMP state, + * and create inconsistencies in upstream multicast routing, if you have + * multiple VIMAGEs running on the same link joining different multicast + * groups, UNLESS the "primary IP address" is different. This is because + * IGMP for IPv4 does not force link-local addresses to be used for each + * node, unlike MLD for IPv6. + * Obviously the IGMPv3 per-interface state has per-vimage granularity + * also as a result. + * + * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection + * policy to control the address used by IGMP on the link. + */ +static VNET_DEFINE(int, interface_timers_running); /* IGMPv3 general + * query response */ +static VNET_DEFINE(int, state_change_timers_running); /* IGMPv3 state-change + * retransmit */ +static VNET_DEFINE(int, current_state_timers_running); /* IGMPv1/v2 host + * report; IGMPv3 g/sg + * query response */ + +#define V_interface_timers_running VNET(interface_timers_running) +#define V_state_change_timers_running VNET(state_change_timers_running) +#define V_current_state_timers_running VNET(current_state_timers_running) + +static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head); +static VNET_DEFINE(struct igmpstat, igmpstat) = { + .igps_version = IGPS_VERSION_3, + .igps_len = sizeof(struct igmpstat), +}; +static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0}; + +#define V_igi_head VNET(igi_head) +#define V_igmpstat VNET(igmpstat) +#define V_igmp_gsrdelay VNET(igmp_gsrdelay) + +static VNET_DEFINE(int, igmp_recvifkludge) = 1; +static VNET_DEFINE(int, igmp_sendra) = 1; +static VNET_DEFINE(int, igmp_sendlocal) = 1; +static VNET_DEFINE(int, igmp_v1enable) = 1; +static VNET_DEFINE(int, igmp_v2enable) = 1; +static VNET_DEFINE(int, igmp_legacysupp); +static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3; + +#define V_igmp_recvifkludge VNET(igmp_recvifkludge) +#define V_igmp_sendra VNET(igmp_sendra) +#define V_igmp_sendlocal VNET(igmp_sendlocal) +#define V_igmp_v1enable VNET(igmp_v1enable) +#define V_igmp_v2enable VNET(igmp_v2enable) +#define V_igmp_legacysupp VNET(igmp_legacysupp) +#define V_igmp_default_version VNET(igmp_default_version) + +/* + * Virtualized sysctls. 
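+ *
+ * Each knob below is per-vnet state declared with VNET_DEFINE() above
+ * and exported under net.inet.igmp.  Illustrative userland usage
+ * (sysctl names match the declarations that follow):
+ *
+ *	sysctl net.inet.igmp.v1enable=0    (disable IGMPv1 compatibility)
+ *	sysctl net.inet.igmp.sendra=1      (send the IP Router Alert option)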
+ */ +SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW, + &VNET_NAME(igmpstat), igmpstat, ""); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW, + &VNET_NAME(igmp_recvifkludge), 0, + "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW, + &VNET_NAME(igmp_sendra), 0, + "Send IP Router Alert option in IGMPv2/v3 messages"); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW, + &VNET_NAME(igmp_sendlocal), 0, + "Send IGMP membership reports for 224.0.0.0/24 groups"); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW, + &VNET_NAME(igmp_v1enable), 0, + "Enable backwards compatibility with IGMPv1"); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW, + &VNET_NAME(igmp_v2enable), 0, + "Enable backwards compatibility with IGMPv2"); +SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW, + &VNET_NAME(igmp_legacysupp), 0, + "Allow v1/v2 reports to suppress v3 group responses"); +SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I", + "Default version of IGMP to run on each interface"); +SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I", + "Rate limit for IGMPv3 Group-and-Source queries in seconds"); + +/* + * Non-virtualized sysctls. + */ +SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE, + sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); + +static __inline void +igmp_save_context(struct mbuf *m, struct ifnet *ifp) +{ + +#ifdef VIMAGE + m->m_pkthdr.header = ifp->if_vnet; +#endif /* VIMAGE */ + m->m_pkthdr.flowid = ifp->if_index; +} + +static __inline void +igmp_scrub_context(struct mbuf *m) +{ + + m->m_pkthdr.header = NULL; + m->m_pkthdr.flowid = 0; +} + +#ifdef KTR +static __inline char * +inet_ntoa_haddr(in_addr_t haddr) +{ + struct in_addr ia; + + ia.s_addr = htonl(haddr); + return (inet_ntoa(ia)); +} +#endif + +/* + * Restore context from a queued IGMP output chain. + * Return saved ifindex. + * + * VIMAGE: The assertion is there to make sure that we + * actually called CURVNET_SET() with what's in the mbuf chain. + */ +static __inline uint32_t +igmp_restore_context(struct mbuf *m) +{ + +#ifdef notyet +#if defined(VIMAGE) && defined(INVARIANTS) + KASSERT(curvnet == (m->m_pkthdr.header), + ("%s: called when curvnet was not restored", __func__)); +#endif +#endif + return (m->m_pkthdr.flowid); +} + +/* + * Retrieve or set default IGMP version. + * + * VIMAGE: Assume curvnet set by caller. + * SMPng: NOTE: Serialized by IGMP lock. + */ +static int +sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS) +{ + int error; + int new; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error) + return (error); + + IGMP_LOCK(); + + new = V_igmp_default_version; + + error = sysctl_handle_int(oidp, &new, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { + error = EINVAL; + goto out_locked; + } + + CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d", + V_igmp_default_version, new); + + V_igmp_default_version = new; + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Retrieve or set threshold between group-source queries in seconds. + * + * VIMAGE: Assume curvnet set by caller. 
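+ * Example (illustrative): sysctl net.inet.igmp.gsrdelay=10 requires at
+ * least ten seconds between accepted group-and-source queries for any
+ * one group; see the ratecheck() call in igmp_input_v3_query().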
+ * SMPng: NOTE: Serialized by IGMP lock. + */ +static int +sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS) +{ + int error; + int i; + + error = sysctl_wire_old_buffer(req, sizeof(int)); + if (error) + return (error); + + IGMP_LOCK(); + + i = V_igmp_gsrdelay.tv_sec; + + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (i < -1 || i >= 60) { + error = EINVAL; + goto out_locked; + } + + CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d", + V_igmp_gsrdelay.tv_sec, i); + V_igmp_gsrdelay.tv_sec = i; + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Expose struct igmp_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). + * + * SMPng: NOTE: Does an unlocked ifindex space read. + * VIMAGE: Assume curvnet set by caller. The node handler itself + * is not directly virtualized. + */ +static int +sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS) +{ + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct igmp_ifinfo *igi; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo)); + if (error) + return (error); + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + if (name[0] <= 0 || name[0] > V_if_index) { + error = ENOENT; + goto out_locked; + } + + error = ENOENT; + + ifp = ifnet_byindex(name[0]); + if (ifp == NULL) + goto out_locked; + + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (ifp == igi->igi_ifp) { + error = SYSCTL_OUT(req, igi, + sizeof(struct igmp_ifinfo)); + break; + } + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + return (error); +} + +/* + * Dispatch an entire queue of pending packet chains + * using the netisr. + * VIMAGE: Assumes the vnet pointer has been set. + */ +static void +igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop) +{ + struct mbuf *m; + + for (;;) { + _IF_DEQUEUE(ifq, m); + if (m == NULL) + break; + CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, ifq, m); + if (loop) + m->m_flags |= M_IGMP_LOOP; + netisr_dispatch(NETISR_IGMP, m); + if (--limit == 0) + break; + } +} + +/* + * Filter outgoing IGMP report state by group. + * + * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). + * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are + * disabled for all groups in the 224.0.0.0/24 link-local scope. However, + * this may break certain IGMP snooping switches which rely on the old + * report behaviour. + * + * Return zero if the given group is one for which IGMP reports + * should be suppressed, or non-zero if reports should be issued. + */ +static __inline int +igmp_isgroupreported(const struct in_addr addr) +{ + + if (in_allhosts(addr) || + ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) + return (0); + + return (1); +} + +/* + * Construct a Router Alert option to use in outgoing packets. + */ +static struct mbuf * +igmp_ra_alloc(void) +{ + struct mbuf *m; + struct ipoption *p; + + MGET(m, M_DONTWAIT, MT_DATA); + p = mtod(m, struct ipoption *); + p->ipopt_dst.s_addr = INADDR_ANY; + p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + p->ipopt_list[1] = 0x04; /* 4 bytes long */ + p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ + p->ipopt_list[3] = 0x00; /* pad byte */ + m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; + + return (m); +} + +/* + * Attach IGMP when PF_INET is attached to an interface. 
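+ *
+ * The returned igmp_ifinfo is expected to be stashed in the interface's
+ * AF_INET ifdata by the caller; the detach hooks below retrieve it
+ * again via ifp->if_afdata[AF_INET].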
+ */ +struct igmp_ifinfo * +igmp_domifattach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + IGMP_LOCK(); + + igi = igi_alloc_locked(ifp); + if (!(ifp->if_flags & IFF_MULTICAST)) + igi->igi_flags |= IGIF_SILENT; + + IGMP_UNLOCK(); + + return (igi); +} + +/* + * VIMAGE: assume curvnet set by caller. + */ +static struct igmp_ifinfo * +igi_alloc_locked(/*const*/ struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + IGMP_LOCK_ASSERT(); + + igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO); + if (igi == NULL) + goto out; + + igi->igi_ifp = ifp; + igi->igi_version = V_igmp_default_version; + igi->igi_flags = 0; + igi->igi_rv = IGMP_RV_INIT; + igi->igi_qi = IGMP_QI_INIT; + igi->igi_qri = IGMP_QRI_INIT; + igi->igi_uri = IGMP_URI_INIT; + + SLIST_INIT(&igi->igi_relinmhead); + + /* + * Responses to general queries are subject to bounds. + */ + IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS); + + LIST_INSERT_HEAD(&V_igi_head, igi, igi_link); + + CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)", + ifp, ifp->if_xname); + +out: + return (igi); +} + +/* + * Hook for ifdetach. + * + * NOTE: Some finalization tasks need to run before the protocol domain + * is detached, but also before the link layer does its cleanup. + * + * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK(). + * XXX This is also bitten by unlocked ifma_protospec access. + */ +void +igmp_ifdetach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + struct ifmultiaddr *ifma; + struct in_multi *inm, *tinm; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp, + ifp->if_xname); + + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + if (igi->igi_version == IGMP_VERSION_3) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; +#if 0 + KASSERT(ifma->ifma_protospec != NULL, + ("%s: ifma_protospec is NULL", __func__)); +#endif + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_state == IGMP_LEAVING_MEMBER) { + SLIST_INSERT_HEAD(&igi->igi_relinmhead, + inm, inm_nrele); + } + inm_clear_recorded(inm); + } + IF_ADDR_UNLOCK(ifp); + /* + * Free the in_multi reference(s) for this IGMP lifecycle. + */ + SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele, + tinm) { + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); + inm_release_locked(inm); + } + } + + IGMP_UNLOCK(); +} + +/* + * Hook for domifdetach. + */ +void +igmp_domifdetach(struct ifnet *ifp) +{ + struct igmp_ifinfo *igi; + + CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + igi_delete_locked(ifp); + + IGMP_UNLOCK(); +} + +static void +igi_delete_locked(const struct ifnet *ifp) +{ + struct igmp_ifinfo *igi, *tigi; + + CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)", + __func__, ifp, ifp->if_xname); + + IGMP_LOCK_ASSERT(); + + LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) { + if (igi->igi_ifp == ifp) { + /* + * Free deferred General Query responses. 
+ */
+ _IF_DRAIN(&igi->igi_gq);
+
+ LIST_REMOVE(igi, igi_link);
+
+ KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
+ ("%s: there are dangling in_multi references",
+ __func__));
+
+ free(igi, M_IGMP);
+ return;
+ }
+ }
+
+#ifdef INVARIANTS
+ panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp);
+#endif
+}
+
+/*
+ * Process a received IGMPv1 query.
+ * Return non-zero if the message should be dropped.
+ *
+ * VIMAGE: The curvnet pointer is derived from the input ifp.
+ */
+static int
+igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
+ const struct igmp *igmp)
+{
+ struct ifmultiaddr *ifma;
+ struct igmp_ifinfo *igi;
+ struct in_multi *inm;
+
+ /*
+ * IGMPv1 Host Membership Queries SHOULD always be addressed to
+ * 224.0.0.1. They are always treated as General Queries.
+ * igmp_group is always ignored. Do not drop it as a userland
+ * daemon may wish to see it.
+ * XXX SMPng: unlocked increments in igmpstat assumed atomic.
+ */
+ if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
+ IGMPSTAT_INC(igps_rcv_badqueries);
+ return (0);
+ }
+ IGMPSTAT_INC(igps_rcv_gen_queries);
+
+ IN_MULTI_LOCK();
+ IGMP_LOCK();
+
+ igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
+ KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
+
+ if (igi->igi_flags & IGIF_LOOPBACK) {
+ CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
+ ifp, ifp->if_xname);
+ goto out_locked;
+ }
+
+ /*
+ * Switch to IGMPv1 host compatibility mode.
+ */
+ igmp_set_version(igi, IGMP_VERSION_1);
+
+ CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
+
+ /*
+ * Start the timers in all of our group records
+ * for the interface on which the query arrived,
+ * except those which are already running.
+ */
+ IF_ADDR_LOCK(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_INET ||
+ ifma->ifma_protospec == NULL)
+ continue;
+ inm = (struct in_multi *)ifma->ifma_protospec;
+ if (inm->inm_timer != 0)
+ continue;
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ break;
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ inm->inm_state = IGMP_REPORTING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(
+ IGMP_V1V2_MAX_RI * PR_FASTHZ);
+ V_current_state_timers_running = 1;
+ break;
+ case IGMP_LEAVING_MEMBER:
+ break;
+ }
+ }
+ IF_ADDR_UNLOCK(ifp);
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Process a received IGMPv2 general or group-specific query.
+ */
+static int
+igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
+ const struct igmp *igmp)
+{
+ struct ifmultiaddr *ifma;
+ struct igmp_ifinfo *igi;
+ struct in_multi *inm;
+ int is_general_query;
+ uint16_t timer;
+
+ is_general_query = 0;
+
+ /*
+ * Validate address fields upfront.
+ * XXX SMPng: unlocked increments in igmpstat assumed atomic.
+ */
+ if (in_nullhost(igmp->igmp_group)) {
+ /*
+ * IGMPv2 General Query.
+ * If this was not sent to the all-hosts group, ignore it.
+ */
+ if (!in_allhosts(ip->ip_dst))
+ return (0);
+ IGMPSTAT_INC(igps_rcv_gen_queries);
+ is_general_query = 1;
+ } else {
+ /* IGMPv2 Group-Specific Query.
*/ + IGMPSTAT_INC(igps_rcv_group_queries); + } + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + if (igi->igi_flags & IGIF_LOOPBACK) { + CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)", + ifp, ifp->if_xname); + goto out_locked; + } + + /* + * Ignore v2 query if in v1 Compatibility Mode. + */ + if (igi->igi_version == IGMP_VERSION_1) + goto out_locked; + + igmp_set_version(igi, IGMP_VERSION_2); + + timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (is_general_query) { + /* + * For each reporting group joined on this + * interface, kick the report timer. + */ + CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)", + ifp, ifp->if_xname); + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + igmp_v2_update_group(inm, timer); + } + IF_ADDR_UNLOCK(ifp); + } else { + /* + * Group-specific IGMPv2 query, we need only + * look up the single group to process it. + */ + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + igmp_v2_update_group(inm, timer); + } + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Update the report timer on a group in response to an IGMPv2 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to IGMPv3. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. + * + * Unlike IGMPv3, the delay per group should be jittered + * to avoid bursts of IGMPv2 reports. + */ +static void +igmp_v2_update_group(struct in_multi *inm, const int timer) +{ + + CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer); + + IN_MULTI_LOCK_ASSERT(); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (inm->inm_timer != 0 && + inm->inm_timer <= timer) { + CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, " + "skipping.", __func__); + break; + } + /* FALLTHROUGH */ + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__); + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + break; + case IGMP_SLEEPING_MEMBER: + CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__); + inm->inm_state = IGMP_AWAKENING_MEMBER; + break; + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Process a received IGMPv3 general, group-specific or + * group-and-source-specific query. + * Assumes m has already been pulled up to the full IGMP message length. + * Return 0 if successful, otherwise an appropriate error code is returned. 
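+ *
+ * Worked timer example (assuming the stock constants
+ * IGMP_TIMER_SCALE == 10 and PR_FASTHZ == 5): a Max Resp Code of 100
+ * denotes 10.0 seconds, which is converted below into
+ * 100 * PR_FASTHZ / IGMP_TIMER_SCALE == 50 fast-timeout ticks.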
+ */ +static int +igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, + /*const*/ struct igmpv3 *igmpv3) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + int is_general_query; + uint32_t maxresp, nsrc, qqi; + uint16_t timer; + uint8_t qrv; + + is_general_query = 0; + + CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname); + + maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ + if (maxresp >= 128) { + maxresp = IGMP_MANT(igmpv3->igmp_code) << + (IGMP_EXP(igmpv3->igmp_code) + 3); + } + + /* + * Robustness must never be less than 2 for on-wire IGMPv3. + * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make + * an exception for interfaces whose IGMPv3 state changes + * are redirected to loopback (e.g. MANET). + */ + qrv = IGMP_QRV(igmpv3->igmp_misc); + if (qrv < 2) { + CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__, + qrv, IGMP_RV_INIT); + qrv = IGMP_RV_INIT; + } + + qqi = igmpv3->igmp_qqi; + if (qqi >= 128) { + qqi = IGMP_MANT(igmpv3->igmp_qqi) << + (IGMP_EXP(igmpv3->igmp_qqi) + 3); + } + + timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Validate address fields and versions upfront before + * accepting v3 query. + * XXX SMPng: Unlocked access to igmpstat counters here. + */ + if (in_nullhost(igmpv3->igmp_group)) { + /* + * IGMPv3 General Query. + * + * General Queries SHOULD be directed to 224.0.0.1. + * A general query with a source list has undefined + * behaviour; discard it. + */ + IGMPSTAT_INC(igps_rcv_gen_queries); + if (!in_allhosts(ip->ip_dst) || nsrc > 0) { + IGMPSTAT_INC(igps_rcv_badqueries); + return (0); + } + is_general_query = 1; + } else { + /* Group or group-source specific query. */ + if (nsrc == 0) + IGMPSTAT_INC(igps_rcv_group_queries); + else + IGMPSTAT_INC(igps_rcv_gsr_queries); + } + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + if (igi->igi_flags & IGIF_LOOPBACK) { + CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)", + ifp, ifp->if_xname); + goto out_locked; + } + + /* + * Discard the v3 query if we're in Compatibility Mode. + * The RFC is not obviously worded that hosts need to stay in + * compatibility mode until the Old Version Querier Present + * timer expires. + */ + if (igi->igi_version != IGMP_VERSION_3) { + CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)", + igi->igi_version, ifp, ifp->if_xname); + goto out_locked; + } + + igmp_set_version(igi, IGMP_VERSION_3); + igi->igi_rv = qrv; + igi->igi_qi = qqi; + igi->igi_qri = maxresp; + + CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi, + maxresp); + + if (is_general_query) { + /* + * Schedule a current-state report on this ifp for + * all groups, possibly containing source lists. + * If there is a pending General Query response + * scheduled earlier than the selected delay, do + * not schedule any other reports. + * Otherwise, reset the interface timer. + */ + CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)", + ifp, ifp->if_xname); + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); + V_interface_timers_running = 1; + } + } else { + /* + * Group-source-specific queries are throttled on + * a per-group basis to defeat denial-of-service attempts. + * Queries for groups we are not a member of on this + * link are simply ignored. 
+ */
+ inm = inm_lookup(ifp, igmpv3->igmp_group);
+ if (inm == NULL)
+ goto out_locked;
+ if (nsrc > 0) {
+ if (!ratecheck(&inm->inm_lastgsrtv,
+ &V_igmp_gsrdelay)) {
+ CTR1(KTR_IGMPV3, "%s: GS query throttled.",
+ __func__);
+ IGMPSTAT_INC(igps_drop_gsr_queries);
+ goto out_locked;
+ }
+ }
+ CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
+ inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
+ /*
+ * If there is a pending General Query response
+ * scheduled sooner than the selected delay, no
+ * further report need be scheduled.
+ * Otherwise, prepare to respond to the
+ * group-specific or group-and-source query.
+ */
+ if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
+ igmp_input_v3_group_query(inm, igi, timer, igmpv3);
+ }
+
+out_locked:
+ IGMP_UNLOCK();
+ IN_MULTI_UNLOCK();
+
+ return (0);
+}
+
+/*
+ * Process a received IGMPv3 group-specific or group-and-source-specific
+ * query.
+ * Return <0 if any error occurred. Currently this is ignored.
+ */
+static int
+igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
+ int timer, /*const*/ struct igmpv3 *igmpv3)
+{
+ int retval;
+ uint16_t nsrc;
+
+ IN_MULTI_LOCK_ASSERT();
+ IGMP_LOCK_ASSERT();
+
+ retval = 0;
+
+ switch (inm->inm_state) {
+ case IGMP_NOT_MEMBER:
+ case IGMP_SILENT_MEMBER:
+ case IGMP_SLEEPING_MEMBER:
+ case IGMP_LAZY_MEMBER:
+ case IGMP_AWAKENING_MEMBER:
+ case IGMP_IDLE_MEMBER:
+ case IGMP_LEAVING_MEMBER:
+ return (retval);
+ break;
+ case IGMP_REPORTING_MEMBER:
+ case IGMP_G_QUERY_PENDING_MEMBER:
+ case IGMP_SG_QUERY_PENDING_MEMBER:
+ break;
+ }
+
+ nsrc = ntohs(igmpv3->igmp_numsrc);
+
+ /*
+ * Deal with group-specific queries upfront.
+ * If any group query is already pending, purge any recorded
+ * source-list state if it exists, and schedule a query response
+ * for this group-specific query.
+ */
+ if (nsrc == 0) {
+ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
+ inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
+ inm_clear_recorded(inm);
+ timer = min(inm->inm_timer, timer);
+ }
+ inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ return (retval);
+ }
+
+ /*
+ * Deal with the case where a group-and-source-specific query has
+ * been received but a group-specific query is already pending.
+ */
+ if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
+ timer = min(inm->inm_timer, timer);
+ inm->inm_timer = IGMP_RANDOM_DELAY(timer);
+ V_current_state_timers_running = 1;
+ return (retval);
+ }
+
+ /*
+ * Finally, deal with the case where a group-and-source-specific
+ * query has been received, where a response to a previous g-s-r
+ * query exists, or none exists.
+ * In this case, we need to parse the source-list which the Querier
+ * has provided us with and check if we have any source list filter
+ * entries at T1 for these sources. If we do not, there is no need
+ * to schedule a report and the query may be dropped.
+ * If we do, we must record them and schedule a current-state
+ * report for those sources.
+ * FIXME: Handling source lists larger than 1 mbuf requires that
+ * we pass the mbuf chain pointer down to this function, and use
+ * m_getptr() to walk the chain.
+ */ + if (inm->inm_nsrc > 0) { + const struct in_addr *ap; + int i, nrecorded; + + ap = (const struct in_addr *)(igmpv3 + 1); + nrecorded = 0; + for (i = 0; i < nsrc; i++, ap++) { + retval = inm_record_source(inm, ap->s_addr); + if (retval < 0) + break; + nrecorded += retval; + } + if (nrecorded > 0) { + CTR1(KTR_IGMPV3, + "%s: schedule response to SG query", __func__); + inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + V_current_state_timers_running = 1; + } + } + + return (retval); +} + +/* + * Process a received IGMPv1 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + IGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + IGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + ip->ip_src.s_addr = htonl(ia->ia_subnet); + ifa_free(&ia->ia_ifa); + } + } + + CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + + /* + * IGMPv1 report suppression. + * If we are a member of this group, and our membership should be + * reported, stop our group timer and transition to the 'lazy' state. + */ + IN_MULTI_LOCK(); + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + igi = inm->inm_igi; + if (igi == NULL) { + KASSERT(igi != NULL, + ("%s: no igi for ifp %p", __func__, ifp)); + goto out_locked; + } + + IGMPSTAT_INC(igps_rcv_ourreports); + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. + */ + if (igi->igi_version == IGMP_VERSION_3) { + if (V_igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + goto out_locked; + } + + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + case IGMP_SLEEPING_MEMBER: + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_REPORTING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + if (igi->igi_version == IGMP_VERSION_1) + inm->inm_state = IGMP_LAZY_MEMBER; + else if (igi->igi_version == IGMP_VERSION_2) + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + +out_locked: + IN_MULTI_UNLOCK(); + + return (0); +} + +/* + * Process a received IGMPv2 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. 
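+ *
+ * For example (illustrative): a just-booted DHCP client may send its
+ * report from source 0.0.0.0; when net.inet.igmp.recvifkludge is set,
+ * the source is rewritten below to the receiving subnet's address.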
+ */ +static int +igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + /* + * Make sure we don't hear our own membership report. Fast + * leave requires knowing that we are the only member of a + * group. + */ + IFP_TO_IA(ifp, ia); + if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { + ifa_free(&ia->ia_ifa); + return (0); + } + + IGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) { + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return (0); + } + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + if (ia != NULL) + ifa_free(&ia->ia_ifa); + IGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. + * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) { + if (ia != NULL) + ip->ip_src.s_addr = htonl(ia->ia_subnet); + } + if (ia != NULL) + ifa_free(&ia->ia_ifa); + + CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + + /* + * IGMPv2 report suppression. + * If we are a member of this group, and our membership should be + * reported, and our group timer is pending or about to be reset, + * stop our group timer by transitioning to the 'lazy' state. + */ + IN_MULTI_LOCK(); + inm = inm_lookup(ifp, igmp->igmp_group); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + igi = inm->inm_igi; + KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp)); + + IGMPSTAT_INC(igps_rcv_ourreports); + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. + */ + if (igi->igi_version == IGMP_VERSION_3) { + if (V_igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + goto out_locked; + } + + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_AWAKENING_MEMBER: + CTR3(KTR_IGMPV3, + "report suppressed for %s on ifp %p(%s)", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname); + case IGMP_LAZY_MEMBER: + inm->inm_state = IGMP_LAZY_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + +out_locked: + IN_MULTI_UNLOCK(); + + return (0); +} + +void +igmp_input(struct mbuf *m, int off) +{ + int iphlen; + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + int igmplen; + int minlen; + int queryver; + + CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off); + + ifp = m->m_pkthdr.rcvif; + + IGMPSTAT_INC(igps_rcv_total); + + ip = mtod(m, struct ip *); + iphlen = off; + igmplen = ip->ip_len; + + /* + * Validate lengths. + */ + if (igmplen < IGMP_MINLEN) { + IGMPSTAT_INC(igps_rcv_tooshort); + m_freem(m); + return; + } + + /* + * Always pullup to the minimum size for v1/v2 or v3 + * to amortize calls to m_pullup(). 
+ */ + minlen = iphlen; + if (igmplen >= IGMP_V3_QUERY_MINLEN) + minlen += IGMP_V3_QUERY_MINLEN; + else + minlen += IGMP_MINLEN; + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + IGMPSTAT_INC(igps_rcv_tooshort); + return; + } + ip = mtod(m, struct ip *); + + /* + * Validate checksum. + */ + m->m_data += iphlen; + m->m_len -= iphlen; + igmp = mtod(m, struct igmp *); + if (in_cksum(m, igmplen)) { + IGMPSTAT_INC(igps_rcv_badsum); + m_freem(m); + return; + } + m->m_data -= iphlen; + m->m_len += iphlen; + + /* + * IGMP control traffic is link-scope, and must have a TTL of 1. + * DVMRP traffic (e.g. mrinfo, mtrace) is an exception; + * probe packets may come from beyond the LAN. + */ + if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) { + IGMPSTAT_INC(igps_rcv_badttl); + m_freem(m); + return; + } + + switch (igmp->igmp_type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + if (igmplen == IGMP_MINLEN) { + if (igmp->igmp_code == 0) + queryver = IGMP_VERSION_1; + else + queryver = IGMP_VERSION_2; + } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { + queryver = IGMP_VERSION_3; + } else { + IGMPSTAT_INC(igps_rcv_tooshort); + m_freem(m); + return; + } + + switch (queryver) { + case IGMP_VERSION_1: + IGMPSTAT_INC(igps_rcv_v1v2_queries); + if (!V_igmp_v1enable) + break; + if (igmp_input_v1_query(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_VERSION_2: + IGMPSTAT_INC(igps_rcv_v1v2_queries); + if (!V_igmp_v2enable) + break; + if (igmp_input_v2_query(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_VERSION_3: { + struct igmpv3 *igmpv3; + uint16_t igmpv3len; + uint16_t srclen; + int nsrc; + + IGMPSTAT_INC(igps_rcv_v3_queries); + igmpv3 = (struct igmpv3 *)igmp; + /* + * Validate length based on source count. + */ + nsrc = ntohs(igmpv3->igmp_numsrc); + srclen = sizeof(struct in_addr) * nsrc; + if (nsrc * sizeof(in_addr_t) > srclen) { + IGMPSTAT_INC(igps_rcv_tooshort); + return; + } + /* + * m_pullup() may modify m, so pullup in + * this scope. + */ + igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN + + srclen; + if ((m->m_flags & M_EXT || + m->m_len < igmpv3len) && + (m = m_pullup(m, igmpv3len)) == NULL) { + IGMPSTAT_INC(igps_rcv_tooshort); + return; + } + igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *) + + iphlen); + if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) { + m_freem(m); + return; + } + } + break; + } + break; + + case IGMP_v1_HOST_MEMBERSHIP_REPORT: + if (!V_igmp_v1enable) + break; + if (igmp_input_v1_report(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_v2_HOST_MEMBERSHIP_REPORT: + if (!V_igmp_v2enable) + break; + if (!ip_checkrouteralert(m)) + IGMPSTAT_INC(igps_rcv_nora); + if (igmp_input_v2_report(ifp, ip, igmp) != 0) { + m_freem(m); + return; + } + break; + + case IGMP_v3_HOST_MEMBERSHIP_REPORT: + /* + * Hosts do not need to process IGMPv3 membership reports, + * as report suppression is no longer required. + */ + if (!ip_checkrouteralert(m)) + IGMPSTAT_INC(igps_rcv_nora); + break; + + default: + break; + } + + /* + * Pass all valid IGMP packets up to any process(es) listening on a + * raw IGMP socket. + */ + rip_input(m, off); +} + + +/* + * Fast timeout handler (global). + * VIMAGE: Timeout handlers are expected to service all vimages. 
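+ *
+ * Driven from the protocol fast timeout, nominally PR_FASTHZ
+ * (stock value 5) times per second; the "fast" timer fields in
+ * this file therefore count ticks of that clock.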
+ */ +void +igmp_fasttimo(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + igmp_fasttimo_vnet(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Fast timeout handler (per-vnet). + * Sends are shuffled off to a netisr to deal with Giant. + * + * VIMAGE: Assume caller has set up our curvnet. + */ +static void +igmp_fasttimo_vnet(void) +{ + struct ifqueue scq; /* State-change packets */ + struct ifqueue qrq; /* Query response packets */ + struct ifnet *ifp; + struct igmp_ifinfo *igi; + struct ifmultiaddr *ifma, *tifma; + struct in_multi *inm; + int loop, uri_fasthz; + + loop = 0; + uri_fasthz = 0; + + /* + * Quick check to see if any work needs to be done, in order to + * minimize the overhead of fasttimo processing. + * SMPng: XXX Unlocked reads. + */ + if (!V_current_state_timers_running && + !V_interface_timers_running && + !V_state_change_timers_running) + return; + + IN_MULTI_LOCK(); + IGMP_LOCK(); + + /* + * IGMPv3 General Query response timer processing. + */ + if (V_interface_timers_running) { + CTR1(KTR_IGMPV3, "%s: interface timers running", __func__); + + V_interface_timers_running = 0; + LIST_FOREACH(igi, &V_igi_head, igi_link) { + if (igi->igi_v3_timer == 0) { + /* Do nothing. */ + } else if (--igi->igi_v3_timer == 0) { + igmp_v3_dispatch_general_query(igi); + } else { + V_interface_timers_running = 1; + } + } + } + + if (!V_current_state_timers_running && + !V_state_change_timers_running) + goto out_locked; + + V_current_state_timers_running = 0; + V_state_change_timers_running = 0; + + CTR1(KTR_IGMPV3, "%s: state change timers running", __func__); + + /* + * IGMPv1/v2/v3 host report and state-change timer processing. + * Note: Processing a v3 group timer may remove a node. + */ + LIST_FOREACH(igi, &V_igi_head, igi_link) { + ifp = igi->igi_ifp; + + if (igi->igi_version == IGMP_VERSION_3) { + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * + PR_FASTHZ); + + memset(&qrq, 0, sizeof(struct ifqueue)); + IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS); + + memset(&scq, 0, sizeof(struct ifqueue)); + IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS); + } + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, + tifma) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + igmp_v1v2_process_group_timer(inm, + igi->igi_version); + break; + case IGMP_VERSION_3: + igmp_v3_process_group_timers(igi, &qrq, + &scq, inm, uri_fasthz); + break; + } + } + IF_ADDR_UNLOCK(ifp); + + if (igi->igi_version == IGMP_VERSION_3) { + struct in_multi *tinm; + + igmp_dispatch_queue(&qrq, 0, loop); + igmp_dispatch_queue(&scq, 0, loop); + + /* + * Free the in_multi reference(s) for this + * IGMP lifecycle. + */ + SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, + inm_nrele, tinm) { + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, + inm_nrele); + inm_release_locked(inm); + } + } + } + +out_locked: + IGMP_UNLOCK(); + IN_MULTI_UNLOCK(); +} + +/* + * Update host report group timer for IGMPv1/v2. + * Will update the global pending timer flags. 
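+ *
+ * Sketch of the effect: the per-group timer is decremented once per
+ * fast tick; when it hits zero in REPORTING state, the group moves to
+ * IDLE and a v1 or v2 host membership report is queued.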
+ */ +static void +igmp_v1v2_process_group_timer(struct in_multi *inm, const int version) +{ + int report_timer_expired; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + if (inm->inm_timer == 0) { + report_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + report_timer_expired = 1; + } else { + V_current_state_timers_running = 1; + return; + } + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (report_timer_expired) { + inm->inm_state = IGMP_IDLE_MEMBER; + (void)igmp_v1v2_queue_report(inm, + (version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + } + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Update a group's timers for IGMPv3. + * Will update the global pending timer flags. + * Note: Unlocked read from igi. + */ +static void +igmp_v3_process_group_timers(struct igmp_ifinfo *igi, + struct ifqueue *qrq, struct ifqueue *scq, + struct in_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from v1/v2 compatibility mode back to v3, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->inm_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + query_response_timer_expired = 1; + } else { + V_current_state_timers_running = 1; + } + + if (inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + V_state_change_timers_running = 1; + } + + /* We are in fasttimo, so be quick about it. */ + if (!state_change_retransmit_timer_expired && + !query_response_timer_expired) + return; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + /* + * Respond to a previously pending Group-Specific + * or Group-and-Source-Specific query by enqueueing + * the appropriate Current-State report for + * immediate transmission. + */ + if (query_response_timer_expired) { + int retval; + + retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1, + (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + inm->inm_state = IGMP_REPORTING_MEMBER; + /* XXX Clear recorded sources for next time. */ + inm_clear_recorded(inm); + } + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + case IGMP_LEAVING_MEMBER: + if (state_change_retransmit_timer_expired) { + /* + * State-change retransmission timer fired. + * If there are any further pending retransmissions, + * set the global pending state-change flag, and + * reset the timer. + */ + if (--inm->inm_scrv > 0) { + inm->inm_sctimer = uri_fasthz; + V_state_change_timers_running = 1; + } + /* + * Retransmit the previously computed state-change + * report. 
If there are no further pending + * retransmissions, the mbuf queue will be consumed. + * Update T0 state to T1 as we have now sent + * a state-change. + */ + (void)igmp_v3_merge_state_changes(inm, scq); + + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + + /* + * If we are leaving the group for good, make sure + * we release IGMP's reference to it. + * This release must be deferred using a SLIST, + * as we are called from a loop which traverses + * the in_ifmultiaddr TAILQ. + */ + if (inm->inm_state == IGMP_LEAVING_MEMBER && + inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + SLIST_INSERT_HEAD(&igi->igi_relinmhead, + inm, inm_nrele); + } + } + break; + } +} + + +/* + * Suppress a group's pending response to a group or source/group query. + * + * Do NOT suppress state changes. This leads to IGMPv3 inconsistency. + * Do NOT update ST1/ST0 as this operation merely suppresses + * the currently pending group record. + * Do NOT suppress the response to a general query. It is possible but + * it would require adding another state or flag. + */ +static void +igmp_v3_suppress_group_record(struct in_multi *inm) +{ + + IN_MULTI_LOCK_ASSERT(); + + KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3, + ("%s: not IGMPv3 mode on link", __func__)); + + if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER && + inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER) + return; + + if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) + inm_clear_recorded(inm); + + inm->inm_timer = 0; + inm->inm_state = IGMP_REPORTING_MEMBER; +} + +/* + * Switch to a different IGMP version on the given interface, + * as per Section 7.2.1. + */ +static void +igmp_set_version(struct igmp_ifinfo *igi, const int version) +{ + int old_version_timer; + + IGMP_LOCK_ASSERT(); + + CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__, + version, igi->igi_ifp, igi->igi_ifp->if_xname); + + if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) { + /* + * Compute the "Older Version Querier Present" timer as per + * Section 8.12. + */ + old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; + old_version_timer *= PR_SLOWHZ; + + if (version == IGMP_VERSION_1) { + igi->igi_v1_timer = old_version_timer; + igi->igi_v2_timer = 0; + } else if (version == IGMP_VERSION_2) { + igi->igi_v1_timer = 0; + igi->igi_v2_timer = old_version_timer; + } + } + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + if (igi->igi_version != IGMP_VERSION_2) { + igi->igi_version = IGMP_VERSION_2; + igmp_v3_cancel_link_timers(igi); + } + } else if (igi->igi_v1_timer > 0) { + if (igi->igi_version != IGMP_VERSION_1) { + igi->igi_version = IGMP_VERSION_1; + igmp_v3_cancel_link_timers(igi); + } + } +} + +/* + * Cancel pending IGMPv3 timers for the given link and all groups + * joined on it; state-change, general-query, and group-query timers. + * + * Only ever called on a transition from v3 to Compatibility mode. Kill + * the timers stone dead (this may be expensive for large N groups); they + * will be restarted if Compatibility Mode deems that they must be due to + * query processing. + */ +static void +igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) +{ + struct ifmultiaddr *ifma; + struct ifnet *ifp; + struct in_multi *inm; + + CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__, + igi->igi_ifp, igi->igi_ifp->if_xname); + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + /* + * Stop the v3 General Query Response on this link stone dead.
+ * If fasttimo is woken up due to V_interface_timers_running, + * the flag will be cleared if there are no pending link timers. + */ + igi->igi_v3_timer = 0; + + /* + * Now clear the current-state and state-change report timers + * for all memberships scoped to this link. + */ + ifp = igi->igi_ifp; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* + * These states are either not relevant in v3 mode, + * or are unreported. Do nothing. + */ + break; + case IGMP_LEAVING_MEMBER: + /* + * If we are leaving the group and switching to + * compatibility mode, we need to release the final + * reference held for issuing the INCLUDE {}, and + * transition to REPORTING to ensure the host leave + * message is sent upstream to the old querier -- + * transition to NOT would lose the leave and race. + * + * SMPNG: Must drop and re-acquire IF_ADDR_LOCK + * around inm_release_locked(), as it is not + * a recursive mutex. + */ + IF_ADDR_UNLOCK(ifp); + inm_release_locked(inm); + IF_ADDR_LOCK(ifp); + /* FALLTHROUGH */ + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + inm_clear_recorded(inm); + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + break; + } + /* + * Always clear state-change and group report timers. + * Free any pending IGMPv3 state-change records. + */ + inm->inm_sctimer = 0; + inm->inm_timer = 0; + _IF_DRAIN(&inm->inm_scq); + } + IF_ADDR_UNLOCK(ifp); +} + +/* + * Update the Older Version Querier Present timers for a link. + * See Section 7.2.1 of RFC 3376. + */ +static void +igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) +{ + + IGMP_LOCK_ASSERT(); + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { + /* + * IGMPv1 and IGMPv2 Querier Present timers expired. + * + * Revert to IGMPv3. + */ + if (igi->igi_version != IGMP_VERSION_3) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_version = IGMP_VERSION_3; + } + } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + /* + * IGMPv1 Querier Present timer expired, + * IGMPv2 Querier Present timer running. + * If IGMPv2 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv2 is enabled, revert to IGMPv2. + */ + if (!V_igmp_v2enable) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v2_timer = 0; + igi->igi_version = IGMP_VERSION_3; + } else { + --igi->igi_v2_timer; + if (igi->igi_version != IGMP_VERSION_2) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_2, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_version = IGMP_VERSION_2; + } + } + } else if (igi->igi_v1_timer > 0) { + /* + * IGMPv1 Querier Present timer running. + * Stop IGMPv2 timer if running. + * + * If IGMPv1 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv1 is enabled, reset IGMPv2 timer if running. 
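+ *
+ * Editor's summary of the precedence implemented below (sketch only):
+ *
+ *	if (igi->igi_v1_timer > 0)	 stay at IGMP_VERSION_1
+ *	else if (igi->igi_v2_timer > 0)	 stay at IGMP_VERSION_2
+ *	else				 revert to IGMP_VERSION_3
+ *
+ * i.e. the oldest querier still believed present wins, per Section
+ * 7.2.1 of RFC 3376.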
+ */ + if (!V_igmp_v1enable) { + CTR5(KTR_IGMPV3, + "%s: transition from v%d -> v%d on %p(%s)", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v1_timer = 0; + igi->igi_version = IGMP_VERSION_3; + } else { + --igi->igi_v1_timer; + } + if (igi->igi_v2_timer > 0) { + CTR3(KTR_IGMPV3, + "%s: cancel v2 timer on %p(%s)", + __func__, igi->igi_ifp, igi->igi_ifp->if_xname); + igi->igi_v2_timer = 0; + } + } +} + +/* + * Global slowtimo handler. + * VIMAGE: Timeout handlers are expected to service all vimages. + */ +void +igmp_slowtimo(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + igmp_slowtimo_vnet(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Per-vnet slowtimo handler. + */ +static void +igmp_slowtimo_vnet(void) +{ + struct igmp_ifinfo *igi; + + IGMP_LOCK(); + + LIST_FOREACH(igi, &V_igi_head, igi_link) { + igmp_v1v2_process_querier_timers(igi); + } + + IGMP_UNLOCK(); +} + +/* + * Dispatch an IGMPv1/v2 host report or leave message. + * These are always small enough to fit inside a single mbuf. + */ +static int +igmp_v1v2_queue_report(struct in_multi *inm, const int type) +{ + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + struct mbuf *m; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + ifp = inm->inm_ifp; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOMEM); + MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); + + m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); + + m->m_data += sizeof(struct ip); + m->m_len = sizeof(struct igmp); + + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + sizeof(struct igmp); + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + + if (type == IGMP_HOST_LEAVE_MESSAGE) + ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); + else + ip->ip_dst = inm->inm_addr; + + igmp_save_context(m, ifp); + + m->m_flags |= M_IGMPV2; + if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) + m->m_flags |= M_IGMP_LOOP; + + CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m); + netisr_dispatch(NETISR_IGMP, m); + + return (0); +} + +/* + * Process a state change from the upper layer for the given IPv4 group. + * + * Each socket holds a reference on the in_multi in its own ip_moptions. + * The socket layer will have made the necessary updates to the group + * state; it is now up to IGMP to issue a state change report if there + * has been any change between T0 (when the last state-change was issued) + * and T1 (now). + * + * We use the IGMPv3 state machine at group level. The IGMP module + * however makes the decision as to which IGMP protocol version to speak. + * A state change *from* INCLUDE {} always means an initial join. + * A state change *to* INCLUDE {} always means a final leave. + * + * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can + * save ourselves a bunch of work; any exclusive mode groups need not + * compute source filter lists. + * + * VIMAGE: curvnet should have been set by caller, as this routine + * is called from the socket option handlers.
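+ *
+ * Editor's sketch of the dispatch below, in terms of the T0/T1
+ * filter modes this function compares:
+ *
+ *	T0 == MCAST_UNDEFINED	-> igmp_initial_join()
+ *	T1 == MCAST_UNDEFINED	-> igmp_final_leave()
+ *	otherwise		-> igmp_handle_state_change()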
+ */ +int +igmp_change_state(struct in_multi *inm) +{ + struct igmp_ifinfo *igi; + struct ifnet *ifp; + int error; + + IN_MULTI_LOCK_ASSERT(); + + error = 0; + + /* + * Try to detect if the upper layer just asked us to change state + * for an interface which has now gone away. + */ + KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); + ifp = inm->inm_ifma->ifma_ifp; + if (ifp != NULL) { + /* + * Sanity check that netinet's notion of ifp is the + * same as net's. + */ + KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); + } + + IGMP_LOCK(); + + igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp; + KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp)); + + /* + * If we detect a state transition to or from MCAST_UNDEFINED + * for this group, then we are starting or finishing an IGMP + * life cycle for this group. + */ + if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { + CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__, + inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode); + if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { + CTR1(KTR_IGMPV3, "%s: initial join", __func__); + error = igmp_initial_join(inm, igi); + goto out_locked; + } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { + CTR1(KTR_IGMPV3, "%s: final leave", __func__); + igmp_final_leave(inm, igi); + goto out_locked; + } + } else { + CTR1(KTR_IGMPV3, "%s: filter set change", __func__); + } + + error = igmp_handle_state_change(inm, igi); + +out_locked: + IGMP_UNLOCK(); + return (error); +} + +/* + * Perform the initial join for an IGMP group. + * + * When joining a group: + * If the group should have its IGMP traffic suppressed, do nothing. + * IGMPv1 starts sending IGMPv1 host membership reports. + * IGMPv2 starts sending IGMPv2 host membership reports. + * IGMPv3 will schedule an IGMPv3 state-change report containing the + * initial state of the membership. + */ +static int +igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct ifqueue *ifq; + int error, retval, syncstates; + + CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + error = 0; + syncstates = 1; + + ifp = inm->inm_ifp; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); + + /* + * Groups joined on loopback or marked as 'not reported', + * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and + * are never reported in any IGMP protocol exchanges. + * All other groups enter the appropriate IGMP state machine + * for the version in use on this link. + * A link marked as IGIF_SILENT causes IGMP to be completely + * disabled for the link. + */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr)) { + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + inm->inm_state = IGMP_SILENT_MEMBER; + inm->inm_timer = 0; + } else { + /* + * Deal with overlapping in_multi lifecycle. + * If this group was LEAVING, then make sure + * we drop the reference we picked up to keep the + * group around for the final INCLUDE {} enqueue. 
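+ *
+ * (Editor's note: the reference dropped here is the one taken with
+ * inm_acquire_locked() in igmp_final_leave() when the pending
+ * TO_IN {} record was enqueued; a re-join before that record goes
+ * out must balance it.)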
+ */ + if (igi->igi_version == IGMP_VERSION_3 && + inm->inm_state == IGMP_LEAVING_MEMBER) + inm_release_locked(inm); + + inm->inm_state = IGMP_REPORTING_MEMBER; + + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + inm->inm_state = IGMP_IDLE_MEMBER; + error = igmp_v1v2_queue_report(inm, + (igi->igi_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + if (error == 0) { + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_FASTHZ); + V_current_state_timers_running = 1; + } + break; + + case IGMP_VERSION_3: + /* + * Defer update of T0 to T1, until the first copy + * of the state change has been transmitted. + */ + syncstates = 0; + + /* + * Immediately enqueue a State-Change Report for + * this interface, freeing any previous reports. + * Don't kick the timers if there is nothing to do, + * or if an error occurred. + */ + ifq = &inm->inm_scq; + _IF_DRAIN(ifq); + retval = igmp_v3_enqueue_group_record(ifq, inm, 1, + 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + if (retval <= 0) { + error = retval * -1; + break; + } + + /* + * Schedule transmission of pending state-change + * report up to RV times for this link. The timer + * will fire at the next igmp_fasttimo (~200ms), + * giving us an opportunity to merge the reports. + */ + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + KASSERT(igi->igi_rv > 1, + ("%s: invalid robustness %d", __func__, + igi->igi_rv)); + inm->inm_scrv = igi->igi_rv; + } + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + + error = 0; + break; + } + } + + /* + * Only update the T0 state if state change is atomic, + * i.e. we don't need to wait for a timer to fire before we + * can consider the state change to have been communicated. + */ + if (syncstates) { + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + } + + return (error); +} + +/* + * Issue an intermediate state change during the IGMP life-cycle. + */ +static int +igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + int retval; + + CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + ifp = inm->inm_ifp; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__)); + + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr) || + (igi->igi_version != IGMP_VERSION_3)) { + if (!igmp_isgroupreported(inm->inm_addr)) { + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + } + CTR1(KTR_IGMPV3, "%s: nothing to do", __func__); + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + return (0); + } + + _IF_DRAIN(&inm->inm_scq); + + retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval); + if (retval <= 0) + return (-retval); + + /* + * If record(s) were enqueued, start the state-change + * report timer for this group. + */ + inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + + return (0); +} + +/* + * Perform the final leave for an IGMP group. + * + * When leaving a group: + * IGMPv1 does nothing. 
+ * IGMPv2 sends a host leave message, if and only if we are the reporter. + * IGMPv3 enqueues a state-change report containing a transition + * to INCLUDE {} for immediate transmission. + */ +static void +igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + int syncstates; + + syncstates = 1; + + CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_xname); + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_LEAVING_MEMBER: + /* Already leaving or left; do nothing. */ + CTR1(KTR_IGMPV3, +"%s: not kicking state machine for silent group", __func__); + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + if (igi->igi_version == IGMP_VERSION_2) { +#ifdef INVARIANTS + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) + panic("%s: IGMPv3 state reached, not IGMPv3 mode", + __func__); +#endif + igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); + inm->inm_state = IGMP_NOT_MEMBER; + } else if (igi->igi_version == IGMP_VERSION_3) { + /* + * Stop group timer and all pending reports. + * Immediately enqueue a state-change report + * TO_IN {} to be sent on the next fast timeout, + * giving us an opportunity to merge reports. + */ + _IF_DRAIN(&inm->inm_scq); + inm->inm_timer = 0; + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + inm->inm_scrv = igi->igi_rv; + } + CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d " + "pending retransmissions.", __func__, + inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname, inm->inm_scrv); + if (inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + inm->inm_sctimer = 0; + } else { + int retval; + + inm_acquire_locked(inm); + + retval = igmp_v3_enqueue_group_record( + &inm->inm_scq, inm, 1, 0, 0); + KASSERT(retval != 0, + ("%s: enqueue record = %d", __func__, + retval)); + + inm->inm_state = IGMP_LEAVING_MEMBER; + inm->inm_sctimer = 1; + V_state_change_timers_running = 1; + syncstates = 0; + } + break; + } + break; + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* Our reports are suppressed; do nothing. */ + break; + } + + if (syncstates) { + inm_commit(inm); + CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname); + } +} + +/* + * Enqueue an IGMPv3 group record to the given output queue. + * + * XXX This function could do with having the allocation code + * split out, and the multiple-tree-walks coalesced into a single + * routine as has been done in igmp_v3_enqueue_filter_change(). + * + * If is_state_change is zero, a current-state record is appended. + * If is_state_change is non-zero, a state-change report is appended. + * + * If is_group_query is non-zero, an mbuf packet chain is allocated. + * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. 
+ * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. + * + * The function will attempt to allocate leading space in the packet + * for the IP/IGMP header to be prepended without fragmenting the chain. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query) +{ + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ifnet *ifp; + struct ip_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + in_addr_t naddr; + uint8_t mode; + + IN_MULTI_LOCK_ASSERT(); + + error = 0; + ifp = inm->inm_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pig = NULL; + type = IGMP_DO_NOTHING; + mode = inm->inm_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. + */ + if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && + inm->inm_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. + */ + if (mode != inm->inm_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: change to EXCLUDE", + __func__); + type = IGMP_CHANGE_TO_EXCLUDE_MODE; + } else { + CTR1(KTR_IGMPV3, "%s: change to INCLUDE", + __func__); + type = IGMP_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = IGMP_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = IGMP_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = IGMP_MODE_IS_INCLUDE; + KASSERT(inm->inm_st[1].iss_asm == 0, + ("%s: inm %p is INCLUDE but ASM count is %d", + __func__, inm, inm->inm_st[1].iss_asm)); + } + } + + /* + * Generate the filter list changes using a separate function. + */ + if (is_filter_list_change) + return (igmp_v3_enqueue_filter_change(ifq, inm)); + + if (type == IGMP_DO_NOTHING) { + CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s", + __func__, inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. 
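+ *
+ * Editor's arithmetic note (sketch): a group record header is 8
+ * bytes (record type, aux data length, source count, group address),
+ * so with one source the minimum tail space checked for below is
+ *
+ *	minrec0len = sizeof(struct igmp_grouprec) + sizeof(in_addr_t)
+ *		   = 8 + 4 = 12 bytes.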
+ */ + minrec0len = sizeof(struct igmp_grouprec); + if (record_has_sources) + minrec0len += sizeof(in_addr_t); + + CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__, + igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_xname); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. + */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + m = m0; + CTR1(KTR_IGMPV3, "%s: use existing packet", __func__); + } else { + if (_IF_QFULL(ifq)) { + CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + if (!is_state_change && !is_group_query) { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + } + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + + igmp_save_context(m, ifp); + + CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. + */ + ig.ig_type = type; + ig.ig_datalen = 0; + ig.ig_numsrc = 0; + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); + return (-ENOMEM); + } + nbytes += sizeof(struct igmp_grouprec); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, do not + * include source entries. + * Only report recorded sources in our filter set when responding + * to a group-source query. 
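+ *
+ * Editor's capacity example (assumes a 1500-byte Ethernet MTU, which
+ * is not taken from this code): IGMP_LEADINGSPACE is 32 bytes
+ * (20 IP + 4 Router Alert + 8 report header), so the first record of
+ * a fresh packet can carry up to
+ *
+ *	m0srcs = (1500 - 32 - 8) / 4 = 365 sources
+ *
+ * before a continuation packet is needed.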
+ */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + CTR1(KTR_IGMPV3, "%s: skip node", __func__); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + CTR1(KTR_IGMPV3, "%s: skip unrecorded node", + __func__); + continue; + } + CTR1(KTR_IGMPV3, "%s: append node", __func__); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", + __func__); + return (-ENOMEM); + } + nbytes += sizeof(in_addr_t); + ++msrcs; + if (msrcs == m0srcs) + break; + } + CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__, + msrcs); + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + } + + if (is_source_query && msrcs == 0) { + CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. + */ + if (m != m0) { + CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__); + m->m_pkthdr.PH_vt.vt_nrecs = 1; + _IF_ENQUEUE(ifq, m); + } else + m->m_pkthdr.PH_vt.vt_nrecs++; + + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. 
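+ *
+ * Editor's sketch of the allocation fallback repeated below:
+ *
+ *	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);	-> 2K cluster
+ *	if (m == NULL)
+ *		m = m_gethdr(M_DONTWAIT, MT_DATA);	-> plain mbuf
+ *
+ * i.e. prefer cluster storage for long source lists, degrading to a
+ * plain header mbuf rather than failing the enqueue outright.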
+ */ + while (nims != NULL) { + if (_IF_QFULL(ifq)) { + CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + igmp_save_context(m, ifp); + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); + CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__); + + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__); + return (-ENOMEM); + } + m->m_pkthdr.PH_vt.vt_nrecs = 1; + nbytes += sizeof(struct igmp_grouprec); + + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + CTR1(KTR_IGMPV3, "%s: skip node", __func__); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + CTR1(KTR_IGMPV3, "%s: skip unrecorded node", + __func__); + continue; + } + CTR1(KTR_IGMPV3, "%s: append node", __func__); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, "%s: m_append() failed.", + __func__); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + + CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__); + _IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. + */ +typedef enum { + REC_NONE = 0x00, /* MCAST_UNDEFINED */ + REC_ALLOW = 0x01, /* MCAST_INCLUDE */ + REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ + REC_FULL = REC_ALLOW | REC_BLOCK +} rectype_t; + +/* + * Enqueue an IGMPv3 filter list change to the given output queue. + * + * Source list filter state is held in an RB-tree. When the filter list + * for a group is changed without changing its mode, we need to compute + * the deltas between T0 and T1 for each source in the filter set, + * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. + * + * As we may potentially queue two record types, and the entire R-B tree + * needs to be walked at once, we break this out into its own function + * so we can generate a tightly packed queue of packets. + * + * XXX This could be written to only use one tree walk, although that makes + * serializing into the mbuf chains a bit harder. For now we do two walks + * which makes things easier on us, and it may or may not be harder on + * the L2 cache. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. 
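+ *
+ * Editor's sketch of the two-pass loop below: crt is the record type
+ * emitted on the current pass, drt the mask of completed types:
+ *
+ *	while (drt != REC_FULL) {
+ *		... walk the tree, appending only crt-type deltas ...
+ *		drt |= crt;
+ *		crt = ~crt & REC_FULL;	-> flip ALLOW_NEW/BLOCK_OLD
+ *	}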
+ */ +static int +igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) +{ + static const int MINRECLEN = + sizeof(struct igmp_grouprec) + sizeof(in_addr_t); + struct ifnet *ifp; + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ip_msource *ims, *nims; + struct mbuf *m, *m0, *md; + in_addr_t naddr; + int m0srcs, nbytes, npbytes, off, rsrcs, schanged; + int nallow, nblock; + uint8_t mode, now, then; + rectype_t crt, drt, nrt; + + IN_MULTI_LOCK_ASSERT(); + + if (inm->inm_nsrc == 0 || + (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) + return (0); + + ifp = inm->inm_ifp; /* interface */ + mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ + crt = REC_NONE; /* current group record type */ + drt = REC_NONE; /* mask of completed group record types */ + nrt = REC_NONE; /* record type for current node */ + m0srcs = 0; /* # source which will fit in current mbuf chain */ + nbytes = 0; /* # of bytes appended to group's state-change queue */ + npbytes = 0; /* # of bytes appended this packet */ + rsrcs = 0; /* # sources encoded in current record */ + schanged = 0; /* # nodes encoded in overall filter change */ + nallow = 0; /* # of source entries in ALLOW_NEW */ + nblock = 0; /* # of source entries in BLOCK_OLD */ + nims = NULL; /* next tree node pointer */ + + /* + * For each possible filter record mode. + * The first kind of source we encounter tells us which + * is the first kind of record we start appending. + * If a node transitioned to UNDEFINED at t1, its mode is treated + * as the inverse of the group's filter mode. + */ + while (drt != REC_FULL) { + do { + m0 = ifq->ifq_tail; + if (m0 != NULL && + (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= + IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + MINRECLEN) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m = m0; + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + CTR1(KTR_IGMPV3, + "%s: use previous packet", __func__); + } else { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) { + CTR1(KTR_IGMPV3, + "%s: m_get*() failed", __func__); + return (-ENOMEM); + } + m->m_pkthdr.PH_vt.vt_nrecs = 0; + igmp_save_context(m, ifp); + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + npbytes = 0; + CTR1(KTR_IGMPV3, + "%s: allocated new packet", __func__); + } + /* + * Append the IGMP group record header to the + * current packet's data area. + * Recalculate pointer to free space for next + * group record, in case m_append() allocated + * a new mbuf or cluster. + */ + memset(&ig, 0, sizeof(ig)); + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(ig), (void *)&ig)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, + "%s: m_append() failed", __func__); + return (-ENOMEM); + } + npbytes += sizeof(struct igmp_grouprec); + if (m != m0) { + /* new packet; offset in chain */ + md = m_getptr(m, npbytes - + sizeof(struct igmp_grouprec), &off); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + off); + } else { + /* current packet; offset from last append */ + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + md->m_len - + sizeof(struct igmp_grouprec)); + } + /* + * Begin walking the tree for this record type + * pass, or continue from where we left off + * previously if we had to allocate a new packet. + * Only report deltas in-mode at t1.
+ * We need not report included sources as allowed + * if we are in inclusive mode on the group, + * however the converse is not true. + */ + rsrcs = 0; + if (nims == NULL) + nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + CTR2(KTR_IGMPV3, "%s: visit node %s", + __func__, inet_ntoa_haddr(ims->ims_haddr)); + now = ims_get_mode(inm, ims, 1); + then = ims_get_mode(inm, ims, 0); + CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d", + __func__, then, now); + if (now == then) { + CTR1(KTR_IGMPV3, + "%s: skip unchanged", __func__); + continue; + } + if (mode == MCAST_EXCLUDE && + now == MCAST_INCLUDE) { + CTR1(KTR_IGMPV3, + "%s: skip IN src on EX group", + __func__); + continue; + } + nrt = (rectype_t)now; + if (nrt == REC_NONE) + nrt = (rectype_t)(~mode & REC_FULL); + if (schanged++ == 0) { + crt = nrt; + } else if (crt != nrt) + continue; + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), + (void *)&naddr)) { + if (m != m0) + m_freem(m); + CTR1(KTR_IGMPV3, + "%s: m_append() failed", __func__); + return (-ENOMEM); + } + nallow += !!(crt == REC_ALLOW); + nblock += !!(crt == REC_BLOCK); + if (++rsrcs == m0srcs) + break; + } + /* + * If we did not append any tree nodes on this + * pass, back out of allocations. + */ + if (rsrcs == 0) { + npbytes -= sizeof(struct igmp_grouprec); + if (m != m0) { + CTR1(KTR_IGMPV3, + "%s: m_free(m)", __func__); + m_freem(m); + } else { + CTR1(KTR_IGMPV3, + "%s: m_adj(m, -ig)", __func__); + m_adj(m, -((int)sizeof( + struct igmp_grouprec))); + } + continue; + } + npbytes += (rsrcs * sizeof(in_addr_t)); + if (crt == REC_ALLOW) + pig->ig_type = IGMP_ALLOW_NEW_SOURCES; + else if (crt == REC_BLOCK) + pig->ig_type = IGMP_BLOCK_OLD_SOURCES; + pig->ig_numsrc = htons(rsrcs); + /* + * Count the new group record, and enqueue this + * packet if it wasn't already queued. + */ + m->m_pkthdr.PH_vt.vt_nrecs++; + if (m != m0) + _IF_ENQUEUE(ifq, m); + nbytes += npbytes; + } while (nims != NULL); + drt |= crt; + crt = (~crt & REC_FULL); + } + + CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__, + nallow, nblock); + + return (nbytes); +} + +static int +igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) +{ + struct ifqueue *gq; + struct mbuf *m; /* pending state-change */ + struct mbuf *m0; /* copy of pending state-change */ + struct mbuf *mt; /* last state-change in packet */ + int docopy, domerge; + u_int recslen; + + docopy = 0; + domerge = 0; + recslen = 0; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + /* + * If there are further pending retransmissions, make a writable + * copy of each queued state-change message before merging. + */ + if (inm->inm_scrv > 0) + docopy = 1; + + gq = &inm->inm_scq; +#ifdef KTR + if (gq->ifq_head == NULL) { + CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty", + __func__, inm); + } +#endif + + m = gq->ifq_head; + while (m != NULL) { + /* + * Only merge the report into the current packet if + * there is sufficient space to do so; an IGMPv3 report + * packet may only contain 65,535 group records. + * Always use a simple mbuf chain concatenation to do this, + * as large state changes for single groups may have + * allocated clusters.
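+ *
+ * Editor's summary of the merge test below: a pending report m may
+ * be folded into the tail packet mt only if both
+ *
+ *	mt->m_pkthdr.PH_vt.vt_nrecs + m->m_pkthdr.PH_vt.vt_nrecs
+ *		<= IGMP_V3_REPORT_MAXRECS
+ *	mt->m_pkthdr.len + m_length(m, NULL)
+ *		<= if_mtu - IGMP_LEADINGSPACE
+ *
+ * hold; otherwise m is enqueued (or, while retransmissions remain,
+ * a writable copy of it is) as a packet of its own.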
+ */ + domerge = 0; + mt = ifscq->ifq_tail; + if (mt != NULL) { + recslen = m_length(m, NULL); + + if ((mt->m_pkthdr.PH_vt.vt_nrecs + + m->m_pkthdr.PH_vt.vt_nrecs <= + IGMP_V3_REPORT_MAXRECS) && + (mt->m_pkthdr.len + recslen <= + (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE))) + domerge = 1; + } + + if (!domerge && _IF_QFULL(gq)) { + CTR2(KTR_IGMPV3, + "%s: outbound queue full, skipping whole packet %p", + __func__, m); + mt = m->m_nextpkt; + if (!docopy) + m_freem(m); + m = mt; + continue; + } + + if (!docopy) { + CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m); + _IF_DEQUEUE(gq, m0); + m = m0->m_nextpkt; + } else { + CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m); + m0 = m_dup(m, M_NOWAIT); + if (m0 == NULL) + return (ENOMEM); + m0->m_nextpkt = NULL; + m = m->m_nextpkt; + } + + if (!domerge) { + CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p)", + __func__, m0, ifscq); + _IF_ENQUEUE(ifscq, m0); + } else { + struct mbuf *mtl; /* last mbuf of packet mt */ + + CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p)", + __func__, m0, mt); + + mtl = m_last(mt); + m0->m_flags &= ~M_PKTHDR; + mt->m_pkthdr.len += recslen; + mt->m_pkthdr.PH_vt.vt_nrecs += + m0->m_pkthdr.PH_vt.vt_nrecs; + + mtl->m_next = m0; + } + } + + return (0); +} + +/* + * Respond to a pending IGMPv3 General Query. + */ +static void +igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi) +{ + struct ifmultiaddr *ifma, *tifma; + struct ifnet *ifp; + struct in_multi *inm; + int retval, loop; + + IN_MULTI_LOCK_ASSERT(); + IGMP_LOCK_ASSERT(); + + KASSERT(igi->igi_version == IGMP_VERSION_3, + ("%s: called when version %d", __func__, igi->igi_version)); + + ifp = igi->igi_ifp; + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + + inm = (struct in_multi *)ifma->ifma_protospec; + KASSERT(ifp == inm->inm_ifp, + ("%s: inconsistent ifp", __func__)); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + retval = igmp_v3_enqueue_group_record(&igi->igi_gq, + inm, 0, 0, 0); + CTR2(KTR_IGMPV3, "%s: enqueue record = %d", + __func__, retval); + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + } + IF_ADDR_UNLOCK(ifp); + + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop); + + /* + * Slew transmission of bursts over 500ms intervals. + */ + if (igi->igi_gq.ifq_head != NULL) { + igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY( + IGMP_RESPONSE_BURST_INTERVAL); + V_interface_timers_running = 1; + } +} + +/* + * Transmit the next pending IGMP message in the output queue. + * + * We get called from netisr_processqueue(). A mutex private to igmpoq + * will be acquired and released around this routine. + * + * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis. + * MRT: Nothing needs to be done, as IGMP traffic is always local to + * a link and uses a link-scope multicast address. 
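+ *
+ * Editor's sketch of the vnet bracketing used below:
+ *
+ *	CURVNET_SET((struct vnet *)(m->m_pkthdr.header));
+ *	... look up ifp by the saved index, encapsulate, ip_output() ...
+ *	CURVNET_RESTORE();
+ *
+ * The pointer was stashed per-mbuf by igmp_save_context() when the
+ * packet was queued.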
+ */ +static void +igmp_intr(struct mbuf *m) +{ + struct ip_moptions imo; + struct ifnet *ifp; + struct mbuf *ipopts, *m0; + int error; + uint32_t ifindex; + + CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m); + + /* + * Set VNET image pointer from enqueued mbuf chain + * before doing anything else. Whilst we use interface + * indexes to guard against interface detach, they are + * unique to each VIMAGE and must be retrieved. + */ + CURVNET_SET((struct vnet *)(m->m_pkthdr.header)); + ifindex = igmp_restore_context(m); + + /* + * Check if the ifnet still exists. This limits the scope of + * any race in the absence of a global ifp lock for low cost + * (an array lookup). + */ + ifp = ifnet_byindex(ifindex); + if (ifp == NULL) { + CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.", + __func__, m, ifindex); + m_freem(m); + IPSTAT_INC(ips_noroute); + goto out; + } + + ipopts = V_igmp_sendra ? m_raopt : NULL; + + imo.imo_multicast_ttl = 1; + imo.imo_multicast_vif = -1; + imo.imo_multicast_loop = (V_ip_mrouter != NULL); + + /* + * If the user requested that IGMP traffic be explicitly + * redirected to the loopback interface (e.g. they are running a + * MANET interface and the routing protocol needs to see the + * updates), handle this now. + */ + if (m->m_flags & M_IGMP_LOOP) + imo.imo_multicast_ifp = V_loif; + else + imo.imo_multicast_ifp = ifp; + + if (m->m_flags & M_IGMPV2) { + m0 = m; + } else { + m0 = igmp_v3_encap_report(ifp, m); + if (m0 == NULL) { + CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m); + m_freem(m); + IPSTAT_INC(ips_odropped); + goto out; + } + } + + igmp_scrub_context(m0); + m->m_flags &= ~(M_PROTOFLAGS); + m0->m_pkthdr.rcvif = V_loif; +#ifdef MAC + mac_netinet_igmp_send(ifp, m0); +#endif + error = ip_output(m0, ipopts, NULL, 0, &imo, NULL); + if (error) { + CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error); + goto out; + } + + IGMPSTAT_INC(igps_snd_reports); + +out: + /* + * We must restore the existing vnet pointer before + * continuing as we are run from netisr context. + */ + CURVNET_RESTORE(); +} + +/* + * Encapsulate an IGMPv3 report. + * + * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf + * chain has already had its IP/IGMPv3 header prepended. In this case + * the function will not attempt to prepend; the lengths and checksums + * will however be re-computed. + * + * Returns a pointer to the new mbuf chain head, or NULL if the + * allocation failed. 
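+ *
+ * Editor's sketch of the resulting layout (20-byte IP header
+ * assumed, matching IGMP_LEADINGSPACE):
+ *
+ *	| struct ip (20) | struct igmp_report (8) | group records ... |
+ *
+ * ir_numgrps is taken from m_pkthdr.PH_vt.vt_nrecs, and ir_cksum is
+ * computed over the report header plus all records (igmpreclen bytes).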
+ */ +static struct mbuf * +igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + struct igmp_report *igmp; + struct ip *ip; + int hdrlen, igmpreclen; + + KASSERT((m->m_flags & M_PKTHDR), + ("%s: mbuf chain %p is !M_PKTHDR", __func__, m)); + + igmpreclen = m_length(m, NULL); + hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); + + if (m->m_flags & M_IGMPV3_HDR) { + igmpreclen -= hdrlen; + } else { + M_PREPEND(m, hdrlen, M_DONTWAIT); + if (m == NULL) + return (NULL); + m->m_flags |= M_IGMPV3_HDR; + } + + CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen); + + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + + igmp = mtod(m, struct igmp_report *); + igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; + igmp->ir_rsv1 = 0; + igmp->ir_rsv2 = 0; + igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs); + igmp->ir_cksum = 0; + igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); + m->m_pkthdr.PH_vt.vt_nrecs = 0; + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; + ip->ip_len = hdrlen + igmpreclen; + ip->ip_off = IP_DF; + ip->ip_p = IPPROTO_IGMP; + ip->ip_sum = 0; + + ip->ip_src.s_addr = INADDR_ANY; + + if (m->m_flags & M_IGMP_LOOP) { + struct in_ifaddr *ia; + + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + ip->ip_src = ia->ia_addr.sin_addr; + ifa_free(&ia->ia_ifa); + } + } + + ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); + + return (m); +} + +#ifdef KTR +static char * +igmp_rec_type_to_str(const int type) +{ + + switch (type) { + case IGMP_CHANGE_TO_EXCLUDE_MODE: + return "TO_EX"; + break; + case IGMP_CHANGE_TO_INCLUDE_MODE: + return "TO_IN"; + break; + case IGMP_MODE_IS_EXCLUDE: + return "MODE_EX"; + break; + case IGMP_MODE_IS_INCLUDE: + return "MODE_IN"; + break; + case IGMP_ALLOW_NEW_SOURCES: + return "ALLOW_NEW"; + break; + case IGMP_BLOCK_OLD_SOURCES: + return "BLOCK_OLD"; + break; + default: + break; + } + return "unknown"; +} +#endif + +static void +igmp_init(void *unused __unused) +{ + + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + + IGMP_LOCK_INIT(); + + m_raopt = igmp_ra_alloc(); + + netisr_register(&igmp_nh); +} +SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL); + +static void +igmp_uninit(void *unused __unused) +{ + + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + + netisr_unregister(&igmp_nh); + + m_free(m_raopt); + m_raopt = NULL; + + IGMP_LOCK_DESTROY(); +} +SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL); + +static void +vnet_igmp_init(const void *unused __unused) +{ + + CTR1(KTR_IGMPV3, "%s: initializing", __func__); + + LIST_INIT(&V_igi_head); +} +VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init, + NULL); + +static void +vnet_igmp_uninit(const void *unused __unused) +{ + + CTR1(KTR_IGMPV3, "%s: tearing down", __func__); + + KASSERT(LIST_EMPTY(&V_igi_head), + ("%s: igi list not empty; ifnets not detached?", __func__)); +} +VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY, + vnet_igmp_uninit, NULL); + +static int +igmp_modevent(module_t mod, int type, void *unused __unused) +{ + + switch (type) { + case MOD_LOAD: + case MOD_UNLOAD: + break; + default: + return (EOPNOTSUPP); + } + return (0); +} + +static moduledata_t igmp_mod = { + "igmp", + igmp_modevent, + 0 +}; +DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/freebsd/sys/netinet/igmp.h b/freebsd/sys/netinet/igmp.h new file mode 100644 index 00000000..f328d21f --- 
/dev/null +++ b/freebsd/sys/netinet/igmp.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/igmp_var.h b/freebsd/sys/netinet/igmp_var.h new file mode 100644 index 00000000..e1abe6ab --- /dev/null +++ b/freebsd/sys/netinet/igmp_var.h @@ -0,0 +1,225 @@ +/*- + * Copyright (c) 1988 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)igmp_var.h 8.1 (Berkeley) 7/19/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IGMP_VAR_HH_ +#define _NETINET_IGMP_VAR_HH_ + +/* + * Internet Group Management Protocol (IGMP), + * implementation-specific definitions. + * + * Written by Steve Deering, Stanford, May 1988. + * + * MULTICAST Revision: 3.5.1.3 + */ + +#ifndef BURN_BRIDGES +/* + * Pre-IGMPV3 igmpstat structure. + */ +struct oigmpstat { + u_int igps_rcv_total; /* total IGMP messages received */ + u_int igps_rcv_tooshort; /* received with too few bytes */ + u_int igps_rcv_badsum; /* received with bad checksum */ + u_int igps_rcv_queries; /* received membership queries */ + u_int igps_rcv_badqueries; /* received invalid queries */ + u_int igps_rcv_reports; /* received membership reports */ + u_int igps_rcv_badreports; /* received invalid reports */ + u_int igps_rcv_ourreports; /* received reports for our groups */ + u_int igps_snd_reports; /* sent membership reports */ + u_int igps_rcv_toolong; /* received with too many bytes */ +}; +#endif + +/* + * IGMPv3 protocol statistics. + */ +struct igmpstat { + /* + * Structure header (to insulate ABI changes). + */ + uint32_t igps_version; /* version of this structure */ + uint32_t igps_len; /* length of this structure */ + /* + * Message statistics.
+ */ + uint64_t igps_rcv_total; /* total IGMP messages received */ + uint64_t igps_rcv_tooshort; /* received with too few bytes */ + uint64_t igps_rcv_badttl; /* received with ttl other than 1 */ + uint64_t igps_rcv_badsum; /* received with bad checksum */ + /* + * Query statistics. + */ + uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */ + uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */ + uint64_t igps_rcv_badqueries; /* received invalid queries */ + uint64_t igps_rcv_gen_queries; /* received general queries */ + uint64_t igps_rcv_group_queries;/* received group queries */ + uint64_t igps_rcv_gsr_queries; /* received group-source queries */ + uint64_t igps_drop_gsr_queries; /* dropped group-source queries */ + /* + * Report statistics. + */ + uint64_t igps_rcv_reports; /* received membership reports */ + uint64_t igps_rcv_badreports; /* received invalid reports */ + uint64_t igps_rcv_ourreports; /* received reports for our groups */ + uint64_t igps_rcv_nora; /* received w/o Router Alert option */ + uint64_t igps_snd_reports; /* sent membership reports */ + /* + * Padding for future additions. + */ + uint64_t __igps_pad[4]; +}; +#define IGPS_VERSION_3 3 /* as of FreeBSD 8.x */ +#define IGPS_VERSION3_LEN 168 + +#ifdef _KERNEL +#define IGMPSTAT_ADD(name, val) V_igmpstat.name += (val) +#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) +#endif + +#ifdef CTASSERT +CTASSERT(sizeof(struct igmpstat) == 168); +#endif + +#ifdef _KERNEL +#define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) + +#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ + +/* + * IGMP per-group states. + */ +#define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */ +#define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */ +#define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */ +#define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */ +#define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */ +#define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */ +#define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */ +#define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */ +#define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */ +#define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ + +/* + * IGMP version tag. + */ +#define IGMP_VERSION_NONE 0 /* Invalid */ +#define IGMP_VERSION_1 1 +#define IGMP_VERSION_2 2 +#define IGMP_VERSION_3 3 /* Default */ + +/* + * IGMPv3 protocol control variables. + */ +#define IGMP_RV_INIT 2 /* Robustness Variable */ +#define IGMP_RV_MIN 1 +#define IGMP_RV_MAX 7 + +#define IGMP_QI_INIT 125 /* Query Interval (s) */ +#define IGMP_QI_MIN 1 +#define IGMP_QI_MAX 255 + +#define IGMP_QRI_INIT 10 /* Query Response Interval (s) */ +#define IGMP_QRI_MIN 1 +#define IGMP_QRI_MAX 255 + +#define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define IGMP_URI_MIN 0 +#define IGMP_URI_MAX 10 + +#define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define IGMP_RESPONSE_BURST_INTERVAL (PR_FASTHZ / 2) /* 500ms */ + +/* + * IGMP-specific mbuf flags. 
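+ *
+ * (Editor's note: these alias the generic mbuf M_PROTO1..M_PROTO4
+ * bits; igmp_intr() strips M_PROTOFLAGS before the chain reaches
+ * ip_output(), so the flags never escape IGMP itself.)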
+ */ +#define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */ +#define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ +#define M_IGMP_LOOP M_PROTO4 /* transmit on loif, not real ifp */ + +/* + * Default amount of leading space for IGMPv3 to allocate at the + * beginning of its mbuf packet chains, to avoid fragmentation and + * unnecessary allocation of leading mbufs. + */ +#define RAOPT_LEN 4 /* Length of IP Router Alert option */ +#define IGMP_LEADINGSPACE \ + (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) + +/* + * Subsystem lock macros. + * The IGMP lock is only taken with IGMP. Currently it is system-wide. + * VIMAGE: The lock could be pushed to per-VIMAGE granularity in future. + */ +#define IGMP_LOCK_INIT() mtx_init(&igmp_mtx, "igmp_mtx", NULL, MTX_DEF) +#define IGMP_LOCK_DESTROY() mtx_destroy(&igmp_mtx) +#define IGMP_LOCK() mtx_lock(&igmp_mtx) +#define IGMP_LOCK_ASSERT() mtx_assert(&igmp_mtx, MA_OWNED) +#define IGMP_UNLOCK() mtx_unlock(&igmp_mtx) +#define IGMP_UNLOCK_ASSERT() mtx_assert(&igmp_mtx, MA_NOTOWNED) + +struct igmp_ifinfo; + +int igmp_change_state(struct in_multi *); +void igmp_fasttimo(void); +struct igmp_ifinfo * + igmp_domifattach(struct ifnet *); +void igmp_domifdetach(struct ifnet *); +void igmp_ifdetach(struct ifnet *); +void igmp_input(struct mbuf *, int); +void igmp_slowtimo(void); + +SYSCTL_DECL(_net_inet_igmp); + +#endif /* _KERNEL */ + +/* + * Names for IGMP sysctl objects + */ +#define IGMPCTL_STATS 1 /* statistics (read-only) */ +#define IGMPCTL_MAXID 2 + +#define IGMPCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT } \ +} +#endif diff --git a/freebsd/sys/netinet/in.c b/freebsd/sys/netinet/in.c new file mode 100644 index 00000000..64e5d329 --- /dev/null +++ b/freebsd/sys/netinet/in.c @@ -0,0 +1,1601 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (C) 2001 WIDE Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)in.c 8.4 (Berkeley) 1/9/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static int in_mask2len(struct in_addr *); +static void in_len2mask(struct in_addr *, int); +static int in_lifaddr_ioctl(struct socket *, u_long, caddr_t, + struct ifnet *, struct thread *); + +static int in_addprefix(struct in_ifaddr *, int); +static int in_scrubprefix(struct in_ifaddr *); +static void in_socktrim(struct sockaddr_in *); +static int in_ifinit(struct ifnet *, + struct in_ifaddr *, struct sockaddr_in *, int); +static void in_purgemaddrs(struct ifnet *); + +static VNET_DEFINE(int, subnetsarelocal); +#define V_subnetsarelocal VNET(subnetsarelocal) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, + &VNET_NAME(subnetsarelocal), 0, + "Treat all subnets as directly connected"); +static VNET_DEFINE(int, sameprefixcarponly); +#define V_sameprefixcarponly VNET(sameprefixcarponly) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, same_prefix_carp_only, CTLFLAG_RW, + &VNET_NAME(sameprefixcarponly), 0, + "Refuse to create same prefixes on different interfaces"); + +VNET_DECLARE(struct inpcbinfo, ripcbinfo); +#define V_ripcbinfo VNET(ripcbinfo) + +/* + * Return 1 if an internet address is for a ``local'' host + * (one to which we have a connection). If subnetsarelocal + * is true, this includes other subnets of the local net. + * Otherwise, it includes only the directly-connected (sub)nets. + */ +int +in_localaddr(struct in_addr in) +{ + register u_long i = ntohl(in.s_addr); + register struct in_ifaddr *ia; + + IN_IFADDR_RLOCK(); + if (V_subnetsarelocal) { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if ((i & ia->ia_netmask) == ia->ia_net) { + IN_IFADDR_RUNLOCK(); + return (1); + } + } + } else { + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if ((i & ia->ia_subnetmask) == ia->ia_subnet) { + IN_IFADDR_RUNLOCK(); + return (1); + } + } + } + IN_IFADDR_RUNLOCK(); + return (0); +} + +/* + * Return 1 if an internet address is for the local host and configured + * on one of its interfaces. + */ +int +in_localip(struct in_addr in) +{ + struct in_ifaddr *ia; + + IN_IFADDR_RLOCK(); + LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) { + if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) { + IN_IFADDR_RUNLOCK(); + return (1); + } + } + IN_IFADDR_RUNLOCK(); + return (0); +} + +/* + * Determine whether an IP address is in a reserved set of addresses + * that may not be forwarded, or whether datagrams to that destination + * may be forwarded. 
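+ *
+ * (Worked example, added for illustration: under the checks below,
+ * 240.0.0.1 (experimental), 224.0.0.1 (multicast), 169.254.1.1
+ * (link-local) and any class A destination in net 0 or net 127 are
+ * refused, while an ordinary unicast address such as 192.0.2.1 is
+ * forwardable.)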
+ */ +int +in_canforward(struct in_addr in) +{ + register u_long i = ntohl(in.s_addr); + register u_long net; + + if (IN_EXPERIMENTAL(i) || IN_MULTICAST(i) || IN_LINKLOCAL(i)) + return (0); + if (IN_CLASSA(i)) { + net = i & IN_CLASSA_NET; + if (net == 0 || net == (IN_LOOPBACKNET << IN_CLASSA_NSHIFT)) + return (0); + } + return (1); +} + +/* + * Trim a mask in a sockaddr + */ +static void +in_socktrim(struct sockaddr_in *ap) +{ + register char *cplim = (char *) &ap->sin_addr; + register char *cp = (char *) (&ap->sin_addr + 1); + + ap->sin_len = 0; + while (--cp >= cplim) + if (*cp) { + (ap)->sin_len = cp - (char *) (ap) + 1; + break; + } +} + +static int +in_mask2len(mask) + struct in_addr *mask; +{ + int x, y; + u_char *p; + + p = (u_char *)mask; + for (x = 0; x < sizeof(*mask); x++) { + if (p[x] != 0xff) + break; + } + y = 0; + if (x < sizeof(*mask)) { + for (y = 0; y < 8; y++) { + if ((p[x] & (0x80 >> y)) == 0) + break; + } + } + return (x * 8 + y); +} + +static void +in_len2mask(struct in_addr *mask, int len) +{ + int i; + u_char *p; + + p = (u_char *)mask; + bzero(mask, sizeof(*mask)); + for (i = 0; i < len / 8; i++) + p[i] = 0xff; + if (len % 8) + p[i] = (0xff00 >> (len % 8)) & 0xff; +} + +/* + * Generic internet control operations (ioctl's). + * + * ifp is NULL if not an interface-specific ioctl. + */ +/* ARGSUSED */ +int +in_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, + struct thread *td) +{ + register struct ifreq *ifr = (struct ifreq *)data; + register struct in_ifaddr *ia, *iap; + register struct ifaddr *ifa; + struct in_addr allhosts_addr; + struct in_addr dst; + struct in_ifinfo *ii; + struct in_aliasreq *ifra = (struct in_aliasreq *)data; + struct sockaddr_in oldaddr; + int error, hostIsNew, iaIsNew, maskIsNew; + int iaIsFirst; + + ia = NULL; + iaIsFirst = 0; + iaIsNew = 0; + allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); + + /* + * Filter out ioctls we implement directly; forward the rest on to + * in_lifaddr_ioctl() and ifp->if_ioctl(). + */ + switch (cmd) { + case SIOCAIFADDR: + case SIOCDIFADDR: + case SIOCGIFADDR: + case SIOCGIFBRDADDR: + case SIOCGIFDSTADDR: + case SIOCGIFNETMASK: + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFDSTADDR: + case SIOCSIFNETMASK: + break; + + case SIOCALIFADDR: + if (td != NULL) { + error = priv_check(td, PRIV_NET_ADDIFADDR); + if (error) + return (error); + } + if (ifp == NULL) + return (EINVAL); + return in_lifaddr_ioctl(so, cmd, data, ifp, td); + + case SIOCDLIFADDR: + if (td != NULL) { + error = priv_check(td, PRIV_NET_DELIFADDR); + if (error) + return (error); + } + if (ifp == NULL) + return (EINVAL); + return in_lifaddr_ioctl(so, cmd, data, ifp, td); + + case SIOCGLIFADDR: + if (ifp == NULL) + return (EINVAL); + return in_lifaddr_ioctl(so, cmd, data, ifp, td); + + default: + if (ifp == NULL || ifp->if_ioctl == NULL) + return (EOPNOTSUPP); + return ((*ifp->if_ioctl)(ifp, cmd, data)); + } + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + /* + * Security checks before we get involved in any work. + */ + switch (cmd) { + case SIOCAIFADDR: + case SIOCSIFADDR: + case SIOCSIFBRDADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if (td != NULL) { + error = priv_check(td, PRIV_NET_ADDIFADDR); + if (error) + return (error); + } + break; + + case SIOCDIFADDR: + if (td != NULL) { + error = priv_check(td, PRIV_NET_DELIFADDR); + if (error) + return (error); + } + break; + } + + /* + * Find address for this interface, if it exists. 
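+	 * (Illustrative note: the lookup below first consults the
+	 * INADDR_HASH chain keyed on the requested address, and only
+	 * falls back to a linear scan of the interface's address list
+	 * when no exact match is found.)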
+ * + * If an alias address was specified, find that one instead of the + * first one on the interface, if possible. + */ + dst = ((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr; + IN_IFADDR_RLOCK(); + LIST_FOREACH(iap, INADDR_HASH(dst.s_addr), ia_hash) { + if (iap->ia_ifp == ifp && + iap->ia_addr.sin_addr.s_addr == dst.s_addr) { + if (td == NULL || prison_check_ip4(td->td_ucred, + &dst) == 0) + ia = iap; + break; + } + } + if (ia != NULL) + ifa_ref(&ia->ia_ifa); + IN_IFADDR_RUNLOCK(); + if (ia == NULL) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + iap = ifatoia(ifa); + if (iap->ia_addr.sin_family == AF_INET) { + if (td != NULL && + prison_check_ip4(td->td_ucred, + &iap->ia_addr.sin_addr) != 0) + continue; + ia = iap; + break; + } + } + if (ia != NULL) + ifa_ref(&ia->ia_ifa); + IF_ADDR_UNLOCK(ifp); + } + if (ia == NULL) + iaIsFirst = 1; + + error = 0; + switch (cmd) { + case SIOCAIFADDR: + case SIOCDIFADDR: + if (ifra->ifra_addr.sin_family == AF_INET) { + struct in_ifaddr *oia; + + IN_IFADDR_RLOCK(); + for (oia = ia; ia; ia = TAILQ_NEXT(ia, ia_link)) { + if (ia->ia_ifp == ifp && + ia->ia_addr.sin_addr.s_addr == + ifra->ifra_addr.sin_addr.s_addr) + break; + } + if (ia != NULL && ia != oia) + ifa_ref(&ia->ia_ifa); + if (oia != NULL && ia != oia) + ifa_free(&oia->ia_ifa); + IN_IFADDR_RUNLOCK(); + if ((ifp->if_flags & IFF_POINTOPOINT) + && (cmd == SIOCAIFADDR) + && (ifra->ifra_dstaddr.sin_addr.s_addr + == INADDR_ANY)) { + error = EDESTADDRREQ; + goto out; + } + } + if (cmd == SIOCDIFADDR && ia == NULL) { + error = EADDRNOTAVAIL; + goto out; + } + /* FALLTHROUGH */ + case SIOCSIFADDR: + case SIOCSIFNETMASK: + case SIOCSIFDSTADDR: + if (ia == NULL) { + ia = (struct in_ifaddr *) + malloc(sizeof *ia, M_IFADDR, M_NOWAIT | + M_ZERO); + if (ia == NULL) { + error = ENOBUFS; + goto out; + } + + ifa = &ia->ia_ifa; + ifa_init(ifa); + ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; + ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; + ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; + + ia->ia_sockmask.sin_len = 8; + ia->ia_sockmask.sin_family = AF_INET; + if (ifp->if_flags & IFF_BROADCAST) { + ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); + ia->ia_broadaddr.sin_family = AF_INET; + } + ia->ia_ifp = ifp; + + ifa_ref(ifa); /* if_addrhead */ + IF_ADDR_LOCK(ifp); + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + IF_ADDR_UNLOCK(ifp); + ifa_ref(ifa); /* in_ifaddrhead */ + IN_IFADDR_WLOCK(); + TAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); + IN_IFADDR_WUNLOCK(); + iaIsNew = 1; + } + break; + + case SIOCSIFBRDADDR: + case SIOCGIFADDR: + case SIOCGIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCGIFBRDADDR: + if (ia == NULL) { + error = EADDRNOTAVAIL; + goto out; + } + break; + } + + /* + * Most paths in this switch return directly or via out. Only paths + * that remove the address break in order to hit common removal code. 
+ */ + switch (cmd) { + case SIOCGIFADDR: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + goto out; + + case SIOCGIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EINVAL; + goto out; + } + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + goto out; + + case SIOCGIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { + error = EINVAL; + goto out; + } + *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + goto out; + + case SIOCGIFNETMASK: + *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + goto out; + + case SIOCSIFDSTADDR: + if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { + error = EINVAL; + goto out; + } + oldaddr = ia->ia_dstaddr; + ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; + if (ifp->if_ioctl != NULL) { + error = (*ifp->if_ioctl)(ifp, SIOCSIFDSTADDR, + (caddr_t)ia); + if (error) { + ia->ia_dstaddr = oldaddr; + goto out; + } + } + if (ia->ia_flags & IFA_ROUTE) { + ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; + rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + ia->ia_ifa.ifa_dstaddr = + (struct sockaddr *)&ia->ia_dstaddr; + rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + } + goto out; + + case SIOCSIFBRDADDR: + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EINVAL; + goto out; + } + ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; + goto out; + + case SIOCSIFADDR: + error = in_ifinit(ifp, ia, + (struct sockaddr_in *) &ifr->ifr_addr, 1); + if (error != 0 && iaIsNew) + break; + if (error == 0) { + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + if (iaIsFirst && + (ifp->if_flags & IFF_MULTICAST) != 0) { + error = in_joingroup(ifp, &allhosts_addr, + NULL, &ii->ii_allhosts); + } + EVENTHANDLER_INVOKE(ifaddr_event, ifp); + } + error = 0; + goto out; + + case SIOCSIFNETMASK: + ia->ia_sockmask.sin_addr = ifra->ifra_addr.sin_addr; + ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); + goto out; + + case SIOCAIFADDR: + maskIsNew = 0; + hostIsNew = 1; + error = 0; + if (ia->ia_addr.sin_family == AF_INET) { + if (ifra->ifra_addr.sin_len == 0) { + ifra->ifra_addr = ia->ia_addr; + hostIsNew = 0; + } else if (ifra->ifra_addr.sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + hostIsNew = 0; + } + if (ifra->ifra_mask.sin_len) { + /* + * QL: XXX + * Need to scrub the prefix here in case + * the issued command is SIOCAIFADDR with + * the same address, but with a different + * prefix length. And if the prefix length + * is the same as before, then the call is + * un-necessarily executed here. 
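+			 *
+			 * (Hypothetical illustration of the case described,
+			 * with "em0" as a placeholder interface:
+			 * "ifconfig em0 10.0.0.1/16" issued after an earlier
+			 * "ifconfig em0 10.0.0.1/24" arrives here with the
+			 * same address but a new mask, so the stale /24
+			 * prefix is scrubbed before the /16 takes effect.)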
+ */ + in_ifscrub(ifp, ia); + ia->ia_sockmask = ifra->ifra_mask; + ia->ia_sockmask.sin_family = AF_INET; + ia->ia_subnetmask = + ntohl(ia->ia_sockmask.sin_addr.s_addr); + maskIsNew = 1; + } + if ((ifp->if_flags & IFF_POINTOPOINT) && + (ifra->ifra_dstaddr.sin_family == AF_INET)) { + in_ifscrub(ifp, ia); + ia->ia_dstaddr = ifra->ifra_dstaddr; + maskIsNew = 1; /* We lie; but the effect's the same */ + } + if (ifra->ifra_addr.sin_family == AF_INET && + (hostIsNew || maskIsNew)) + error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + if (error != 0 && iaIsNew) + goto out; + + if ((ifp->if_flags & IFF_BROADCAST) && + (ifra->ifra_broadaddr.sin_family == AF_INET)) + ia->ia_broadaddr = ifra->ifra_broadaddr; + if (error == 0) { + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + if (iaIsFirst && + (ifp->if_flags & IFF_MULTICAST) != 0) { + error = in_joingroup(ifp, &allhosts_addr, + NULL, &ii->ii_allhosts); + } + EVENTHANDLER_INVOKE(ifaddr_event, ifp); + } + goto out; + + case SIOCDIFADDR: + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ifp, ia); + + /* + * in_ifadown gets rid of all the rest of + * the routes. This is not quite the right + * thing to do, but at least if we are running + * a routing process they will come back. + */ + in_ifadown(&ia->ia_ifa, 1); + EVENTHANDLER_INVOKE(ifaddr_event, ifp); + error = 0; + break; + + default: + panic("in_control: unsupported ioctl"); + } + + IF_ADDR_LOCK(ifp); + /* Re-check that ia is still part of the list. */ + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa == &ia->ia_ifa) + break; + } + if (ifa == NULL) { + /* + * If we lost the race with another thread, there is no need to + * try it again for the next loop as there is no other exit + * path between here and out. + */ + IF_ADDR_UNLOCK(ifp); + error = EADDRNOTAVAIL; + goto out; + } + TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); + IF_ADDR_UNLOCK(ifp); + ifa_free(&ia->ia_ifa); /* if_addrhead */ + + IN_IFADDR_WLOCK(); + TAILQ_REMOVE(&V_in_ifaddrhead, ia, ia_link); + if (ia->ia_addr.sin_family == AF_INET) { + struct in_ifaddr *if_ia; + + LIST_REMOVE(ia, ia_hash); + IN_IFADDR_WUNLOCK(); + /* + * If this is the last IPv4 address configured on this + * interface, leave the all-hosts group. + * No state-change report need be transmitted. + */ + if_ia = NULL; + IFP_TO_IA(ifp, if_ia); + if (if_ia == NULL) { + ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); + IN_MULTI_LOCK(); + if (ii->ii_allhosts) { + (void)in_leavegroup_locked(ii->ii_allhosts, + NULL); + ii->ii_allhosts = NULL; + } + IN_MULTI_UNLOCK(); + } else + ifa_free(&if_ia->ia_ifa); + } else + IN_IFADDR_WUNLOCK(); + ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ +out: + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return (error); +} + +/* + * SIOC[GAD]LIFADDR. + * SIOCGLIFADDR: get first address. (?!?) + * SIOCGLIFADDR with IFLR_PREFIX: + * get first address that matches the specified prefix. + * SIOCALIFADDR: add the specified address. + * SIOCALIFADDR with IFLR_PREFIX: + * EINVAL since we can't deduce hostid part of the address. + * SIOCDLIFADDR: delete the specified address. + * SIOCDLIFADDR with IFLR_PREFIX: + * delete the first address that matches the specified prefix. 
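+ *
+ * (Hypothetical userland sketch, not part of the original source;
+ * "em0" and the socket s are placeholders. Adding 192.0.2.1/24 through
+ * this interface could look like
+ *
+ *	struct if_laddrreq req;
+ *	struct sockaddr_in *sin = (struct sockaddr_in *)&req.addr;
+ *
+ *	memset(&req, 0, sizeof(req));
+ *	strlcpy(req.iflr_name, "em0", sizeof(req.iflr_name));
+ *	sin->sin_family = AF_INET;
+ *	sin->sin_len = sizeof(*sin);
+ *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
+ *	req.prefixlen = 24;
+ *	if (ioctl(s, SIOCALIFADDR, &req) == -1)
+ *		err(1, "SIOCALIFADDR");
+ *
+ * with the usual <net/if.h>, <sys/sockio.h> and <arpa/inet.h>
+ * declarations assumed.)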
+ * return values:
+ *	EINVAL on invalid parameters
+ *	EADDRNOTAVAIL when the prefix match fails or the specified
+ *		address is not found
+ *	other values may be returned from in_ioctl()
+ */
+static int
+in_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data,
+    struct ifnet *ifp, struct thread *td)
+{
+	struct if_laddrreq *iflr = (struct if_laddrreq *)data;
+	struct ifaddr *ifa;
+
+	/* sanity checks */
+	if (data == NULL || ifp == NULL) {
+		panic("invalid argument to in_lifaddr_ioctl");
+		/*NOTREACHED*/
+	}
+
+	switch (cmd) {
+	case SIOCGLIFADDR:
+		/* address must be specified on GET with IFLR_PREFIX */
+		if ((iflr->flags & IFLR_PREFIX) == 0)
+			break;
+		/*FALLTHROUGH*/
+	case SIOCALIFADDR:
+	case SIOCDLIFADDR:
+		/* address must be specified on ADD and DELETE */
+		if (iflr->addr.ss_family != AF_INET)
+			return (EINVAL);
+		if (iflr->addr.ss_len != sizeof(struct sockaddr_in))
+			return (EINVAL);
+		/* XXX need improvement */
+		if (iflr->dstaddr.ss_family
+		 && iflr->dstaddr.ss_family != AF_INET)
+			return (EINVAL);
+		if (iflr->dstaddr.ss_family
+		 && iflr->dstaddr.ss_len != sizeof(struct sockaddr_in))
+			return (EINVAL);
+		break;
+	default:		/*shouldn't happen*/
+		return (EOPNOTSUPP);
+	}
+	if (sizeof(struct in_addr) * 8 < iflr->prefixlen)
+		return (EINVAL);
+
+	switch (cmd) {
+	case SIOCALIFADDR:
+	    {
+		struct in_aliasreq ifra;
+
+		if (iflr->flags & IFLR_PREFIX)
+			return (EINVAL);
+
+		/* copy args to in_aliasreq, perform ioctl(SIOCAIFADDR). */
+		bzero(&ifra, sizeof(ifra));
+		bcopy(iflr->iflr_name, ifra.ifra_name,
+		    sizeof(ifra.ifra_name));
+
+		bcopy(&iflr->addr, &ifra.ifra_addr, iflr->addr.ss_len);
+
+		if (iflr->dstaddr.ss_family) {	/*XXX*/
+			bcopy(&iflr->dstaddr, &ifra.ifra_dstaddr,
+			    iflr->dstaddr.ss_len);
+		}
+
+		ifra.ifra_mask.sin_family = AF_INET;
+		ifra.ifra_mask.sin_len = sizeof(struct sockaddr_in);
+		in_len2mask(&ifra.ifra_mask.sin_addr, iflr->prefixlen);
+
+		return (in_control(so, SIOCAIFADDR, (caddr_t)&ifra, ifp, td));
+	    }
+	case SIOCGLIFADDR:
+	case SIOCDLIFADDR:
+	    {
+		struct in_ifaddr *ia;
+		struct in_addr mask, candidate, match;
+		struct sockaddr_in *sin;
+
+		bzero(&mask, sizeof(mask));
+		bzero(&match, sizeof(match));
+		if (iflr->flags & IFLR_PREFIX) {
+			/* lookup a prefix rather than address.
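+			 * (Worked example, hedged: with IFLR_PREFIX set, an
+			 * addr of 192.0.2.0 with prefixlen 24 matches any
+			 * configured address inside 192.0.2.0/24, while a
+			 * request such as 192.0.2.7 with prefixlen 24, which
+			 * has bits set outside the mask, is rejected just
+			 * below.)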
+			 */
+			in_len2mask(&mask, iflr->prefixlen);
+
+			sin = (struct sockaddr_in *)&iflr->addr;
+			match.s_addr = sin->sin_addr.s_addr;
+			match.s_addr &= mask.s_addr;
+
+			/* if you set extra bits, that's wrong */
+			if (match.s_addr != sin->sin_addr.s_addr)
+				return (EINVAL);
+
+		} else {
+			/* on getting an address, take the 1st match */
+			/* on deleting an address, do exact match */
+			if (cmd != SIOCGLIFADDR) {
+				in_len2mask(&mask, 32);
+				sin = (struct sockaddr_in *)&iflr->addr;
+				match.s_addr = sin->sin_addr.s_addr;
+			}
+		}
+
+		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+			if (ifa->ifa_addr->sa_family != AF_INET)
+				continue;
+			if (match.s_addr == 0)
+				break;
+			candidate.s_addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr;
+			candidate.s_addr &= mask.s_addr;
+			if (candidate.s_addr == match.s_addr)
+				break;
+		}
+		if (ifa == NULL)
+			return (EADDRNOTAVAIL);
+		ia = (struct in_ifaddr *)ifa;
+
+		if (cmd == SIOCGLIFADDR) {
+			/* fill in the if_laddrreq structure */
+			bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len);
+
+			if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+				bcopy(&ia->ia_dstaddr, &iflr->dstaddr,
+				    ia->ia_dstaddr.sin_len);
+			} else
+				bzero(&iflr->dstaddr, sizeof(iflr->dstaddr));
+
+			iflr->prefixlen =
+			    in_mask2len(&ia->ia_sockmask.sin_addr);
+
+			iflr->flags = 0;	/*XXX*/
+
+			return (0);
+		} else {
+			struct in_aliasreq ifra;
+
+			/* fill in_aliasreq and do ioctl(SIOCDIFADDR) */
+			bzero(&ifra, sizeof(ifra));
+			bcopy(iflr->iflr_name, ifra.ifra_name,
+			    sizeof(ifra.ifra_name));
+
+			bcopy(&ia->ia_addr, &ifra.ifra_addr,
+			    ia->ia_addr.sin_len);
+			if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
+				bcopy(&ia->ia_dstaddr, &ifra.ifra_dstaddr,
+				    ia->ia_dstaddr.sin_len);
+			}
+			bcopy(&ia->ia_sockmask, &ifra.ifra_mask,
+			    ia->ia_sockmask.sin_len);
+
+			return (in_control(so, SIOCDIFADDR, (caddr_t)&ifra,
+			    ifp, td));
+		}
+	    }
+	}
+
+	return (EOPNOTSUPP);	/*just for safety*/
+}
+
+/*
+ * Delete any existing route for an interface.
+ */
+void
+in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia)
+{
+
+	in_scrubprefix(ia);
+}
+
+/*
+ * Initialize an interface's internet address
+ * and routing table entry.
+ */
+static int
+in_ifinit(struct ifnet *ifp, struct in_ifaddr *ia, struct sockaddr_in *sin,
+    int scrub)
+{
+	register u_long i = ntohl(sin->sin_addr.s_addr);
+	struct sockaddr_in oldaddr;
+	int s = splimp(), flags = RTF_UP, error = 0;
+
+	oldaddr = ia->ia_addr;
+	if (oldaddr.sin_family == AF_INET)
+		LIST_REMOVE(ia, ia_hash);
+	ia->ia_addr = *sin;
+	if (ia->ia_addr.sin_family == AF_INET) {
+		IN_IFADDR_WLOCK();
+		LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr),
+		    ia, ia_hash);
+		IN_IFADDR_WUNLOCK();
+	}
+	/*
+	 * Give the interface a chance to initialize
+	 * if this is its first address,
+	 * and to validate the address if necessary.
+	 */
+	if (ifp->if_ioctl != NULL) {
+		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
+		if (error) {
+			splx(s);
+			/* LIST_REMOVE(ia, ia_hash) is done in in_control */
+			ia->ia_addr = oldaddr;
+			IN_IFADDR_WLOCK();
+			if (ia->ia_addr.sin_family == AF_INET)
+				LIST_INSERT_HEAD(INADDR_HASH(
+				    ia->ia_addr.sin_addr.s_addr), ia, ia_hash);
+			else
+				/*
+				 * If oldaddr family is not AF_INET (e.g.
+				 * interface has just been created) in_control
+				 * does not call LIST_REMOVE, and we end up
+				 * with bogus ia entries in the hash.
+				 */
+				LIST_REMOVE(ia, ia_hash);
+			IN_IFADDR_WUNLOCK();
+			return (error);
+		}
+	}
+	splx(s);
+	if (scrub) {
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr;
+		in_ifscrub(ifp, ia);
+		ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
+	}
+	if (IN_CLASSA(i))
+		ia->ia_netmask = IN_CLASSA_NET;
+	else if (IN_CLASSB(i))
+		ia->ia_netmask = IN_CLASSB_NET;
+	else
+		ia->ia_netmask = IN_CLASSC_NET;
+	/*
+	 * The subnet mask usually includes at least the standard network part,
+	 * but may be smaller in the case of supernetting.
+	 * If it is set, we believe it.
+	 */
+	if (ia->ia_subnetmask == 0) {
+		ia->ia_subnetmask = ia->ia_netmask;
+		ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
+	} else
+		ia->ia_netmask &= ia->ia_subnetmask;
+	ia->ia_net = i & ia->ia_netmask;
+	ia->ia_subnet = i & ia->ia_subnetmask;
+	in_socktrim(&ia->ia_sockmask);
+	/*
+	 * XXX: carp(4) does not have interface route
+	 */
+	if (ifp->if_type == IFT_CARP)
+		return (0);
+	/*
+	 * Add route for the network.
+	 */
+	ia->ia_ifa.ifa_metric = ifp->if_metric;
+	if (ifp->if_flags & IFF_BROADCAST) {
+		ia->ia_broadaddr.sin_addr.s_addr =
+		    htonl(ia->ia_subnet | ~ia->ia_subnetmask);
+		ia->ia_netbroadcast.s_addr =
+		    htonl(ia->ia_net | ~ ia->ia_netmask);
+	} else if (ifp->if_flags & IFF_LOOPBACK) {
+		ia->ia_dstaddr = ia->ia_addr;
+		flags |= RTF_HOST;
+	} else if (ifp->if_flags & IFF_POINTOPOINT) {
+		if (ia->ia_dstaddr.sin_family != AF_INET)
+			return (0);
+		flags |= RTF_HOST;
+	}
+	if ((error = in_addprefix(ia, flags)) != 0)
+		return (error);
+
+	if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
+		return (0);
+
+	if (ifp->if_flags & IFF_POINTOPOINT) {
+		if (ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
+			return (0);
+	}
+
+
+	/*
+	 * add a loopback route to self
+	 */
+	if (V_useloopback && !(ifp->if_flags & IFF_LOOPBACK)) {
+		struct route ia_ro;
+
+		bzero(&ia_ro, sizeof(ia_ro));
+		*((struct sockaddr_in *)(&ia_ro.ro_dst)) = ia->ia_addr;
+		rtalloc_ign_fib(&ia_ro, 0, 0);
+		if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
+		    (ia_ro.ro_rt->rt_ifp == V_loif)) {
+			RT_LOCK(ia_ro.ro_rt);
+			RT_ADDREF(ia_ro.ro_rt);
+			RTFREE_LOCKED(ia_ro.ro_rt);
+		} else
+			error = ifa_add_loopback_route((struct ifaddr *)ia,
+			    (struct sockaddr *)&ia->ia_addr);
+		if (error == 0)
+			ia->ia_flags |= IFA_RTSELF;
+		if (ia_ro.ro_rt != NULL)
+			RTFREE(ia_ro.ro_rt);
+	}
+
+	return (error);
+}
+
+#define rtinitflags(x) \
+	((((x)->ia_ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) != 0) \
+	    ? RTF_HOST : 0)
+
+/*
+ * Generate a routing message when inserting or deleting
+ * an interface address alias.
+ */
+static void in_addralias_rtmsg(int cmd, struct in_addr *prefix,
+    struct in_ifaddr *target)
+{
+	struct route pfx_ro;
+	struct sockaddr_in *pfx_addr;
+	struct rtentry msg_rt;
+
+	/* QL: XXX
+	 * This is a bit questionable because there is no
+	 * additional route entry added/deleted for an address
+	 * alias. Therefore this route report is inaccurate.
+	 */
+	bzero(&pfx_ro, sizeof(pfx_ro));
+	pfx_addr = (struct sockaddr_in *)(&pfx_ro.ro_dst);
+	pfx_addr->sin_len = sizeof(*pfx_addr);
+	pfx_addr->sin_family = AF_INET;
+	pfx_addr->sin_addr = *prefix;
+	rtalloc_ign_fib(&pfx_ro, 0, 0);
+	if (pfx_ro.ro_rt != NULL) {
+		msg_rt = *pfx_ro.ro_rt;
+
+		/* QL: XXX
+		 * Point the gateway to the new interface
+		 * address as if a new prefix route entry has
+		 * been added through the new address alias.
+		 * All other parts of the rtentry are accurate,
+		 * e.g., rt_key, rt_mask, rt_ifp etc.
+		 */
+		msg_rt.rt_gateway =
+		    (struct sockaddr *)&target->ia_addr;
+		rt_newaddrmsg(cmd,
+		    (struct ifaddr *)target,
+		    0, &msg_rt);
+		RTFREE(pfx_ro.ro_rt);
+	}
+	return;
+}
+
+/*
+ * Check if we have a route for the given prefix already or add one accordingly.
+ */
+static int
+in_addprefix(struct in_ifaddr *target, int flags)
+{
+	struct in_ifaddr *ia;
+	struct in_addr prefix, mask, p, m;
+	int error;
+
+	if ((flags & RTF_HOST) != 0) {
+		prefix = target->ia_dstaddr.sin_addr;
+		mask.s_addr = 0;
+	} else {
+		prefix = target->ia_addr.sin_addr;
+		mask = target->ia_sockmask.sin_addr;
+		prefix.s_addr &= mask.s_addr;
+	}
+
+	IN_IFADDR_RLOCK();
+	TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+		if (rtinitflags(ia)) {
+			p = ia->ia_addr.sin_addr;
+
+			if (prefix.s_addr != p.s_addr)
+				continue;
+		} else {
+			p = ia->ia_addr.sin_addr;
+			m = ia->ia_sockmask.sin_addr;
+			p.s_addr &= m.s_addr;
+
+			if (prefix.s_addr != p.s_addr ||
+			    mask.s_addr != m.s_addr)
+				continue;
+		}
+
+		/*
+		 * If we got a matching prefix route inserted by another
+		 * interface address, we are done here.
+		 */
+		if (ia->ia_flags & IFA_ROUTE) {
+#ifdef RADIX_MPATH
+			if (ia->ia_addr.sin_addr.s_addr ==
+			    target->ia_addr.sin_addr.s_addr) {
+				IN_IFADDR_RUNLOCK();
+				return (EEXIST);
+			} else
+				break;
+#endif
+			if (V_sameprefixcarponly &&
+			    target->ia_ifp->if_type != IFT_CARP &&
+			    ia->ia_ifp->if_type != IFT_CARP) {
+				IN_IFADDR_RUNLOCK();
+				return (EEXIST);
+			} else {
+				in_addralias_rtmsg(RTM_ADD, &prefix, target);
+				IN_IFADDR_RUNLOCK();
+				return (0);
+			}
+		}
+	}
+	IN_IFADDR_RUNLOCK();
+
+	/*
+	 * No one seems to have this prefix route, so we try to insert it.
+	 */
+	error = rtinit(&target->ia_ifa, (int)RTM_ADD, flags);
+	if (!error)
+		target->ia_flags |= IFA_ROUTE;
+	return (error);
+}
+
+extern void arp_ifscrub(struct ifnet *ifp, uint32_t addr);
+
+/*
+ * If there is no other address in the system that can serve a route to the
+ * same prefix, remove the route. Hand over the route to the new address
+ * otherwise.
+ */
+static int
+in_scrubprefix(struct in_ifaddr *target)
+{
+	struct in_ifaddr *ia;
+	struct in_addr prefix, mask, p;
+	int error = 0;
+	struct sockaddr_in prefix0, mask0;
+
+	/*
+	 * Remove the loopback route to the interface address.
+	 * The "useloopback" setting is not consulted because if the
+	 * user configures an interface address, turns off this
+	 * setting, and then tries to delete that interface address,
+	 * checking the current setting of "useloopback" would leave
+	 * that interface address loopback route untouched, which
+	 * would be wrong. Therefore the interface address loopback route
+	 * deletion is unconditional.
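+	 *
+	 * (Illustrative scenario, added: an administrator who configures
+	 * 10.0.0.1 and later disables the loopback-route setting must
+	 * still be able to delete 10.0.0.1, so the cleanup below cannot
+	 * depend on the setting's current value.)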
+	 */
+	if ((target->ia_addr.sin_addr.s_addr != INADDR_ANY) &&
+	    !(target->ia_ifp->if_flags & IFF_LOOPBACK) &&
+	    (target->ia_flags & IFA_RTSELF)) {
+		struct route ia_ro;
+		int freeit = 0;
+
+		bzero(&ia_ro, sizeof(ia_ro));
+		*((struct sockaddr_in *)(&ia_ro.ro_dst)) = target->ia_addr;
+		rtalloc_ign_fib(&ia_ro, 0, 0);
+		if ((ia_ro.ro_rt != NULL) && (ia_ro.ro_rt->rt_ifp != NULL) &&
+		    (ia_ro.ro_rt->rt_ifp == V_loif)) {
+			RT_LOCK(ia_ro.ro_rt);
+			if (ia_ro.ro_rt->rt_refcnt <= 1)
+				freeit = 1;
+			else
+				RT_REMREF(ia_ro.ro_rt);
+			RTFREE_LOCKED(ia_ro.ro_rt);
+		}
+		if (freeit)
+			error = ifa_del_loopback_route((struct ifaddr *)target,
+			    (struct sockaddr *)&target->ia_addr);
+		if (error == 0)
+			target->ia_flags &= ~IFA_RTSELF;
+		/* remove arp cache */
+		arp_ifscrub(target->ia_ifp, IA_SIN(target)->sin_addr.s_addr);
+	}
+
+	if (rtinitflags(target))
+		prefix = target->ia_dstaddr.sin_addr;
+	else {
+		prefix = target->ia_addr.sin_addr;
+		mask = target->ia_sockmask.sin_addr;
+		prefix.s_addr &= mask.s_addr;
+	}
+
+	if ((target->ia_flags & IFA_ROUTE) == 0) {
+		in_addralias_rtmsg(RTM_DELETE, &prefix, target);
+		return (0);
+	}
+
+	IN_IFADDR_RLOCK();
+	TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
+		if (rtinitflags(ia))
+			p = ia->ia_dstaddr.sin_addr;
+		else {
+			p = ia->ia_addr.sin_addr;
+			p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
+		}
+
+		if (prefix.s_addr != p.s_addr)
+			continue;
+
+		/*
+		 * If we got a matching prefix address, move IFA_ROUTE and
+		 * the route itself to it. Make sure that routing daemons
+		 * get a heads-up.
+		 *
+		 * XXX: a special case for carp(4) interface - this should
+		 * be more generally specified as an interface that
+		 * doesn't support such action.
+		 */
+		if ((ia->ia_flags & IFA_ROUTE) == 0
+		    && (ia->ia_ifp->if_type != IFT_CARP)
+		    ) {
+			IN_IFADDR_RUNLOCK();
+			rtinit(&(target->ia_ifa), (int)RTM_DELETE,
+			    rtinitflags(target));
+			target->ia_flags &= ~IFA_ROUTE;
+
+			error = rtinit(&ia->ia_ifa, (int)RTM_ADD,
+			    rtinitflags(ia) | RTF_UP);
+			if (error == 0)
+				ia->ia_flags |= IFA_ROUTE;
+			return (error);
+		}
+	}
+	IN_IFADDR_RUNLOCK();
+
+	/*
+	 * remove all L2 entries on the given prefix
+	 */
+	bzero(&prefix0, sizeof(prefix0));
+	prefix0.sin_len = sizeof(prefix0);
+	prefix0.sin_family = AF_INET;
+	prefix0.sin_addr.s_addr = target->ia_subnet;
+	bzero(&mask0, sizeof(mask0));
+	mask0.sin_len = sizeof(mask0);
+	mask0.sin_family = AF_INET;
+	mask0.sin_addr.s_addr = target->ia_subnetmask;
+	lltable_prefix_free(AF_INET, (struct sockaddr *)&prefix0,
+	    (struct sockaddr *)&mask0);
+
+	/*
+	 * As no one seems to have this prefix, we can remove the route.
+	 */
+	rtinit(&(target->ia_ifa), (int)RTM_DELETE, rtinitflags(target));
+	target->ia_flags &= ~IFA_ROUTE;
+	return (0);
+}
+
+#undef rtinitflags
+
+/*
+ * Return 1 if the address might be a local broadcast address.
+ */
+int
+in_broadcast(struct in_addr in, struct ifnet *ifp)
+{
+	register struct ifaddr *ifa;
+	u_long t;
+
+	if (in.s_addr == INADDR_BROADCAST ||
+	    in.s_addr == INADDR_ANY)
+		return (1);
+	if ((ifp->if_flags & IFF_BROADCAST) == 0)
+		return (0);
+	t = ntohl(in.s_addr);
+	/*
+	 * Look through the list of addresses for a match
+	 * with a broadcast address.
+	 */
+#define ia ((struct in_ifaddr *)ifa)
+	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
+		if (ifa->ifa_addr->sa_family == AF_INET &&
+		    (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr ||
+		     in.s_addr == ia->ia_netbroadcast.s_addr ||
+		     /*
+		      * Check for old-style (host 0) broadcast.
+		      */
+		     t == ia->ia_subnet || t == ia->ia_net) &&
+		     /*
+		      * Check for an all-ones subnetmask.
These + * only exist when an interface gets a secondary + * address. + */ + ia->ia_subnetmask != (u_long)0xffffffff) + return (1); + return (0); +#undef ia +} + +/* + * On interface removal, clean up IPv4 data structures hung off of the ifnet. + */ +void +in_ifdetach(struct ifnet *ifp) +{ + + in_pcbpurgeif0(&V_ripcbinfo, ifp); + in_pcbpurgeif0(&V_udbinfo, ifp); + in_purgemaddrs(ifp); +} + +/* + * Delete all IPv4 multicast address records, and associated link-layer + * multicast address records, associated with ifp. + * XXX It looks like domifdetach runs AFTER the link layer cleanup. + * XXX This should not race with ifma_protospec being set during + * a new allocation, if it does, we have bigger problems. + */ +static void +in_purgemaddrs(struct ifnet *ifp) +{ + LIST_HEAD(,in_multi) purgeinms; + struct in_multi *inm, *tinm; + struct ifmultiaddr *ifma; + + LIST_INIT(&purgeinms); + IN_MULTI_LOCK(); + + /* + * Extract list of in_multi associated with the detaching ifp + * which the PF_INET layer is about to release. + * We need to do this as IF_ADDR_LOCK() may be re-acquired + * by code further down. + */ + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; +#if 0 + KASSERT(ifma->ifma_protospec != NULL, + ("%s: ifma_protospec is NULL", __func__)); +#endif + inm = (struct in_multi *)ifma->ifma_protospec; + LIST_INSERT_HEAD(&purgeinms, inm, inm_link); + } + IF_ADDR_UNLOCK(ifp); + + LIST_FOREACH_SAFE(inm, &purgeinms, inm_link, tinm) { + LIST_REMOVE(inm, inm_link); + inm_release_locked(inm); + } + igmp_ifdetach(ifp); + + IN_MULTI_UNLOCK(); +} + +#include +#include + +struct in_llentry { + struct llentry base; + struct sockaddr_in l3_addr4; +}; + +static struct llentry * +in_lltable_new(const struct sockaddr *l3addr, u_int flags) +{ + struct in_llentry *lle; + + lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_DONTWAIT | M_ZERO); + if (lle == NULL) /* NB: caller generates msg */ + return NULL; + + callout_init(&lle->base.la_timer, CALLOUT_MPSAFE); + /* + * For IPv4 this will trigger "arpresolve" to generate + * an ARP request. + */ + lle->base.la_expire = time_second; /* mark expired */ + lle->l3_addr4 = *(const struct sockaddr_in *)l3addr; + lle->base.lle_refcnt = 1; + LLE_LOCK_INIT(&lle->base); + return &lle->base; +} + +/* + * Deletes an address from the address table. + * This function is called by the timer functions + * such as arptimer() and nd6_llinfo_timer(), and + * the caller does the locking. 
+ */ +static void +in_lltable_free(struct lltable *llt, struct llentry *lle) +{ + LLE_WUNLOCK(lle); + LLE_LOCK_DESTROY(lle); + free(lle, M_LLTABLE); +} + + +#define IN_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ + (((ntohl((d)->sin_addr.s_addr) ^ (a)->sin_addr.s_addr) & (m)->sin_addr.s_addr)) == 0 ) + +static void +in_lltable_prefix_free(struct lltable *llt, + const struct sockaddr *prefix, + const struct sockaddr *mask) +{ + const struct sockaddr_in *pfx = (const struct sockaddr_in *)prefix; + const struct sockaddr_in *msk = (const struct sockaddr_in *)mask; + struct llentry *lle, *next; + register int i; + + for (i=0; i < LLTBL_HASHTBL_SIZE; i++) { + LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { + + if (IN_ARE_MASKED_ADDR_EQUAL((struct sockaddr_in *)L3_ADDR(lle), + pfx, msk)) { + int canceled; + + canceled = callout_drain(&lle->la_timer); + LLE_WLOCK(lle); + if (canceled) + LLE_REMREF(lle); + llentry_free(lle); + } + } + } +} + + +static int +in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) +{ + struct rtentry *rt; + + KASSERT(l3addr->sa_family == AF_INET, + ("sin_family %d", l3addr->sa_family)); + + /* XXX rtalloc1 should take a const param */ + rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0); + if (rt == NULL || (!(flags & LLE_PUB) && + ((rt->rt_flags & RTF_GATEWAY) || + (rt->rt_ifp != ifp)))) { +#ifdef DIAGNOSTIC + log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n", + inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr)); +#endif + if (rt != NULL) + RTFREE_LOCKED(rt); + return (EINVAL); + } + RTFREE_LOCKED(rt); + return 0; +} + +/* + * Return NULL if not found or marked for deletion. + * If found return lle read locked. + */ +static struct llentry * +in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; + struct ifnet *ifp = llt->llt_ifp; + struct llentry *lle; + struct llentries *lleh; + u_int hashkey; + + IF_AFDATA_LOCK_ASSERT(ifp); + KASSERT(l3addr->sa_family == AF_INET, + ("sin_family %d", l3addr->sa_family)); + + hashkey = sin->sin_addr.s_addr; + lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; + LIST_FOREACH(lle, lleh, lle_next) { + struct sockaddr_in *sa2 = (struct sockaddr_in *)L3_ADDR(lle); + if (lle->la_flags & LLE_DELETED) + continue; + if (sa2->sin_addr.s_addr == sin->sin_addr.s_addr) + break; + } + if (lle == NULL) { +#ifdef DIAGNOSTIC + if (flags & LLE_DELETE) + log(LOG_INFO, "interface address is missing from cache = %p in delete\n", lle); +#endif + if (!(flags & LLE_CREATE)) + return (NULL); + /* + * A route that covers the given address must have + * been installed 1st because we are doing a resolution, + * verify this. 
+ */ + if (!(flags & LLE_IFADDR) && + in_lltable_rtcheck(ifp, flags, l3addr) != 0) + goto done; + + lle = in_lltable_new(l3addr, flags); + if (lle == NULL) { + log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); + goto done; + } + lle->la_flags = flags & ~LLE_CREATE; + if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { + bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); + lle->la_flags |= (LLE_VALID | LLE_STATIC); + } + + lle->lle_tbl = llt; + lle->lle_head = lleh; + LIST_INSERT_HEAD(lleh, lle, lle_next); + } else if (flags & LLE_DELETE) { + if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { + LLE_WLOCK(lle); + lle->la_flags = LLE_DELETED; + LLE_WUNLOCK(lle); +#ifdef DIAGNOSTIC + log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); +#endif + } + lle = (void *)-1; + + } + if (LLE_IS_VALID(lle)) { + if (flags & LLE_EXCLUSIVE) + LLE_WLOCK(lle); + else + LLE_RLOCK(lle); + } +done: + return (lle); +} + +static int +in_lltable_dump(struct lltable *llt, struct sysctl_req *wr) +{ +#define SIN(lle) ((struct sockaddr_in *) L3_ADDR(lle)) + struct ifnet *ifp = llt->llt_ifp; + struct llentry *lle; + /* XXX stack use */ + struct { + struct rt_msghdr rtm; + struct sockaddr_inarp sin; + struct sockaddr_dl sdl; + } arpc; + int error, i; + + LLTABLE_LOCK_ASSERT(); + + error = 0; + for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { + LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { + struct sockaddr_dl *sdl; + + /* skip deleted entries */ + if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) + continue; + /* Skip if jailed and not a valid IP of the prison. */ + if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) + continue; + /* + * produce a msg made of: + * struct rt_msghdr; + * struct sockaddr_inarp; (IPv4) + * struct sockaddr_dl; + */ + bzero(&arpc, sizeof(arpc)); + arpc.rtm.rtm_msglen = sizeof(arpc); + arpc.rtm.rtm_version = RTM_VERSION; + arpc.rtm.rtm_type = RTM_GET; + arpc.rtm.rtm_flags = RTF_UP; + arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; + arpc.sin.sin_family = AF_INET; + arpc.sin.sin_len = sizeof(arpc.sin); + arpc.sin.sin_addr.s_addr = SIN(lle)->sin_addr.s_addr; + + /* publish */ + if (lle->la_flags & LLE_PUB) { + arpc.rtm.rtm_flags |= RTF_ANNOUNCE; + /* proxy only */ + if (lle->la_flags & LLE_PROXY) + arpc.sin.sin_other = SIN_PROXY; + } + + sdl = &arpc.sdl; + sdl->sdl_family = AF_LINK; + sdl->sdl_len = sizeof(*sdl); + sdl->sdl_index = ifp->if_index; + sdl->sdl_type = ifp->if_type; + if ((lle->la_flags & LLE_VALID) == LLE_VALID) { + sdl->sdl_alen = ifp->if_addrlen; + bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); + } else { + sdl->sdl_alen = 0; + bzero(LLADDR(sdl), ifp->if_addrlen); + } + + arpc.rtm.rtm_rmx.rmx_expire = + lle->la_flags & LLE_STATIC ? 
0 : lle->la_expire; + arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); + if (lle->la_flags & LLE_STATIC) + arpc.rtm.rtm_flags |= RTF_STATIC; + arpc.rtm.rtm_index = ifp->if_index; + error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); + if (error) + break; + } + } + return error; +#undef SIN +} + +void * +in_domifattach(struct ifnet *ifp) +{ + struct in_ifinfo *ii; + struct lltable *llt; + + ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); + + llt = lltable_init(ifp, AF_INET); + if (llt != NULL) { + llt->llt_new = in_lltable_new; + llt->llt_free = in_lltable_free; + llt->llt_prefix_free = in_lltable_prefix_free; + llt->llt_rtcheck = in_lltable_rtcheck; + llt->llt_lookup = in_lltable_lookup; + llt->llt_dump = in_lltable_dump; + } + ii->ii_llt = llt; + + ii->ii_igmp = igmp_domifattach(ifp); + + return ii; +} + +void +in_domifdetach(struct ifnet *ifp, void *aux) +{ + struct in_ifinfo *ii = (struct in_ifinfo *)aux; + + igmp_domifdetach(ifp); + lltable_free(ii->ii_llt); + free(ii, M_IFADDR); +} diff --git a/freebsd/sys/netinet/in.h b/freebsd/sys/netinet/in.h new file mode 100644 index 00000000..73c7ca1a --- /dev/null +++ b/freebsd/sys/netinet/in.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/in_gif.c b/freebsd/sys/netinet/in_gif.c new file mode 100644 index 00000000..3613e214 --- /dev/null +++ b/freebsd/sys/netinet/in_gif.c @@ -0,0 +1,469 @@ +#include + +/* $KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef INET6
+#include
+#endif
+
+#ifdef MROUTING
+#include
+#endif /* MROUTING */
+
+#include
+
+static int gif_validate4(const struct ip *, struct gif_softc *,
+    struct ifnet *);
+
+extern struct domain inetdomain;
+struct protosw in_gif_protosw = {
+	.pr_type =	SOCK_RAW,
+	.pr_domain =	&inetdomain,
+	.pr_protocol =	0/* IPPROTO_IPV[46] */,
+	.pr_flags =	PR_ATOMIC|PR_ADDR,
+	.pr_input =	in_gif_input,
+	.pr_output =	(pr_output_t*)rip_output,
+	.pr_ctloutput =	rip_ctloutput,
+	.pr_usrreqs =	&rip_usrreqs
+};
+
+VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
+#define	V_ip_gif_ttl	VNET(ip_gif_ttl)
+SYSCTL_VNET_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW,
+    &VNET_NAME(ip_gif_ttl), 0, "");
+
+int
+in_gif_output(struct ifnet *ifp, int family, struct mbuf *m)
+{
+	struct gif_softc *sc = ifp->if_softc;
+	struct sockaddr_in *dst = (struct sockaddr_in *)&sc->gif_ro.ro_dst;
+	struct sockaddr_in *sin_src = (struct sockaddr_in *)sc->gif_psrc;
+	struct sockaddr_in *sin_dst = (struct sockaddr_in *)sc->gif_pdst;
+	struct ip iphdr;	/* capsule IP header, host byte ordered */
+	struct etherip_header eiphdr;
+	int error, len, proto;
+	u_int8_t tos;
+
+	GIF_LOCK_ASSERT(sc);
+
+	if (sin_src == NULL || sin_dst == NULL ||
+	    sin_src->sin_family != AF_INET ||
+	    sin_dst->sin_family != AF_INET) {
+		m_freem(m);
+		return EAFNOSUPPORT;
+	}
+
+	switch (family) {
+#ifdef INET
+	case AF_INET:
+	    {
+		struct ip *ip;
+
+		proto = IPPROTO_IPV4;
+		if (m->m_len < sizeof(*ip)) {
+			m = m_pullup(m, sizeof(*ip));
+			if (!m)
+				return ENOBUFS;
+		}
+		ip = mtod(m, struct ip *);
+		tos = ip->ip_tos;
+		break;
+	    }
+#endif /* INET */
+#ifdef INET6
+	case AF_INET6:
+	    {
+		struct ip6_hdr *ip6;
+		proto = IPPROTO_IPV6;
+		if (m->m_len < sizeof(*ip6)) {
+			m = m_pullup(m, sizeof(*ip6));
+			if (!m)
+				return ENOBUFS;
+		}
+		ip6 = mtod(m, struct ip6_hdr *);
+		tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+		break;
+	    }
+#endif /* INET6 */
+	case AF_LINK:
+		proto = IPPROTO_ETHERIP;
+
+		/*
+		 * GIF_SEND_REVETHIP (disabled by default) intentionally
+		 * sends an EtherIP packet with a reversed version field in
+		 * the header. This is a knob for backward compatibility
+		 * with FreeBSD 7.2R or prior.
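+		 *
+		 * (Clarifying example, added: ETHERIP_VERSION is normally
+		 * written to eip_ver; with the knob set it is written to
+		 * eip_resvl instead, reproducing the swapped on-wire field
+		 * layout those older releases emitted and expected.)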
+ */ + if ((sc->gif_options & GIF_SEND_REVETHIP)) { + eiphdr.eip_ver = 0; + eiphdr.eip_resvl = ETHERIP_VERSION; + eiphdr.eip_resvh = 0; + } else { + eiphdr.eip_ver = ETHERIP_VERSION; + eiphdr.eip_resvl = 0; + eiphdr.eip_resvh = 0; + } + /* prepend Ethernet-in-IP header */ + M_PREPEND(m, sizeof(struct etherip_header), M_DONTWAIT); + if (m && m->m_len < sizeof(struct etherip_header)) + m = m_pullup(m, sizeof(struct etherip_header)); + if (m == NULL) + return ENOBUFS; + bcopy(&eiphdr, mtod(m, struct etherip_header *), + sizeof(struct etherip_header)); + break; + + default: +#ifdef DEBUG + printf("in_gif_output: warning: unknown family %d passed\n", + family); +#endif + m_freem(m); + return EAFNOSUPPORT; + } + + bzero(&iphdr, sizeof(iphdr)); + iphdr.ip_src = sin_src->sin_addr; + /* bidirectional configured tunnel mode */ + if (sin_dst->sin_addr.s_addr != INADDR_ANY) + iphdr.ip_dst = sin_dst->sin_addr; + else { + m_freem(m); + return ENETUNREACH; + } + iphdr.ip_p = proto; + /* version will be set in ip_output() */ + iphdr.ip_ttl = V_ip_gif_ttl; + iphdr.ip_len = m->m_pkthdr.len + sizeof(struct ip); + ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, + &iphdr.ip_tos, &tos); + + /* prepend new IP header */ + len = sizeof(struct ip); +#ifndef __NO_STRICT_ALIGNMENT + if (family == AF_LINK) + len += ETHERIP_ALIGN; +#endif + M_PREPEND(m, len, M_DONTWAIT); + if (m != NULL && m->m_len < len) + m = m_pullup(m, len); + if (m == NULL) { + printf("ENOBUFS in in_gif_output %d\n", __LINE__); + return ENOBUFS; + } +#ifndef __NO_STRICT_ALIGNMENT + if (family == AF_LINK) { + len = mtod(m, vm_offset_t) & 3; + KASSERT(len == 0 || len == ETHERIP_ALIGN, + ("in_gif_output: unexpected misalignment")); + m->m_data += len; + m->m_len -= ETHERIP_ALIGN; + } +#endif + bcopy(&iphdr, mtod(m, struct ip *), sizeof(struct ip)); + + M_SETFIB(m, sc->gif_fibnum); + + if (dst->sin_family != sin_dst->sin_family || + dst->sin_addr.s_addr != sin_dst->sin_addr.s_addr) { + /* cache route doesn't match */ + bzero(dst, sizeof(*dst)); + dst->sin_family = sin_dst->sin_family; + dst->sin_len = sizeof(struct sockaddr_in); + dst->sin_addr = sin_dst->sin_addr; + if (sc->gif_ro.ro_rt) { + RTFREE(sc->gif_ro.ro_rt); + sc->gif_ro.ro_rt = NULL; + } +#if 0 + GIF2IFP(sc)->if_mtu = GIF_MTU; +#endif + } + + if (sc->gif_ro.ro_rt == NULL) { + in_rtalloc_ign(&sc->gif_ro, 0, sc->gif_fibnum); + if (sc->gif_ro.ro_rt == NULL) { + m_freem(m); + return ENETUNREACH; + } + + /* if it constitutes infinite encapsulation, punt. 
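+		 * (e.g., a hypothetical misconfiguration where gif0's tunnel
+		 * destination is itself only reachable via gif0: each
+		 * encapsulated packet would come straight back for another
+		 * layer of encapsulation, so the rt_ifp == ifp test below
+		 * rejects the direct self-reference.)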
*/ + if (sc->gif_ro.ro_rt->rt_ifp == ifp) { + m_freem(m); + return ENETUNREACH; /* XXX */ + } +#if 0 + ifp->if_mtu = sc->gif_ro.ro_rt->rt_ifp->if_mtu + - sizeof(struct ip); +#endif + } + + error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); + + if (!(GIF2IFP(sc)->if_flags & IFF_LINK0) && + sc->gif_ro.ro_rt != NULL) { + RTFREE(sc->gif_ro.ro_rt); + sc->gif_ro.ro_rt = NULL; + } + + return (error); +} + +void +in_gif_input(struct mbuf *m, int off) +{ + struct ifnet *gifp = NULL; + struct gif_softc *sc; + struct ip *ip; + int af; + u_int8_t otos; + int proto; + + ip = mtod(m, struct ip *); + proto = ip->ip_p; + + sc = (struct gif_softc *)encap_getarg(m); + if (sc == NULL) { + m_freem(m); + KMOD_IPSTAT_INC(ips_nogif); + return; + } + + gifp = GIF2IFP(sc); + if (gifp == NULL || (gifp->if_flags & IFF_UP) == 0) { + m_freem(m); + KMOD_IPSTAT_INC(ips_nogif); + return; + } + + otos = ip->ip_tos; + m_adj(m, off); + + switch (proto) { +#ifdef INET + case IPPROTO_IPV4: + { + struct ip *ip; + af = AF_INET; + if (m->m_len < sizeof(*ip)) { + m = m_pullup(m, sizeof(*ip)); + if (!m) + return; + } + ip = mtod(m, struct ip *); + if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? + ECN_ALLOWED : ECN_NOCARE, + &otos, &ip->ip_tos) == 0) { + m_freem(m); + return; + } + break; + } +#endif +#ifdef INET6 + case IPPROTO_IPV6: + { + struct ip6_hdr *ip6; + u_int8_t itos, oitos; + + af = AF_INET6; + if (m->m_len < sizeof(*ip6)) { + m = m_pullup(m, sizeof(*ip6)); + if (!m) + return; + } + ip6 = mtod(m, struct ip6_hdr *); + itos = oitos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + if (ip_ecn_egress((gifp->if_flags & IFF_LINK1) ? + ECN_ALLOWED : ECN_NOCARE, + &otos, &itos) == 0) { + m_freem(m); + return; + } + if (itos != oitos) { + ip6->ip6_flow &= ~htonl(0xff << 20); + ip6->ip6_flow |= htonl((u_int32_t)itos << 20); + } + break; + } +#endif /* INET6 */ + case IPPROTO_ETHERIP: + af = AF_LINK; + break; + + default: + KMOD_IPSTAT_INC(ips_nogif); + m_freem(m); + return; + } + gif_input(m, af, gifp); + return; +} + +/* + * validate outer address. + */ +static int +gif_validate4(const struct ip *ip, struct gif_softc *sc, struct ifnet *ifp) +{ + struct sockaddr_in *src, *dst; + struct in_ifaddr *ia4; + + src = (struct sockaddr_in *)sc->gif_psrc; + dst = (struct sockaddr_in *)sc->gif_pdst; + + /* check for address match */ + if (src->sin_addr.s_addr != ip->ip_dst.s_addr || + dst->sin_addr.s_addr != ip->ip_src.s_addr) + return 0; + + /* martian filters on outer source - NOT done in ip_input! */ + if (IN_MULTICAST(ntohl(ip->ip_src.s_addr))) + return 0; + switch ((ntohl(ip->ip_src.s_addr) & 0xff000000) >> 24) { + case 0: case 127: case 255: + return 0; + } + + /* reject packets with broadcast on source */ + /* XXXRW: should use hash lists? 
*/ + IN_IFADDR_RLOCK(); + TAILQ_FOREACH(ia4, &V_in_ifaddrhead, ia_link) { + if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) + continue; + if (ip->ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + IN_IFADDR_RUNLOCK(); + return 0; + } + } + IN_IFADDR_RUNLOCK(); + + /* ingress filters on outer source */ + if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0 && ifp) { + struct sockaddr_in sin; + struct rtentry *rt; + + bzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(struct sockaddr_in); + sin.sin_addr = ip->ip_src; + /* XXX MRT check for the interface we would use on output */ + rt = in_rtalloc1((struct sockaddr *)&sin, 0, + 0UL, sc->gif_fibnum); + if (!rt || rt->rt_ifp != ifp) { +#if 0 + log(LOG_WARNING, "%s: packet from 0x%x dropped " + "due to ingress filter\n", if_name(GIF2IFP(sc)), + (u_int32_t)ntohl(sin.sin_addr.s_addr)); +#endif + if (rt) + RTFREE_LOCKED(rt); + return 0; + } + RTFREE_LOCKED(rt); + } + + return 32 * 2; +} + +/* + * we know that we are in IFF_UP, outer address available, and outer family + * matched the physical addr family. see gif_encapcheck(). + */ +int +gif_encapcheck4(const struct mbuf *m, int off, int proto, void *arg) +{ + struct ip ip; + struct gif_softc *sc; + struct ifnet *ifp; + + /* sanity check done in caller */ + sc = (struct gif_softc *)arg; + + /* LINTED const cast */ + m_copydata(m, 0, sizeof(ip), (caddr_t)&ip); + ifp = ((m->m_flags & M_PKTHDR) != 0) ? m->m_pkthdr.rcvif : NULL; + + return gif_validate4(&ip, sc, ifp); +} + +int +in_gif_attach(struct gif_softc *sc) +{ + sc->encap_cookie4 = encap_attach_func(AF_INET, -1, gif_encapcheck, + &in_gif_protosw, sc); + if (sc->encap_cookie4 == NULL) + return EEXIST; + return 0; +} + +int +in_gif_detach(struct gif_softc *sc) +{ + int error; + + error = encap_detach(sc->encap_cookie4); + if (error == 0) + sc->encap_cookie4 = NULL; + return error; +} diff --git a/freebsd/sys/netinet/in_gif.h b/freebsd/sys/netinet/in_gif.h new file mode 100644 index 00000000..1e42b01f --- /dev/null +++ b/freebsd/sys/netinet/in_gif.h @@ -0,0 +1,45 @@ +/* $FreeBSD$ */ +/* $KAME: in_gif.h,v 1.5 2000/04/14 08:36:02 itojun Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _NETINET_IN_GIF_HH_ +#define _NETINET_IN_GIF_HH_ + +#define GIF_TTL 30 + +struct gif_softc; +void in_gif_input(struct mbuf *, int); +int in_gif_output(struct ifnet *, int, struct mbuf *); +int gif_encapcheck4(const struct mbuf *, int, int, void *); +int in_gif_attach(struct gif_softc *); +int in_gif_detach(struct gif_softc *); + +#endif /*_NETINET_IN_GIF_HH_*/ diff --git a/freebsd/sys/netinet/in_mcast.c b/freebsd/sys/netinet/in_mcast.c new file mode 100644 index 00000000..ed2bcc12 --- /dev/null +++ b/freebsd/sys/netinet/in_mcast.c @@ -0,0 +1,2902 @@ +#include + +/*- + * Copyright (c) 2007-2009 Bruce Simpson. + * Copyright (c) 2005 Robert N. M. Watson. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * IPv4 multicast socket, group, and socket option processing module. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef KTR_IGMPV3 +#define KTR_IGMPV3 KTR_INET +#endif + +#ifndef __SOCKUNION_DECLARED +union sockunion { + struct sockaddr_storage ss; + struct sockaddr sa; + struct sockaddr_dl sdl; + struct sockaddr_in sin; +}; +typedef union sockunion sockunion_t; +#define __SOCKUNION_DECLARED +#endif /* __SOCKUNION_DECLARED */ + +static MALLOC_DEFINE(M_INMFILTER, "in_mfilter", + "IPv4 multicast PCB-layer source filter"); +static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group"); +static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options"); +static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource", + "IPv4 multicast IGMP-layer source filter"); + +/* + * Locking: + * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK. + * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however + * it can be taken by code in net/if.c also. + * - ip_moptions and in_mfilter are covered by the INP_WLOCK. + * + * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly + * any need for in_multi itself to be virtualized -- it is bound to an ifp + * anyway no matter what happens. + */ +struct mtx in_multi_mtx; +MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF); + +/* + * Functions with non-static linkage defined in this file should be + * declared in in_var.h: + * imo_multi_filter() + * in_addmulti() + * in_delmulti() + * in_joingroup() + * in_joingroup_locked() + * in_leavegroup() + * in_leavegroup_locked() + * and ip_var.h: + * inp_freemoptions() + * inp_getmoptions() + * inp_setmoptions() + * + * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() + * and in_delmulti(). 
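+ *
+ * For reference (editor's sketch, hypothetical kernel caller), the
+ * legacy (*,G) KPI pair is used as:
+ *
+ *        struct in_multi *inm = in_addmulti(&addr, ifp);
+ *        ...
+ *        if (inm != NULL)
+ *                in_delmulti(inm);
+ *
+ * where addr must be a 224.0.0.0/24 link-scope group; see in_addmulti()
+ * below.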
+ */ +static void imf_commit(struct in_mfilter *); +static int imf_get_source(struct in_mfilter *imf, + const struct sockaddr_in *psin, + struct in_msource **); +static struct in_msource * + imf_graft(struct in_mfilter *, const uint8_t, + const struct sockaddr_in *); +static void imf_leave(struct in_mfilter *); +static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); +static void imf_purge(struct in_mfilter *); +static void imf_rollback(struct in_mfilter *); +static void imf_reap(struct in_mfilter *); +static int imo_grow(struct ip_moptions *); +static size_t imo_match_group(const struct ip_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in_msource * + imo_match_source(const struct ip_moptions *, const size_t, + const struct sockaddr *); +static void ims_merge(struct ip_msource *ims, + const struct in_msource *lims, const int rollback); +static int in_getmulti(struct ifnet *, const struct in_addr *, + struct in_multi **); +static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims); +static int inm_is_ifp_detached(const struct in_multi *); +static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); +static void inm_purge(struct in_multi *); +static void inm_reap(struct in_multi *); +static struct ip_moptions * + inp_findmoptions(struct inpcb *); +static int inp_get_source_filters(struct inpcb *, struct sockopt *); +static int inp_join_group(struct inpcb *, struct sockopt *); +static int inp_leave_group(struct inpcb *, struct sockopt *); +static struct ifnet * + inp_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in *, const struct in_addr); +static int inp_block_unblock_source(struct inpcb *, struct sockopt *); +static int inp_set_multicast_if(struct inpcb *, struct sockopt *); +static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS); + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0, "IPv4 multicast"); + +static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; +SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxgrpsrc, 0, + "Max source filters per group"); +TUNABLE_ULONG("net.inet.ip.mcast.maxgrpsrc", &in_mcast_maxgrpsrc); + +static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; +SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_TUN, &in_mcast_maxsocksrc, 0, + "Max source filters per socket"); +TUNABLE_ULONG("net.inet.ip.mcast.maxsocksrc", &in_mcast_maxsocksrc); + +int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; +SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_TUN, + &in_mcast_loop, 0, "Loopback multicast datagrams by default"); +TUNABLE_INT("net.inet.ip.mcast.loop", &in_mcast_loop); + +SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters, + "Per-interface stack-wide source filters"); + +/* + * Inline function which wraps assertions for a valid ifp. + * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp + * is detached. + */ +static int __inline +inm_is_ifp_detached(const struct in_multi *inm) +{ + struct ifnet *ifp; + + KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__)); + ifp = inm->inm_ifma->ifma_ifp; + if (ifp != NULL) { + /* + * Sanity check that netinet's notion of ifp is the + * same as net's. 
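+ * (Editor's note: callers use the result as a guard before touching
+ * inm_ifp; in_leavegroup_locked() below does, in sketch form,
+ *
+ *        inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname
+ *
+ * so a detached membership is never dereferenced.)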
+ */ + KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__)); + } + + return (ifp == NULL); +} + +/* + * Initialize an in_mfilter structure to a known state at t0, t1 + * with an empty source filter list. + */ +static __inline void +imf_init(struct in_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in_mfilter)); + RB_INIT(&imf->imf_sources); + imf->imf_st[0] = st0; + imf->imf_st[1] = st1; +} + +/* + * Resize the ip_moptions vector to the next power-of-two minus 1. + * May be called with locks held; do not sleep. + */ +static int +imo_grow(struct ip_moptions *imo) +{ + struct in_multi **nmships; + struct in_multi **omships; + struct in_mfilter *nmfilters; + struct in_mfilter *omfilters; + size_t idx; + size_t newmax; + size_t oldmax; + + nmships = NULL; + nmfilters = NULL; + omships = imo->imo_membership; + omfilters = imo->imo_mfilters; + oldmax = imo->imo_max_memberships; + newmax = ((oldmax + 1) * 2) - 1; + + if (newmax <= IP_MAX_MEMBERSHIPS) { + nmships = (struct in_multi **)realloc(omships, + sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT); + nmfilters = (struct in_mfilter *)realloc(omfilters, + sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT); + if (nmships != NULL && nmfilters != NULL) { + /* Initialize newly allocated source filter heads. */ + for (idx = oldmax; idx < newmax; idx++) { + imf_init(&nmfilters[idx], MCAST_UNDEFINED, + MCAST_EXCLUDE); + } + imo->imo_max_memberships = newmax; + imo->imo_membership = nmships; + imo->imo_mfilters = nmfilters; + } + } + + if (nmships == NULL || nmfilters == NULL) { + if (nmships != NULL) + free(nmships, M_IPMOPTS); + if (nmfilters != NULL) + free(nmfilters, M_INMFILTER); + return (ETOOMANYREFS); + } + + return (0); +} + +/* + * Find an IPv4 multicast group entry for this ip_moptions instance + * which matches the specified group, and optionally an interface. + * Return its index into the array, or -1 if not found. + */ +static size_t +imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) +{ + const struct sockaddr_in *gsin; + struct in_multi **pinm; + int idx; + int nmships; + + gsin = (const struct sockaddr_in *)group; + + /* The imo_membership array may be lazy allocated. */ + if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) + return (-1); + + nmships = imo->imo_num_memberships; + pinm = &imo->imo_membership[0]; + for (idx = 0; idx < nmships; idx++, pinm++) { + if (*pinm == NULL) + continue; + if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) && + in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) { + break; + } + } + if (idx >= nmships) + idx = -1; + + return (idx); +} + +/* + * Find an IPv4 multicast source entry for this imo which matches + * the given group index for this socket, and source address. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. + */ +static struct in_msource * +imo_match_source(const struct ip_moptions *imo, const size_t gidx, + const struct sockaddr *src) +{ + struct ip_msource find; + struct in_mfilter *imf; + struct ip_msource *ims; + const sockunion_t *psa; + + KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__)); + KASSERT(gidx != -1 && gidx < imo->imo_num_memberships, + ("%s: invalid index %d\n", __func__, (int)gidx)); + + /* The imo_mfilters array may be lazy allocated. */ + if (imo->imo_mfilters == NULL) + return (NULL); + imf = &imo->imo_mfilters[gidx]; + + /* Source trees are keyed in host byte order. 
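+ * (Editor's worked example: source 10.0.0.1 is stored under the
+ * host-order key 0x0a000001, so the lookup below must pass
+ * ntohl(psa->sin.sin_addr.s_addr) as the key.)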
*/ + psa = (const sockunion_t *)src; + find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + + return ((struct in_msource *)ims); +} + +/* + * Perform filtering for multicast datagrams on a socket by group and source. + * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. + */ +int +imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group, const struct sockaddr *src) +{ + size_t gidx; + struct in_msource *ims; + int mode; + + KASSERT(ifp != NULL, ("%s: null ifp", __func__)); + + gidx = imo_match_group(imo, ifp, group); + if (gidx == -1) + return (MCAST_NOTGMEMBER); + + /* + * Check if the source was included in an (S,G) join. + * Allow reception on exclusive memberships by default, + * reject reception on inclusive memberships by default. + * Exclude source only if an in-mode exclude filter exists. + * Include source only if an in-mode include filter exists. + * NOTE: We are comparing group state here at IGMP t1 (now) + * with socket-layer t0 (since last downcall). + */ + mode = imo->imo_mfilters[gidx].imf_st[1]; + ims = imo_match_source(imo, gidx, src); + + if ((ims == NULL && mode == MCAST_INCLUDE) || + (ims != NULL && ims->imsl_st[0] != mode)) + return (MCAST_NOTSMEMBER); + + return (MCAST_PASS); +} + +/* + * Find and return a reference to an in_multi record for (ifp, group), + * and bump its reference count. + * If one does not exist, try to allocate it, and update link-layer multicast + * filters on ifp to listen for group. + * Assumes the IN_MULTI lock is held across the call. + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +in_getmulti(struct ifnet *ifp, const struct in_addr *group, + struct in_multi **pinm) +{ + struct sockaddr_in gsin; + struct ifmultiaddr *ifma; + struct in_ifinfo *ii; + struct in_multi *inm; + int error; + + IN_MULTI_LOCK_ASSERT(); + + ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET]; + + inm = inm_lookup(ifp, *group); + if (inm != NULL) { + /* + * If we already joined this group, just bump the + * refcount and return it. + */ + KASSERT(inm->inm_refcount >= 1, + ("%s: bad refcount %d", __func__, inm->inm_refcount)); + ++inm->inm_refcount; + *pinm = inm; + return (0); + } + + memset(&gsin, 0, sizeof(gsin)); + gsin.sin_family = AF_INET; + gsin.sin_len = sizeof(struct sockaddr_in); + gsin.sin_addr = *group; + + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. + */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); + if (error != 0) + return (error); + + /* XXX ifma_protospec must be covered by IF_ADDR_LOCK */ + IF_ADDR_LOCK(ifp); + + /* + * If something other than netinet is occupying the link-layer + * group, print a meaningful error message and back out of + * the allocation. + * Otherwise, bump the refcount on the existing network-layer + * group association and return it. 
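+ * (Editor's summary of the two outcomes below:
+ *
+ *        ifma->ifma_protospec != NULL  ->  ++inm->inm_refcount, done
+ *        ifma->ifma_protospec == NULL  ->  allocate a fresh in_multi
+ *
+ * The INVARIANTS block cross-checks ifma_addr, inm_ifma, inm_ifp and
+ * inm_addr before the refcount is bumped.)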
+ */ + if (ifma->ifma_protospec != NULL) { + inm = (struct in_multi *)ifma->ifma_protospec; +#ifdef INVARIANTS + KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr", + __func__)); + KASSERT(ifma->ifma_addr->sa_family == AF_INET, + ("%s: ifma not AF_INET", __func__)); + KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__)); + if (inm->inm_ifma != ifma || inm->inm_ifp != ifp || + !in_hosteq(inm->inm_addr, *group)) + panic("%s: ifma %p is inconsistent with %p (%s)", + __func__, ifma, inm, inet_ntoa(*group)); +#endif + ++inm->inm_refcount; + *pinm = inm; + IF_ADDR_UNLOCK(ifp); + return (0); + } + + IF_ADDR_LOCK_ASSERT(ifp); + + /* + * A new in_multi record is needed; allocate and initialize it. + * We DO NOT perform an IGMP join as the in_ layer may need to + * push an initial source list down to IGMP to support SSM. + * + * The initial source filter state is INCLUDE, {} as per the RFC. + */ + inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO); + if (inm == NULL) { + if_delmulti_ifma(ifma); + IF_ADDR_UNLOCK(ifp); + return (ENOMEM); + } + inm->inm_addr = *group; + inm->inm_ifp = ifp; + inm->inm_igi = ii->ii_igmp; + inm->inm_ifma = ifma; + inm->inm_refcount = 1; + inm->inm_state = IGMP_NOT_MEMBER; + + /* + * Pending state-changes per group are subject to a bounds check. + */ + IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES); + + inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->inm_srcs); + + ifma->ifma_protospec = inm; + + *pinm = inm; + + IF_ADDR_UNLOCK(ifp); + return (0); +} + +/* + * Drop a reference to an in_multi record. + * + * If the refcount drops to 0, free the in_multi record and + * delete the underlying link-layer membership. + */ +void +inm_release_locked(struct in_multi *inm) +{ + struct ifmultiaddr *ifma; + + IN_MULTI_LOCK_ASSERT(); + + CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount); + + if (--inm->inm_refcount > 0) { + CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__, + inm->inm_refcount); + return; + } + + CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm); + + ifma = inm->inm_ifma; + + /* XXX this access is not covered by IF_ADDR_LOCK */ + CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma); + KASSERT(ifma->ifma_protospec == inm, + ("%s: ifma_protospec != inm", __func__)); + ifma->ifma_protospec = NULL; + + inm_purge(inm); + + free(inm, M_IPMADDR); + + if_delmulti_ifma(ifma); +} + +/* + * Clear recorded source entries for a group. + * Used by the IGMP code. Caller must hold the IN_MULTI lock. + * FIXME: Should reap. + */ +void +inm_clear_recorded(struct in_multi *inm) +{ + struct ip_msource *ims; + + IN_MULTI_LOCK_ASSERT(); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + if (ims->ims_stp) { + ims->ims_stp = 0; + --inm->inm_st[1].iss_rec; + } + } + KASSERT(inm->inm_st[1].iss_rec == 0, + ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec)); +} + +/* + * Record a source as pending for a Source-Group IGMPv3 query. + * This lives here as it modifies the shared tree. + * + * inm is the group descriptor. + * naddr is the address of the source to record in network-byte order. + * + * If the net.inet.igmp.sgalloc sysctl is non-zero, we will + * lazy-allocate a source node in response to an SG query. + * Otherwise, no allocation is performed. This saves some memory + * with the trade-off that the source will not be reported to the + * router if joined in the window between the query response and + * the group actually being joined on the local host. 
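+ *
+ * Usage sketch from the IGMP layer (editor's illustration; the packet
+ * field shown is hypothetical):
+ *
+ *        retval = inm_record_source(inm, ip->ip_src.s_addr);
+ *        if (retval < 0)
+ *                ...        (-ENOSPC or -ENOMEM: source not recorded)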
+ * + * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed. + * This turns off the allocation of a recorded source entry if + * the group has not been joined. + * + * Return 0 if the source didn't exist or was already marked as recorded. + * Return 1 if the source was marked as recorded by this function. + * Return <0 if any error occurred (negated errno code). + */ +int +inm_record_source(struct in_multi *inm, const in_addr_t naddr) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; + + IN_MULTI_LOCK_ASSERT(); + + find.ims_haddr = ntohl(naddr); + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims && ims->ims_stp) + return (0); + if (ims == NULL) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (-ENOSPC); + nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (-ENOMEM); + nims->ims_haddr = find.ims_haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; + } + + /* + * Mark the source as recorded and update the recorded + * source count. + */ + ++ims->ims_stp; + ++inm->inm_st[1].iss_rec; + + return (1); +} + +/* + * Return a pointer to an in_msource owned by an in_mfilter, + * given its source address. + * Lazy-allocate if needed. If this is a new entry, its filter state is + * undefined at t0. + * + * imf is the filter set being modified. + * psin is the source address; the tree key is kept in *host* byte-order. + * + * SMPng: May be called with locks held; malloc must not block. + */ +static int +imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin, + struct in_msource **plims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; + struct in_msource *lims; + int error; + + error = 0; + ims = NULL; + lims = NULL; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + lims = (struct in_msource *)ims; + if (lims == NULL) { + if (imf->imf_nsrc == in_mcast_maxsocksrc) + return (ENOSPC); + nims = malloc(sizeof(struct in_msource), M_INMFILTER, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (ENOMEM); + lims = (struct in_msource *)nims; + lims->ims_haddr = find.ims_haddr; + lims->imsl_st[0] = MCAST_UNDEFINED; + RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); + ++imf->imf_nsrc; + } + + *plims = lims; + + return (error); +} + +/* + * Graft a source entry into an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being in the new filter mode at t1. + * + * Return the pointer to the new node, otherwise return NULL. + */ +static struct in_msource * +imf_graft(struct in_mfilter *imf, const uint8_t st1, + const struct sockaddr_in *psin) +{ + struct ip_msource *nims; + struct in_msource *lims; + + nims = malloc(sizeof(struct in_msource), M_INMFILTER, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (NULL); + lims = (struct in_msource *)nims; + lims->ims_haddr = ntohl(psin->sin_addr.s_addr); + lims->imsl_st[0] = MCAST_UNDEFINED; + lims->imsl_st[1] = st1; + RB_INSERT(ip_msource_tree, &imf->imf_sources, nims); + ++imf->imf_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1; it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value.
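+ *
+ * imf_graft() and imf_prune() are the two halves of the delta API
+ * (editor's sketch, mirroring inp_block_unblock_source() below):
+ *
+ *        ims = imf_graft(imf, MCAST_EXCLUDE, &ssa->sin);    (block S)
+ *        error = imf_prune(imf, &ssa->sin);                 (unblock S)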
+ */ +static int +imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) +{ + struct ip_msource find; + struct ip_msource *ims; + struct in_msource *lims; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + */ +static void +imf_rollback(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->imsl_st[1] = lims->imsl_st[0]; + } else { + /* revert source added t1 */ + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + } + imf->imf_st[1] = imf->imf_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + */ +static void +imf_leave(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + } + imf->imf_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + */ +static void +imf_commit(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[0] = lims->imsl_st[1]; + } + imf->imf_st[0] = imf->imf_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. + */ +static void +imf_reap(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if ((lims->imsl_st[0] == MCAST_UNDEFINED) && + (lims->imsl_st[1] == MCAST_UNDEFINED)) { + CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + */ +static void +imf_purge(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + free(ims, M_INMFILTER); + imf->imf_nsrc--; + } + imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; + KASSERT(RB_EMPTY(&imf->imf_sources), + ("%s: imf_sources not empty", __func__)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * haddr is the host-byte-order IPv4 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * SMPng: NOTE: may be called with locks held. + * Return 0 if successful, otherwise return a non-zero error code. 
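+ *
+ * (Editor's note: the t0/t1 transaction pattern built from the imf_*
+ * helpers above is, in sketch form,
+ *
+ *        error = inm_merge(inm, imf);
+ *        if (error == 0)
+ *                error = igmp_change_state(inm);
+ *        if (error)
+ *                imf_rollback(imf);
+ *        else
+ *                imf_commit(imf);
+ *        imf_reap(imf);
+ *
+ * as used by the inp_*() socket option handlers later in this file.)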
+ */ +static int +inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; +#ifdef KTR + struct in_addr ia; +#endif + + find.ims_haddr = haddr; + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (ENOSPC); + nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE, + M_NOWAIT | M_ZERO); + if (nims == NULL) + return (ENOMEM); + nims->ims_haddr = haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; +#ifdef KTR + ia.s_addr = htonl(haddr); + CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__, + inet_ntoa(ia), ims); +#endif + } + + *pims = ims; + return (0); +} + +/* + * Merge socket-layer source into IGMP-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +ims_merge(struct ip_msource *ims, const struct in_msource *lims, + const int rollback) +{ + int n = rollback ? -1 : 1; +#ifdef KTR + struct in_addr ia; + + ia.s_addr = htonl(ims->ims_haddr); +#endif + + if (lims->imsl_st[0] == MCAST_EXCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].ex -= n; + } else if (lims->imsl_st[0] == MCAST_INCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].in -= n; + } + + if (lims->imsl_st[1] == MCAST_EXCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].ex += n; + } else if (lims->imsl_st[1] == MCAST_INCLUDE) { + CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s", + __func__, n, inet_ntoa(ia)); + ims->ims_st[1].in += n; + } +} + +/* + * Atomically update the global in_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. + * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ip_msource *ims, *nims; + struct in_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. 
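+ *
+ * (Editor's worked example: a single socket moving source S from an
+ * include-mode filter at t0 to an exclude-mode filter at t1 makes
+ * ims_merge() apply ims_st[1].in -= 1 and ims_st[1].ex += 1 on the
+ * shared node for S; a rollback applies the same deltas negated.)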
+ */ + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; + if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; + if (lims->imsl_st[0] == lims->imsl_st[1]) continue; + error = inm_get_source(inm, lims->ims_haddr, 0, &nims); + ++schanged; + if (error) + break; + ims_merge(nims, lims, 0); + } + if (error) { + struct ip_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) + continue; + (void)inm_get_source(inm, lims->ims_haddr, 1, &bims); + if (bims == NULL) + continue; + ims_merge(bims, lims, 1); + } + goto out_reap; + } + + CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1", + __func__, nsrc0, nsrc1); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ + if (imf->imf_st[0] == imf->imf_st[1] && + imf->imf_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); + --inm->inm_st[1].iss_in; + } + } + + /* Handle filter mode transition on socket. */ + if (imf->imf_st[0] != imf->imf_st[1]) { + CTR3(KTR_IGMPV3, "%s: imf transition %d to %d", + __func__, imf->imf_st[0], imf->imf_st[1]); + + if (imf->imf_st[0] == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__); + --inm->inm_st[1].iss_ex; + } else if (imf->imf_st[0] == MCAST_INCLUDE) { + CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__); + --inm->inm_st[1].iss_in; + } + + if (imf->imf_st[1] == MCAST_EXCLUDE) { + CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__); + inm->inm_st[1].iss_ex++; + } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__); + inm->inm_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. + * Otherwise, if only inclusive listeners, stack-wide is inclusive. + * If no listeners remain, state is undefined at t1, + * and the IGMP lifecycle for this group should finish. + */ + if (inm->inm_st[1].iss_ex > 0) { + CTR1(KTR_IGMPV3, "%s: transition to EX", __func__); + inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->inm_st[1].iss_in > 0) { + CTR1(KTR_IGMPV3, "%s: transition to IN", __func__); + inm->inm_st[1].iss_fmode = MCAST_INCLUDE; + } else { + CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. */ + if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->imf_st[1] != MCAST_EXCLUDE) || + (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) + CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__); + --inm->inm_st[1].iss_asm; + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__); + inm->inm_st[1].iss_asm++; + } + + CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm); + inm_print(inm); + +out_reap: + if (schanged > 0) { + CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__); + inm_reap(inm); + } + return (error); +} + +/* + * Mark an in_multi's filter set deltas as committed. + * Called by IGMP after a state change has been enqueued. 
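+ *
+ * (Editor's summary of the stack-wide mode just computed by inm_merge()
+ * above:
+ *
+ *        iss_ex > 0                 ->  MCAST_EXCLUDE
+ *        iss_ex == 0, iss_in > 0    ->  MCAST_INCLUDE
+ *        no listeners               ->  MCAST_UNDEFINED
+ *
+ * Committing copies this t1 state to t0.)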
+ */ +void +inm_commit(struct in_multi *inm) +{ + struct ip_msource *ims; + + CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm); + CTR1(KTR_IGMPV3, "%s: pre commit:", __func__); + inm_print(inm); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + ims->ims_st[0] = ims->ims_st[1]; + } + inm->inm_st[0] = inm->inm_st[1]; +} + +/* + * Reap unreferenced nodes from an in_multi's filter set. + */ +static void +inm_reap(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || + ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || + ims->ims_stp != 0) + continue; + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + free(ims, M_IPMSOURCE); + inm->inm_nsrc--; + } +} + +/* + * Purge all source nodes from an in_multi's filter set. + */ +static void +inm_purge(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + free(ims, M_IPMSOURCE); + inm->inm_nsrc--; + } +} + +/* + * Join a multicast group; unlocked entry point. + * + * SMPng: XXX: in_joingroup() is called from in_control() when Giant + * is not held. Fortunately, ifp is unlikely to have been detached + * at this point, so we assume it's OK to recurse. + */ +int +in_joingroup(struct ifnet *ifp, const struct in_addr *gina, + /*const*/ struct in_mfilter *imf, struct in_multi **pinm) +{ + int error; + + IN_MULTI_LOCK(); + error = in_joingroup_locked(ifp, gina, imf, pinm); + IN_MULTI_UNLOCK(); + + return (error); +} + +/* + * Join a multicast group; real entry point. + * + * Only preserves atomicity at inm level. + * NOTE: imf argument cannot be const due to sys/tree.h limitations. + * + * If the IGMP downcall fails, the group is not joined, and an error + * code is returned. + */ +int +in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina, + /*const*/ struct in_mfilter *imf, struct in_multi **pinm) +{ + struct in_mfilter timf; + struct in_multi *inm; + int error; + + IN_MULTI_LOCK_ASSERT(); + + CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__, + inet_ntoa(*gina), ifp, ifp->if_xname); + + error = 0; + inm = NULL; + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = &timf; + } + + error = in_getmulti(ifp, gina, &inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__); + return (error); + } + + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_inm_release; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to update source", __func__); + goto out_inm_release; + } + +out_inm_release: + if (error) { + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); + inm_release_locked(inm); + } else { + *pinm = inm; + } + + return (error); +} + +/* + * Leave a multicast group; unlocked entry point. 
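+ *
+ * (Editor's sketch of a kernel consumer pairing the unlocked entry
+ * points; passing imf == NULL requests the faked-up ASM filter:
+ *
+ *        error = in_joingroup(ifp, &gina, NULL, &inm);
+ *        ...
+ *        error = in_leavegroup(inm, NULL);
+ * )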
+ */ +int +in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ifnet *ifp; + int error; + + ifp = inm->inm_ifp; + + IN_MULTI_LOCK(); + error = in_leavegroup_locked(inm, imf); + IN_MULTI_UNLOCK(); + + return (error); +} + +/* + * Leave a multicast group; real entry point. + * All source filters will be expunged. + * + * Only preserves atomicity at inm level. + * + * Holding the write lock for the INP which contains imf + * is highly advisable. We can't assert for it as imf does not + * contain a back-pointer to the owning inp. + * + * Note: This is not the same as inm_release(*) as this function also + * makes a state change downcall into IGMP. + */ +int +in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct in_mfilter timf; + int error; + + error = 0; + + IN_MULTI_LOCK_ASSERT(); + + CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__, + inm, inet_ntoa(inm->inm_addr), + (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname), + imf); + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); + imf = &timf; + } + + /* + * Begin state merge transaction at IGMP layer. + * + * As this particular invocation should not cause any memory + * to be allocated, and there is no opportunity to roll back + * the transaction, it MUST NOT fail. + */ + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + KASSERT(error == 0, ("%s: failed to merge inm state", __func__)); + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm); + inm_release_locked(inm); + + return (error); +} + +/*#ifndef BURN_BRIDGES*/ +/* + * Join an IPv4 multicast group in (*,G) exclusive mode. + * The group must be a 224.0.0.0/24 link-scope group. + * This KPI is for legacy kernel consumers only. + */ +struct in_multi * +in_addmulti(struct in_addr *ap, struct ifnet *ifp) +{ + struct in_multi *pinm; + int error; + + KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), + ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap))); + + error = in_joingroup(ifp, ap, NULL, &pinm); + if (error != 0) + pinm = NULL; + + return (pinm); +} + +/* + * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. + * This KPI is for legacy kernel consumers only. + */ +void +in_delmulti(struct in_multi *inm) +{ + + (void)in_leavegroup(inm, NULL); +} +/*#endif*/ + +/* + * Block or unblock an ASM multicast source on an inpcb. + * This implements the delta-based API described in RFC 3678. + * + * The delta-based API applies only to exclusive-mode memberships. + * An IGMP downcall will be performed. + * + * SMPng: NOTE: Must take Giant as a join may create a new ifma. + * + * Return 0 if successful, otherwise return an appropriate error code. 
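+ *
+ * (Editor's illustration of the userland view; the socket s and the
+ * addresses are hypothetical:
+ *
+ *        struct ip_mreq_source mreqs;
+ *        mreqs.imr_multiaddr.s_addr  = inet_addr("239.1.1.1");
+ *        mreqs.imr_sourceaddr.s_addr = inet_addr("192.0.2.7");
+ *        mreqs.imr_interface.s_addr  = htonl(INADDR_ANY);
+ *        (void)setsockopt(s, IPPROTO_IP, IP_BLOCK_SOURCE,
+ *            &mreqs, sizeof(mreqs));
+ * )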
+ */ +static int +inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm; + size_t idx; + uint16_t fmode; + int error, doblock; + + ifp = NULL; + error = 0; + doblock = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + ssa = (sockunion_t *)&gsr.gsr_source; + + switch (sopt->sopt_name) { + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: { + struct ip_mreq_source mreqs; + + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + + if (!in_nullhost(mreqs.imr_interface)) + INADDR_TO_IFP(mreqs.imr_interface, ifp); + + if (sopt->sopt_name == IP_BLOCK_SOURCE) + doblock = 1; + + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); + break; + } + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) + return (EADDRNOTAVAIL); + + ifp = ifnet_byindex(gsr.gsr_interface); + + if (sopt->sopt_name == MCAST_BLOCK_SOURCE) + doblock = 1; + break; + + default: + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, sopt->sopt_name); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Check if we are actually a member of this group. + */ + imo = inp_findmoptions(inp); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == -1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + + KASSERT(imo->imo_mfilters != NULL, + ("%s: imo_mfilters not allocated", __func__)); + imf = &imo->imo_mfilters[idx]; + inm = imo->imo_membership[idx]; + + /* + * Attempting to use the delta-based API on an + * non exclusive-mode membership is an error. + */ + fmode = imf->imf_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_inp_locked; + } + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. + */ + ims = imo_match_source(imo, idx, &ssa->sa); + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, + inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not "); + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + + INP_WLOCK_ASSERT(inp); + + /* + * Begin state merge transaction at socket layer. 
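+ * (Editor's note: the up-front checks above guarantee that exactly one
+ * of the two branches below applies:
+ *
+ *        doblock   ->  imf_graft(imf, fmode, &ssa->sin)
+ *        !doblock  ->  imf_prune(imf, &ssa->sin)
+ * )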
+ */ + if (doblock) { + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); + ims = imf_graft(imf, fmode, &ssa->sin); + if (ims == NULL) + error = ENOMEM; + } else { + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); + error = imf_prune(imf, &ssa->sin); + } + + if (error) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__); + goto out_imf_rollback; + } + + /* + * Begin state merge transaction at IGMP layer. + */ + IN_MULTI_LOCK(); + + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_imf_rollback; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_inp_locked: + INP_WUNLOCK(inp); + return (error); +} + +/* + * Given an inpcb, return its multicast options structure pointer. Accepts + * an unlocked inpcb pointer, but will return it locked. May sleep. + * + * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. + * SMPng: NOTE: Returns with the INP write lock held. + */ +static struct ip_moptions * +inp_findmoptions(struct inpcb *inp) +{ + struct ip_moptions *imo; + struct in_multi **immp; + struct in_mfilter *imfp; + size_t idx; + + INP_WLOCK(inp); + if (inp->inp_moptions != NULL) + return (inp->inp_moptions); + + INP_WUNLOCK(inp); + + imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK); + immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, + M_WAITOK | M_ZERO); + imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS, + M_INMFILTER, M_WAITOK); + + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = in_mcast_loop; + imo->imo_num_memberships = 0; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_membership = immp; + + /* Initialize per-group source filters. */ + for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) + imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + imo->imo_mfilters = imfp; + + INP_WLOCK(inp); + if (inp->inp_moptions != NULL) { + free(imfp, M_INMFILTER); + free(immp, M_IPMOPTS); + free(imo, M_IPMOPTS); + return (inp->inp_moptions); + } + inp->inp_moptions = imo; + return (imo); +} + +/* + * Discard the IP multicast options (and source filters). + * + * SMPng: NOTE: assumes INP write lock is held. + */ +void +inp_freemoptions(struct ip_moptions *imo) +{ + struct in_mfilter *imf; + size_t idx, nmships; + + KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__)); + + nmships = imo->imo_num_memberships; + for (idx = 0; idx < nmships; ++idx) { + imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL; + if (imf) + imf_leave(imf); + (void)in_leavegroup(imo->imo_membership[idx], imf); + if (imf) + imf_purge(imf); + } + + if (imo->imo_mfilters) + free(imo->imo_mfilters, M_INMFILTER); + free(imo->imo_membership, M_IPMOPTS); + free(imo, M_IPMOPTS); +} + +/* + * Atomically get source filters on a socket for an IPv4 multicast group. + * Called with INP lock held; returns with lock released. 
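+ *
+ * (Editor's note on the pattern used here and in inp_findmoptions()
+ * above: the INP write lock is dropped around copyin/copyout and
+ * M_WAITOK allocations, then retaken and the state revalidated, e.g.
+ *
+ *        INP_WUNLOCK(inp);
+ *        error = sooptcopyin(sopt, &msfr, sizeof(msfr), sizeof(msfr));
+ *        ...
+ *        INP_WLOCK(inp);
+ * )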
+ */ +static int +inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq msfr; + sockunion_t *gsa; + struct ifnet *ifp; + struct ip_moptions *imo; + struct in_mfilter *imf; + struct ip_msource *ims; + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *ptss; + struct sockaddr_storage *tss; + int error; + size_t idx, nsrcs, ncsrcs; + + INP_WLOCK_ASSERT(inp); + + imo = inp->inp_moptions; + KASSERT(imo != NULL, ("%s: null ip_moptions", __func__)); + + INP_WUNLOCK(inp); + + error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), + sizeof(struct __msfilterreq)); + if (error) + return (error); + + if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) + return (EINVAL); + + ifp = ifnet_byindex(msfr.msfr_ifindex); + if (ifp == NULL) + return (EINVAL); + + INP_WLOCK(inp); + + /* + * Lookup group on the socket. + */ + gsa = (sockunion_t *)&msfr.msfr_group; + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == -1 || imo->imo_mfilters == NULL) { + INP_WUNLOCK(inp); + return (EADDRNOTAVAIL); + } + imf = &imo->imo_mfilters[idx]; + + /* + * Ignore memberships which are in limbo. + */ + if (imf->imf_st[1] == MCAST_UNDEFINED) { + INP_WUNLOCK(inp); + return (EAGAIN); + } + msfr.msfr_fmode = imf->imf_st[1]; + + /* + * If the user specified a buffer, copy out the source filter + * entries to userland gracefully. + * We only copy out the number of entries which userland + * has asked for, but we always tell userland how big the + * buffer really needs to be. + */ + tss = NULL; + if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) { + tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_NOWAIT | M_ZERO); + if (tss == NULL) { + INP_WUNLOCK(inp); + return (ENOBUFS); + } + } + + /* + * Count number of sources in-mode at t0. + * If buffer space exists and remains, copy out source entries. + */ + nsrcs = msfr.msfr_nsrcs; + ncsrcs = 0; + ptss = tss; + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == MCAST_UNDEFINED || + lims->imsl_st[0] != imf->imf_st[0]) + continue; + ++ncsrcs; + if (tss != NULL && nsrcs > 0) { + psin = (struct sockaddr_in *)ptss; + psin->sin_family = AF_INET; + psin->sin_len = sizeof(struct sockaddr_in); + psin->sin_addr.s_addr = htonl(lims->ims_haddr); + psin->sin_port = 0; + ++ptss; + --nsrcs; + } + } + + INP_WUNLOCK(inp); + + if (tss != NULL) { + error = copyout(tss, msfr.msfr_srcs, + sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + free(tss, M_TEMP); + if (error) + return (error); + } + + msfr.msfr_nsrcs = ncsrcs; + error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq)); + + return (error); +} + +/* + * Return the IP multicast options in response to user getsockopt(). + */ +int +inp_getmoptions(struct inpcb *inp, struct sockopt *sopt) +{ + struct ip_mreqn mreqn; + struct ip_moptions *imo; + struct ifnet *ifp; + struct in_ifaddr *ia; + int error, optval; + u_char coptval; + + INP_WLOCK(inp); + imo = inp->inp_moptions; + /* + * If socket is neither of type SOCK_RAW or SOCK_DGRAM, + * or is a divert socket, reject it. 
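+ * (Editor's illustration: UDP and raw IP sockets pass this check, while
+ * e.g. TCP and divert sockets get EOPNOTSUPP. A caller reading the TTL
+ * back receives a u_char or an int depending on the optlen it passes;
+ * see IP_MULTICAST_TTL below:
+ *
+ *        u_char ttl;
+ *        socklen_t len = sizeof(ttl);
+ *        (void)getsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, &len);
+ * )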
+ */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { + INP_WUNLOCK(inp); + return (EOPNOTSUPP); + } + + error = 0; + switch (sopt->sopt_name) { + case IP_MULTICAST_VIF: + if (imo != NULL) + optval = imo->imo_multicast_vif; + else + optval = -1; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MULTICAST_IF: + memset(&mreqn, 0, sizeof(struct ip_mreqn)); + if (imo != NULL) { + ifp = imo->imo_multicast_ifp; + if (!in_nullhost(imo->imo_multicast_addr)) { + mreqn.imr_address = imo->imo_multicast_addr; + } else if (ifp != NULL) { + mreqn.imr_ifindex = ifp->if_index; + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + mreqn.imr_address = + IA_SIN(ia)->sin_addr; + ifa_free(&ia->ia_ifa); + } + } + } + INP_WUNLOCK(inp); + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + error = sooptcopyout(sopt, &mreqn, + sizeof(struct ip_mreqn)); + } else { + error = sooptcopyout(sopt, &mreqn.imr_address, + sizeof(struct in_addr)); + } + break; + + case IP_MULTICAST_TTL: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else + optval = coptval = imo->imo_multicast_ttl; + INP_WUNLOCK(inp); + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else + optval = coptval = imo->imo_multicast_loop; + INP_WUNLOCK(inp); + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MSFILTER: + if (imo == NULL) { + error = EADDRNOTAVAIL; + INP_WUNLOCK(inp); + } else { + error = inp_get_source_filters(inp, sopt); + } + break; + + default: + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + + INP_UNLOCK_ASSERT(inp); + + return (error); +} + +/* + * Look up the ifnet to use for a multicast group membership, + * given the IPv4 address of an interface, and the IPv4 group address. + * + * This routine exists to support legacy multicast applications + * which do not understand that multicast memberships are scoped to + * specific physical links in the networking stack, or which need + * to join link-scope groups before IPv4 addresses are configured. + * + * If inp is non-NULL, use this socket's current FIB number for any + * required FIB lookup. + * If ina is INADDR_ANY, look up the group address in the unicast FIB, + * and use its ifp; usually, this points to the default next-hop. + * + * If the FIB lookup fails, attempt to use the first non-loopback + * interface with multicast capability in the system as a + * last resort. The legacy IPv4 ASM API requires that we do + * this in order to allow groups to be joined when the routing + * table has not yet been populated during boot. + * + * Returns NULL if no ifp could be found. + * + * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP. + * FUTURE: Implement IPv4 source-address selection. 
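+ *
+ * (Editor's summary of the selection order implemented below:
+ *
+ *        ina != INADDR_ANY  ->  INADDR_TO_IFP(ina, ifp)
+ *        else               ->  unicast FIB lookup on the group; rt_ifp
+ *        no route           ->  first non-loopback IFF_MULTICAST ifp
+ * )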
+ */ +static struct ifnet * +inp_lookup_mcast_ifp(const struct inpcb *inp, + const struct sockaddr_in *gsin, const struct in_addr ina) +{ + struct ifnet *ifp; + + KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__)); + KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)), + ("%s: not multicast", __func__)); + + ifp = NULL; + if (!in_nullhost(ina)) { + INADDR_TO_IFP(ina, ifp); + } else { + struct route ro; + + ro.ro_rt = NULL; + memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); + in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0); + if (ro.ro_rt != NULL) { + ifp = ro.ro_rt->rt_ifp; + KASSERT(ifp != NULL, ("%s: null ifp", __func__)); + RTFREE(ro.ro_rt); + } else { + struct in_ifaddr *ia; + struct ifnet *mifp; + + mifp = NULL; + IN_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + mifp = ia->ia_ifp; + if (!(mifp->if_flags & IFF_LOOPBACK) && + (mifp->if_flags & IFF_MULTICAST)) { + ifp = mifp; + break; + } + } + IN_IFADDR_RUNLOCK(); + } + } + + return (ifp); +} + +/* + * Join an IPv4 multicast group, possibly with a source. + */ +static int +inp_join_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm; + struct in_msource *lims; + size_t idx; + int error, is_new; + + ifp = NULL; + imf = NULL; + error = 0; + is_new = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: { + struct ip_mreq_source mreqs; + + if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Do argument switcharoo from ip_mreq into + * ip_mreq_source to avoid using two instances. + */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, + mreqs.imr_interface); + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); + break; + } + + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_JOIN_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + /* + * Overwrite the port field if present, as the sockaddr + * being copied in may be matched with a binary comparison. 
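+ * (Editor's example: a group_req whose embedded sockaddr_in carried a
+ * stale nonzero sin_port could otherwise defeat such a comparison
+ * against the same group stored with sin_port == 0.)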
+ */ + gsa->sin.sin_port = 0; + if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + ssa->sin.sin_port = 0; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) + return (EADDRNOTAVAIL); + ifp = ifnet_byindex(gsr.gsr_interface); + break; + + default: + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, sopt->sopt_name); + return (EOPNOTSUPP); + break; + } + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) + return (EADDRNOTAVAIL); + + imo = inp_findmoptions(inp); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == -1) { + is_new = 1; + } else { + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC) { + /* + * MCAST_JOIN_SOURCE_GROUP on an exclusive membership + * is an error. On an existing inclusive membership, + * it just adds the source to the filter list. + */ + if (imf->imf_st[1] != MCAST_INCLUDE) { + error = EINVAL; + goto out_inp_locked; + } + /* Throw out duplicates. */ + lims = imo_match_source(imo, idx, &ssa->sa); + if (lims != NULL) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + } else { + /* + * MCAST_JOIN_GROUP on an existing inclusive + * membership is an error; if you want to change + * filter mode, you must use the userland API + * setsourcefilter(). + */ + if (imf->imf_st[1] == MCAST_INCLUDE) { + error = EINVAL; + goto out_inp_locked; + } + /* + * MCAST_JOIN_GROUP on an existing exclusive + * membership is an error; return EADDRINUSE + * to preserve 4.4BSD API idempotence, and + * avoid tedious detour to code below. + * NOTE: This is bending RFC 3678 a bit. + */ + if (imf->imf_st[1] == MCAST_EXCLUDE) { + error = EADDRINUSE; + goto out_inp_locked; + } + } + } + + /* + * Begin state merge transaction at socket layer. + */ + INP_WLOCK_ASSERT(inp); + + if (is_new) { + if (imo->imo_num_memberships == imo->imo_max_memberships) { + error = imo_grow(imo); + if (error) + goto out_inp_locked; + } + /* + * Allocate the new slot upfront so we can deal with + * grafting the new source filter in same code path + * as for join-source on existing membership. + */ + idx = imo->imo_num_memberships; + imo->imo_membership[idx] = NULL; + imo->imo_num_memberships++; + KASSERT(imo->imo_mfilters != NULL, + ("%s: imf_mfilters vector was not allocated", __func__)); + imf = &imo->imo_mfilters[idx]; + KASSERT(RB_EMPTY(&imf->imf_sources), + ("%s: imf_sources not empty", __func__)); + } + + /* + * Graft new source into filter list for this inpcb's + * membership of the group. The in_multi may not have + * been allocated yet if this is a new membership, however, + * the in_mfilter slot will be allocated and must be initialized. + */ + if (ssa->ss.ss_family != AF_UNSPEC) { + /* Membership starts in IN mode */ + if (is_new) { + CTR1(KTR_IGMPV3, "%s: new join w/source", __func__); + imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + } else { + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow"); + } + lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); + if (lims == NULL) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", + __func__); + error = ENOMEM; + goto out_imo_free; + } + } else { + /* No address specified; Membership starts in EX mode */ + if (is_new) { + CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__); + imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); + } + } + + /* + * Begin state merge transaction at IGMP layer. 
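+ * (Editor's sketch of the two paths below:
+ *
+ *        is_new   ->  in_joingroup_locked(ifp, &gsa->sin.sin_addr,
+ *                         imf, &inm)
+ *        !is_new  ->  inm_merge(inm, imf); igmp_change_state(inm)
+ * )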
+ */ + IN_MULTI_LOCK(); + + if (is_new) { + error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf, + &inm); + if (error) + goto out_imo_free; + imo->imo_membership[idx] = inm; + } else { + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", + __func__); + goto out_imf_rollback; + } + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", + __func__); + goto out_imf_rollback; + } + } + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + INP_WLOCK_ASSERT(inp); + if (error) { + imf_rollback(imf); + if (is_new) + imf_purge(imf); + else + imf_reap(imf); + } else { + imf_commit(imf); + } + +out_imo_free: + if (error && is_new) { + imo->imo_membership[idx] = NULL; + --imo->imo_num_memberships; + } + +out_inp_locked: + INP_WUNLOCK(inp); + return (error); +} + +/* + * Leave an IPv4 multicast group on an inpcb, possibly with a source. + */ +static int +inp_leave_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + struct ip_mreq_source mreqs; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm; + size_t idx; + int error, is_final; + + ifp = NULL; + error = 0; + is_final = 1; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Swap interface and sourceaddr arguments, + * as ip_mreq and ip_mreq_source are laid + * out differently. + */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + + /* + * Attempt to look up hinted ifp from interface address. + * Fallthrough with null ifp iff lookup fails, to + * preserve 4.4BSD mcast API idempotence. + * XXX NOTE WELL: The RFC 3678 API is preferred because + * using an IPv4 address as a key is racy. 
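+ * (Editor's note: the race is that the address in mreqs.imr_interface
+ * may have moved to another ifnet between the caller's join and this
+ * leave; the index-based MCAST_LEAVE_GROUP path below avoids this by
+ * resolving ifp via ifnet_byindex().)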
+ */ + if (!in_nullhost(mreqs.imr_interface)) + INADDR_TO_IFP(mreqs.imr_interface, ifp); + + CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p", + __func__, inet_ntoa(mreqs.imr_interface), ifp); + + break; + + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_LEAVE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + } + + if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface) + return (EADDRNOTAVAIL); + + ifp = ifnet_byindex(gsr.gsr_interface); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + break; + + default: + CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d", + __func__, sopt->sopt_name); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Find the membership in the membership array. + */ + imo = inp_findmoptions(inp); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == -1) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + if (ssa->ss.ss_family != AF_UNSPEC) + is_final = 0; + + /* + * Begin state merge transaction at socket layer. + */ + INP_WLOCK_ASSERT(inp); + + /* + * If we were instructed only to leave a given source, do so. + * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. + */ + if (is_final) { + imf_leave(imf); + } else { + if (imf->imf_st[0] == MCAST_EXCLUDE) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + ims = imo_match_source(imo, idx, &ssa->sa); + if (ims == NULL) { + CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__, + inet_ntoa(ssa->sin.sin_addr), "not "); + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block"); + error = imf_prune(imf, &ssa->sin); + if (error) { + CTR1(KTR_IGMPV3, "%s: merge imf state failed", + __func__); + goto out_inp_locked; + } + } + + /* + * Begin state merge transaction at IGMP layer. + */ + IN_MULTI_LOCK(); + + if (is_final) { + /* + * Give up the multicast address record to which + * the membership points. + */ + (void)in_leavegroup_locked(inm, imf); + } else { + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", + __func__); + goto out_imf_rollback; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", + __func__); + } + } + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + + if (is_final) { + /* Remove the gap in the membership and filter array. */ + for (++idx; idx < imo->imo_num_memberships; ++idx) { + imo->imo_membership[idx-1] = imo->imo_membership[idx]; + imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; + } + imo->imo_num_memberships--; + } + +out_inp_locked: + INP_WUNLOCK(inp); + return (error); +} + +/* + * Select the interface for transmitting IPv4 multicast datagrams. 
+ * + * Either an instance of struct in_addr or an instance of struct ip_mreqn + * may be passed to this socket option. An address of INADDR_ANY or an + * interface index of 0 is used to remove a previous selection. + * When no interface is selected, one is chosen for every send. + */ +static int +inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) +{ + struct in_addr addr; + struct ip_mreqn mreqn; + struct ifnet *ifp; + struct ip_moptions *imo; + int error; + + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + /* + * An interface index was specified using the + * Linux-derived ip_mreqn structure. + */ + error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), + sizeof(struct ip_mreqn)); + if (error) + return (error); + + if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex) + return (EINVAL); + + if (mreqn.imr_ifindex == 0) { + ifp = NULL; + } else { + ifp = ifnet_byindex(mreqn.imr_ifindex); + if (ifp == NULL) + return (EADDRNOTAVAIL); + } + } else { + /* + * An interface was specified by IPv4 address. + * This is the traditional BSD usage. + */ + error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), + sizeof(struct in_addr)); + if (error) + return (error); + if (in_nullhost(addr)) { + ifp = NULL; + } else { + INADDR_TO_IFP(addr, ifp); + if (ifp == NULL) + return (EADDRNOTAVAIL); + } + CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp, + inet_ntoa(addr)); + } + + /* Reject interfaces which do not support multicast. */ + if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) + return (EOPNOTSUPP); + + imo = inp_findmoptions(inp); + imo->imo_multicast_ifp = ifp; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + INP_WUNLOCK(inp); + + return (0); +} + +/* + * Atomically set source filters on a socket for an IPv4 multicast group. + * + * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held. + */ +static int +inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq msfr; + sockunion_t *gsa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm; + size_t idx; + int error; + + error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq), + sizeof(struct __msfilterreq)); + if (error) + return (error); + + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc || + (msfr.msfr_fmode != MCAST_EXCLUDE && + msfr.msfr_fmode != MCAST_INCLUDE)) + return (EINVAL); + + if (msfr.msfr_group.ss_family != AF_INET || + msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + gsa = (sockunion_t *)&msfr.msfr_group; + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + gsa->sin.sin_port = 0; /* ignore port */ + + if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex) + return (EADDRNOTAVAIL); + + ifp = ifnet_byindex(msfr.msfr_ifindex); + if (ifp == NULL) + return (EADDRNOTAVAIL); + + /* + * Take the INP write lock. + * Check if this socket is a member of this group. + */ + imo = inp_findmoptions(inp); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == -1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_inp_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + /* + * Begin state merge transaction at socket layer. + */ + INP_WLOCK_ASSERT(inp); + + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Apply any new source filters, if present. + * Make a copy of the user-space source vector so + * that we may copy them with a single copyin. This + * allows us to deal with page faults up-front. 
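+	 *
+	 * A caller typically reaches this path via the setsourcefilter(3)
+	 * library wrapper or a raw IP_MSFILTER setsockopt().  Illustrative
+	 * userland sketch (group, ifindex and sources are hypothetical and
+	 * assumed to have been filled in by the caller):
+	 *
+	 *	struct __msfilterreq msfr;
+	 *	msfr.msfr_fmode = MCAST_INCLUDE;
+	 *	setsockopt(s, IPPROTO_IP, IP_MSFILTER, &msfr, sizeof(msfr));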
+ */ + if (msfr.msfr_nsrcs > 0) { + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *kss, *pkss; + int i; + + INP_WUNLOCK(inp); + + CTR2(KTR_IGMPV3, "%s: loading %lu source list entries", + __func__, (unsigned long)msfr.msfr_nsrcs); + kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK); + error = copyin(msfr.msfr_srcs, kss, + sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + if (error) { + free(kss, M_TEMP); + return (error); + } + + INP_WLOCK(inp); + + /* + * Mark all source filters as UNDEFINED at t1. + * Restore new group filter mode, as imf_leave() + * will set it to INCLUDE. + */ + imf_leave(imf); + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Update socket layer filters at t1, lazy-allocating + * new entries. This saves a bunch of memory at the + * cost of one RB_FIND() per source entry; duplicate + * entries in the msfr_nsrcs vector are ignored. + * If we encounter an error, rollback transaction. + * + * XXX This too could be replaced with a set-symmetric + * difference like loop to avoid walking from root + * every time, as the key space is common. + */ + for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) { + psin = (struct sockaddr_in *)pkss; + if (psin->sin_family != AF_INET) { + error = EAFNOSUPPORT; + break; + } + if (psin->sin_len != sizeof(struct sockaddr_in)) { + error = EINVAL; + break; + } + error = imf_get_source(imf, psin, &lims); + if (error) + break; + lims->imsl_st[1] = imf->imf_st[1]; + } + free(kss, M_TEMP); + } + + if (error) + goto out_imf_rollback; + + INP_WLOCK_ASSERT(inp); + IN_MULTI_LOCK(); + + /* + * Begin state merge transaction at IGMP layer. + */ + CTR1(KTR_IGMPV3, "%s: merge inm state", __func__); + error = inm_merge(inm, imf); + if (error) { + CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__); + goto out_imf_rollback; + } + + CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__); + error = igmp_change_state(inm); + if (error) + CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__); + + IN_MULTI_UNLOCK(); + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_inp_locked: + INP_WUNLOCK(inp); + return (error); +} + +/* + * Set the IP multicast options in response to user setsockopt(). + * + * Many of the socket options handled in this function duplicate the + * functionality of socket options in the regular unicast API. However, + * it is not possible to merge the duplicate code, because the idempotence + * of the IPv4 multicast part of the BSD Sockets API must be preserved; + * the effects of these options must be treated as separate and distinct. + * + * SMPng: XXX: Unlocked read of inp_socket believed OK. + * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING + * is refactored to no longer use vifs. + */ +int +inp_setmoptions(struct inpcb *inp, struct sockopt *sopt) +{ + struct ip_moptions *imo; + int error; + + error = 0; + + /* + * If socket is neither of type SOCK_RAW or SOCK_DGRAM, + * or is a divert socket, reject it. + */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + return (EOPNOTSUPP); + + switch (sopt->sopt_name) { + case IP_MULTICAST_VIF: { + int vifi; + /* + * Select a multicast VIF for transmission. + * Only useful if multicast forwarding is active. 
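+		 *
+		 * legal_vif_num is a function pointer filled in by the
+		 * multicast routing code when it is active, hence the
+		 * NULL check below.  Illustrative userland usage (the
+		 * VIF number is hypothetical):
+		 *
+		 *	int vifi = 3;
+		 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_VIF,
+		 *	    &vifi, sizeof(vifi));
+		 *
+		 * A vifi of -1 clears any previous selection.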
+ */ + if (legal_vif_num == NULL) { + error = EOPNOTSUPP; + break; + } + error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int)); + if (error) + break; + if (!legal_vif_num(vifi) && (vifi != -1)) { + error = EINVAL; + break; + } + imo = inp_findmoptions(inp); + imo->imo_multicast_vif = vifi; + INP_WUNLOCK(inp); + break; + } + + case IP_MULTICAST_IF: + error = inp_set_multicast_if(inp, sopt); + break; + + case IP_MULTICAST_TTL: { + u_char ttl; + + /* + * Set the IP time-to-live for outgoing multicast packets. + * The original multicast API required a char argument, + * which is inconsistent with the rest of the socket API. + * We allow either a char or an int. + */ + if (sopt->sopt_valsize == sizeof(u_char)) { + error = sooptcopyin(sopt, &ttl, sizeof(u_char), + sizeof(u_char)); + if (error) + break; + } else { + u_int ittl; + + error = sooptcopyin(sopt, &ittl, sizeof(u_int), + sizeof(u_int)); + if (error) + break; + if (ittl > 255) { + error = EINVAL; + break; + } + ttl = (u_char)ittl; + } + imo = inp_findmoptions(inp); + imo->imo_multicast_ttl = ttl; + INP_WUNLOCK(inp); + break; + } + + case IP_MULTICAST_LOOP: { + u_char loop; + + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. The original multicast API required a + * char argument, which is inconsistent with the rest + * of the socket API. We allow either a char or an int. + */ + if (sopt->sopt_valsize == sizeof(u_char)) { + error = sooptcopyin(sopt, &loop, sizeof(u_char), + sizeof(u_char)); + if (error) + break; + } else { + u_int iloop; + + error = sooptcopyin(sopt, &iloop, sizeof(u_int), + sizeof(u_int)); + if (error) + break; + loop = (u_char)iloop; + } + imo = inp_findmoptions(inp); + imo->imo_multicast_loop = !!loop; + INP_WUNLOCK(inp); + break; + } + + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + error = inp_join_group(inp, sopt); + break; + + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + error = inp_leave_group(inp, sopt); + break; + + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_block_unblock_source(inp, sopt); + break; + + case IP_MSFILTER: + error = inp_set_source_filters(inp, sopt); + break; + + default: + error = EOPNOTSUPP; + break; + } + + INP_UNLOCK_ASSERT(inp); + + return (error); +} + +/* + * Expose IGMP's multicast filter mode and source list(s) to userland, + * keyed by (ifindex, group). + * The filter mode is written out as a uint32_t, followed by + * 0..n of struct in_addr. + * For use by ifmcstat(8). + * SMPng: NOTE: unlocked read of ifindex space. 
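+ *
+ * Schematically, the reply for one (ifindex, group) key is laid out as
+ * follows (a sketch; the number of sources depends on the filter state,
+ * and the copy-out is bounded by in_mcast_maxgrpsrc):
+ *
+ *	uint32_t	fmode;		MCAST_INCLUDE or MCAST_EXCLUDE
+ *	struct in_addr	src[];		in-mode sources only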
+ */ +static int +sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS) +{ + struct in_addr src, group; + struct ifnet *ifp; + struct ifmultiaddr *ifma; + struct in_multi *inm; + struct ip_msource *ims; + int *name; + int retval; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != NULL) + return (EPERM); + + if (namelen != 2) + return (EINVAL); + + ifindex = name[0]; + if (ifindex <= 0 || ifindex > V_if_index) { + CTR2(KTR_IGMPV3, "%s: ifindex %u out of range", + __func__, ifindex); + return (ENOENT); + } + + group.s_addr = name[1]; + if (!IN_MULTICAST(ntohl(group.s_addr))) { + CTR2(KTR_IGMPV3, "%s: group %s is not multicast", + __func__, inet_ntoa(group)); + return (EINVAL); + } + + ifp = ifnet_byindex(ifindex); + if (ifp == NULL) { + CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u", + __func__, ifindex); + return (ENOENT); + } + + retval = sysctl_wire_old_buffer(req, + sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr))); + if (retval) + return (retval); + + IN_MULTI_LOCK(); + + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_INET || + ifma->ifma_protospec == NULL) + continue; + inm = (struct in_multi *)ifma->ifma_protospec; + if (!in_hosteq(inm->inm_addr, group)) + continue; + fmode = inm->inm_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) + break; + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { +#ifdef KTR + struct in_addr ina; + ina.s_addr = htonl(ims->ims_haddr); + CTR2(KTR_IGMPV3, "%s: visit node %s", __func__, + inet_ntoa(ina)); +#endif + /* + * Only copy-out sources which are in-mode. + */ + if (fmode != ims_get_mode(inm, ims, 1)) { + CTR1(KTR_IGMPV3, "%s: skip non-in-mode", + __func__); + continue; + } + src.s_addr = htonl(ims->ims_haddr); + retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); + if (retval != 0) + break; + } + } + IF_ADDR_UNLOCK(ifp); + + IN_MULTI_UNLOCK(); + + return (retval); +} + +#ifdef KTR + +static const char *inm_modestrs[] = { "un", "in", "ex" }; + +static const char * +inm_mode_str(const int mode) +{ + + if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE) + return (inm_modestrs[mode]); + return ("??"); +} + +static const char *inm_statestrs[] = { + "not-member", + "silent", + "idle", + "lazy", + "sleeping", + "awakening", + "query-pending", + "sg-query-pending", + "leaving" +}; + +static const char * +inm_state_str(const int state) +{ + + if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER) + return (inm_statestrs[state]); + return ("??"); +} + +/* + * Dump an in_multi structure to the console. 
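+ * Produces output only when KTR_IGMPV3 tracing is enabled in ktr_mask;
+ * see the guard at the top of the function.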
+ */ +void +inm_print(const struct in_multi *inm) +{ + int t; + + if ((ktr_mask & KTR_IGMPV3) == 0) + return; + + printf("%s: --- begin inm %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s) ifma %p\n", + inet_ntoa(inm->inm_addr), + inm->inm_ifp, + inm->inm_ifp->if_xname, + inm->inm_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->inm_timer, + inm_state_str(inm->inm_state), + inm->inm_refcount, + inm->inm_scq.ifq_len); + printf("igi %p nsrc %lu sctimer %u scrv %u\n", + inm->inm_igi, + inm->inm_nsrc, + inm->inm_sctimer, + inm->inm_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + inm_mode_str(inm->inm_st[t].iss_fmode), + inm->inm_st[t].iss_asm, + inm->inm_st[t].iss_ex, + inm->inm_st[t].iss_in, + inm->inm_st[t].iss_rec); + } + printf("%s: --- end inm %p ---\n", __func__, inm); +} + +#else /* !KTR */ + +void +inm_print(const struct in_multi *inm) +{ + +} + +#endif /* KTR */ + +RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); diff --git a/freebsd/sys/netinet/in_pcb.c b/freebsd/sys/netinet/in_pcb.c new file mode 100644 index 00000000..186a0f0a --- /dev/null +++ b/freebsd/sys/netinet/in_pcb.c @@ -0,0 +1,1958 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * The Regents of the University of California. + * Copyright (c) 2007-2009 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#endif /* INET6 */ + + +#ifdef IPSEC +#include +#include +#endif /* IPSEC */ + +#include + +/* + * These configure the range of local port addresses assigned to + * "unspecified" outgoing connections/packets/whatever. 
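+ *
+ * They are exported below as the net.inet.ip.portrange.* sysctl tree,
+ * e.g. (illustrative values):
+ *
+ *	sysctl net.inet.ip.portrange.first=10000
+ *	sysctl net.inet.ip.portrange.last=65535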
+ */ +VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ +VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ +VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ +VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ +VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ +VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ + +/* + * Reserved ports accessible only to root. There are significant + * security considerations that must be accounted for when changing these, + * but the security benefits can be great. Please be careful. + */ +VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ +VNET_DEFINE(int, ipport_reservedlow); + +/* Variables dealing with random ephemeral port allocation. */ +VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ +VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ +VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ +VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ +VNET_DEFINE(int, ipport_tcpallocs); +static VNET_DEFINE(int, ipport_tcplastcount); + +#define V_ipport_tcplastcount VNET(ipport_tcplastcount) + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static void in_pcbremlists(struct inpcb *inp); + +static int +sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) +{ + int error; + +#ifdef VIMAGE + error = vnet_sysctl_handle_int(oidp, arg1, arg2, req); +#else + error = sysctl_handle_int(oidp, arg1, arg2, req); +#endif + if (error == 0) { + RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); + RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); + RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); + } + return (error); +} + +#undef RANGECHK + +SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); + +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, first, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, last, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, + &sysctl_net_ipport_check, "I", ""); +SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, + CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); +SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, + CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); +SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW, + &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
+	&VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
+	"allocations before switching to a sequential one");
+SYSCTL_VNET_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
+	&VNET_NAME(ipport_randomtime), 0,
+	"Minimum time to keep sequential port "
+	"allocation before switching to a random one");
+
+/*
+ * in_pcb.c: manage the Protocol Control Blocks.
+ *
+ * NOTE: It is assumed that most of these functions will be called with
+ * the pcbinfo lock held, and often, the inpcb lock held, as these utility
+ * functions often modify hash chains or addresses in pcbs.
+ */
+
+/*
+ * Allocate a PCB and associate it with the socket.
+ * On success return with the PCB locked.
+ */
+int
+in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
+{
+	struct inpcb *inp;
+	int error;
+
+	INP_INFO_WLOCK_ASSERT(pcbinfo);
+	error = 0;
+	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
+	if (inp == NULL)
+		return (ENOBUFS);
+	bzero(inp, inp_zero_size);
+	inp->inp_pcbinfo = pcbinfo;
+	inp->inp_socket = so;
+	inp->inp_cred = crhold(so->so_cred);
+	inp->inp_inc.inc_fibnum = so->so_fibnum;
+#ifdef MAC
+	error = mac_inpcb_init(inp, M_NOWAIT);
+	if (error != 0)
+		goto out;
+	mac_inpcb_create(so, inp);
+#endif
+#ifdef IPSEC
+	error = ipsec_init_policy(so, &inp->inp_sp);
+	if (error != 0) {
+#ifdef MAC
+		mac_inpcb_destroy(inp);
+#endif
+		goto out;
+	}
+#endif /*IPSEC*/
+#ifdef INET6
+	if (INP_SOCKAF(so) == AF_INET6) {
+		inp->inp_vflag |= INP_IPV6PROTO;
+		if (V_ip6_v6only)
+			inp->inp_flags |= IN6P_IPV6_V6ONLY;
+	}
+#endif
+	LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
+	pcbinfo->ipi_count++;
+	so->so_pcb = (caddr_t)inp;
+#ifdef INET6
+	if (V_ip6_auto_flowlabel)
+		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
+#endif
+	INP_WLOCK(inp);
+	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
+	inp->inp_refcount = 1;	/* Reference from the inpcbinfo */
+#if defined(IPSEC) || defined(MAC)
+out:
+	if (error != 0) {
+		crfree(inp->inp_cred);
+		uma_zfree(pcbinfo->ipi_zone, inp);
+	}
+#endif
+	return (error);
+}
+
+int
+in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+{
+	int anonport, error;
+
+	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
+		return (EINVAL);
+	anonport = inp->inp_lport == 0 && (nam == NULL ||
+	    ((struct sockaddr_in *)nam)->sin_port == 0);
+	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
+	    &inp->inp_lport, cred);
+	if (error)
+		return (error);
+	if (in_pcbinshash(inp) != 0) {
+		inp->inp_laddr.s_addr = INADDR_ANY;
+		inp->inp_lport = 0;
+		return (EAGAIN);
+	}
+	if (anonport)
+		inp->inp_flags |= INP_ANONPORT;
+	return (0);
+}
+
+/*
+ * Set up a bind operation on a PCB, performing port allocation
+ * as required, but do not actually modify the PCB. Callers can
+ * either complete the bind by setting inp_laddr/inp_lport and
+ * calling in_pcbinshash(), or they can just use the resulting
+ * port and address to authorise the sending of a once-off packet.
+ *
+ * On error, the values of *laddrp and *lportp are not changed.
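+ *
+ * A typical caller sequence, mirroring in_pcbbind() above (sketch):
+ *
+ *	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
+ *	    &inp->inp_lport, cred);
+ *	if (error == 0)
+ *		error = in_pcbinshash(inp);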
+ */
+int
+in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
+    u_short *lportp, struct ucred *cred)
+{
+	struct socket *so = inp->inp_socket;
+	unsigned short *lastport;
+	struct sockaddr_in *sin;
+	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+	struct in_addr laddr;
+	u_short lport = 0;
+	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
+	int error;
+	int dorandom;
+
+	/*
+	 * Because no actual state changes occur here, a global write lock on
+	 * the pcbinfo isn't required.
+	 */
+	INP_INFO_LOCK_ASSERT(pcbinfo);
+	INP_LOCK_ASSERT(inp);
+
+	if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
+		return (EADDRNOTAVAIL);
+	laddr.s_addr = *laddrp;
+	if (nam != NULL && laddr.s_addr != INADDR_ANY)
+		return (EINVAL);
+	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+		wild = INPLOOKUP_WILDCARD;
+	if (nam == NULL) {
+		if ((error = prison_local_ip4(cred, &laddr)) != 0)
+			return (error);
+	} else {
+		sin = (struct sockaddr_in *)nam;
+		if (nam->sa_len != sizeof (*sin))
+			return (EINVAL);
+#ifdef notdef
+		/*
+		 * We should check the family, but old programs
+		 * incorrectly fail to initialize it.
+		 */
+		if (sin->sin_family != AF_INET)
+			return (EAFNOSUPPORT);
+#endif
+		error = prison_local_ip4(cred, &sin->sin_addr);
+		if (error)
+			return (error);
+		if (sin->sin_port != *lportp) {
+			/* Don't allow the port to change. */
+			if (*lportp != 0)
+				return (EINVAL);
+			lport = sin->sin_port;
+		}
+		/* NB: lport is left as 0 if the port isn't being changed. */
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
+			/*
+			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
+			 * allow complete duplication of binding if
+			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
+			 * and a multicast address is bound on both
+			 * new and duplicated sockets.
+			 */
+			if (so->so_options & SO_REUSEADDR)
+				reuseport = SO_REUSEADDR|SO_REUSEPORT;
+		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
+			sin->sin_port = 0;		/* yech... */
+			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
+			/*
+			 * Is the address a local IP address?
+			 * If INP_BINDANY is set, then the socket may be bound
+			 * to any endpoint address, local or not.
+			 */
+			if ((inp->inp_flags & INP_BINDANY) == 0 &&
+			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
+				return (EADDRNOTAVAIL);
+		}
+		laddr = sin->sin_addr;
+		if (lport) {
+			struct inpcb *t;
+			struct tcptw *tw;
+
+			/* GROSS */
+			if (ntohs(lport) <= V_ipport_reservedhigh &&
+			    ntohs(lport) >= V_ipport_reservedlow &&
+			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
+			    0))
+				return (EACCES);
+			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
+			    priv_check_cred(inp->inp_cred,
+			    PRIV_NETINET_REUSEPORT, 0) != 0) {
+				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
+				    lport, INPLOOKUP_WILDCARD, cred);
+				/*
+				 * XXX
+				 * This entire block sorely needs a rewrite.
+				 */
+				if (t &&
+				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
+				    (so->so_type != SOCK_STREAM ||
+				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
+				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
+				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
+				     (t->inp_socket->so_options &
+				      SO_REUSEPORT) == 0) &&
+				    (inp->inp_cred->cr_uid !=
+				     t->inp_cred->cr_uid))
+					return (EADDRINUSE);
+			}
+			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
+			    lport, wild, cred);
+			if (t && (t->inp_flags & INP_TIMEWAIT)) {
+				/*
+				 * XXXRW: If an inpcb has had its timewait
+				 * state recycled, we treat the address as
+				 * being in use (for now). This is better
+				 * than a panic, but not desirable.
+				 */
+				tw = intotw(t);
+				if (tw == NULL ||
+				    (reuseport & tw->tw_so_options) == 0)
+					return (EADDRINUSE);
+			} else if (t &&
+			    (reuseport & t->inp_socket->so_options) == 0) {
+#ifdef INET6
+				if (ntohl(sin->sin_addr.s_addr) !=
+				    INADDR_ANY ||
+				    ntohl(t->inp_laddr.s_addr) !=
+				    INADDR_ANY ||
+				    INP_SOCKAF(so) ==
+				    INP_SOCKAF(t->inp_socket))
+#endif
+					return (EADDRINUSE);
+			}
+		}
+	}
+	if (*lportp != 0)
+		lport = *lportp;
+	if (lport == 0) {
+		u_short first, last, aux;
+		int count;
+
+		if (inp->inp_flags & INP_HIGHPORT) {
+			first = V_ipport_hifirstauto;	/* sysctl */
+			last = V_ipport_hilastauto;
+			lastport = &pcbinfo->ipi_lasthi;
+		} else if (inp->inp_flags & INP_LOWPORT) {
+			error = priv_check_cred(cred,
+			    PRIV_NETINET_RESERVEDPORT, 0);
+			if (error)
+				return error;
+			first = V_ipport_lowfirstauto;	/* 1023 */
+			last = V_ipport_lowlastauto;	/* 600 */
+			lastport = &pcbinfo->ipi_lastlow;
+		} else {
+			first = V_ipport_firstauto;	/* sysctl */
+			last = V_ipport_lastauto;
+			lastport = &pcbinfo->ipi_lastport;
+		}
+		/*
+		 * For UDP, use random port allocation as long as the user
+		 * allows it.  For TCP (and as of yet unknown) connections,
+		 * use random port allocation only if the user allows it AND
+		 * ipport_tick() allows it.
+		 */
+		if (V_ipport_randomized &&
+		    (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
+			dorandom = 1;
+		else
+			dorandom = 0;
+		/*
+		 * It makes no sense to do random port allocation if
+		 * we have the only port available.
+		 */
+		if (first == last)
+			dorandom = 0;
+		/* Make sure to not include UDP packets in the count. */
+		if (pcbinfo != &V_udbinfo)
+			V_ipport_tcpallocs++;
+		/*
+		 * Instead of having two loops further down counting up or down
+		 * make sure that first is always <= last and go with only one
+		 * code path implementing all logic.
+		 */
+		if (first > last) {
+			aux = first;
+			first = last;
+			last = aux;
+		}
+
+		if (dorandom)
+			*lastport = first +
+			    (arc4random() % (last - first));
+
+		count = last - first;
+
+		do {
+			if (count-- < 0)	/* completely used? */
+				return (EADDRNOTAVAIL);
+			++*lastport;
+			if (*lastport < first || *lastport > last)
+				*lastport = first;
+			lport = htons(*lastport);
+		} while (in_pcblookup_local(pcbinfo, laddr,
+		    lport, wild, cred));
+	}
+	*laddrp = laddr.s_addr;
+	*lportp = lport;
+	return (0);
+}
+
+/*
+ * Connect from a socket to a specified address.
+ * Both address and port must be specified in argument sin.
+ * If we don't have a local address for this socket yet,
+ * then pick one.
+ */
+int
+in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
+{
+	u_short lport, fport;
+	in_addr_t laddr, faddr;
+	int anonport, error;
+
+	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	lport = inp->inp_lport;
+	laddr = inp->inp_laddr.s_addr;
+	anonport = (lport == 0);
+	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
+	    NULL, cred);
+	if (error)
+		return (error);
+
+	/* Do the initial binding of the local address if required. */
+	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
+		inp->inp_lport = lport;
+		inp->inp_laddr.s_addr = laddr;
+		if (in_pcbinshash(inp) != 0) {
+			inp->inp_laddr.s_addr = INADDR_ANY;
+			inp->inp_lport = 0;
+			return (EAGAIN);
+		}
+	}
+
+	/* Commit the remaining changes.
*/ + inp->inp_lport = lport; + inp->inp_laddr.s_addr = laddr; + inp->inp_faddr.s_addr = faddr; + inp->inp_fport = fport; + in_pcbrehash(inp); + + if (anonport) + inp->inp_flags |= INP_ANONPORT; + return (0); +} + +/* + * Do proper source address selection on an unbound socket in case + * of connect. Take jails into account as well. + */ +static int +in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, + struct ucred *cred) +{ + struct ifaddr *ifa; + struct sockaddr *sa; + struct sockaddr_in *sin; + struct route sro; + int error; + + KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); + + /* + * Bypass source address selection and use the primary jail IP + * if requested. + */ + if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) + return (0); + + error = 0; + bzero(&sro, sizeof(sro)); + + sin = (struct sockaddr_in *)&sro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_addr.s_addr = faddr->s_addr; + + /* + * If route is known our src addr is taken from the i/f, + * else punt. + * + * Find out route to destination. + */ + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) + in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); + + /* + * If we found a route, use the address corresponding to + * the outgoing interface. + * + * Otherwise assume faddr is reachable on a directly connected + * network and try to find a corresponding interface to take + * the source address from. + */ + if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { + struct in_ifaddr *ia; + struct ifnet *ifp; + + ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); + if (ia == NULL) + ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0)); + if (ia == NULL) { + error = ENETUNREACH; + goto done; + } + + if (cred == NULL || !prison_flag(cred, PR_IP4)) { + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + ifa_free(&ia->ia_ifa); + goto done; + } + + ifp = ia->ia_ifp; + ifa_free(&ia->ia_ifa); + ia = NULL; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + + sa = ifa->ifa_addr; + if (sa->sa_family != AF_INET) + continue; + sin = (struct sockaddr_in *)sa; + if (prison_check_ip4(cred, &sin->sin_addr) == 0) { + ia = (struct in_ifaddr *)ifa; + break; + } + } + if (ia != NULL) { + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + IF_ADDR_UNLOCK(ifp); + goto done; + } + IF_ADDR_UNLOCK(ifp); + + /* 3. As a last resort return the 'default' jail address. */ + error = prison_get_ip4(cred, laddr); + goto done; + } + + /* + * If the outgoing interface on the route found is not + * a loopback interface, use the address from that interface. + * In case of jails do those three steps: + * 1. check if the interface address belongs to the jail. If so use it. + * 2. check if we have any address on the outgoing interface + * belonging to this jail. If so use it. + * 3. as a last resort return the 'default' jail address. + */ + if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { + struct in_ifaddr *ia; + struct ifnet *ifp; + + /* If not jailed, use the default returned. */ + if (cred == NULL || !prison_flag(cred, PR_IP4)) { + ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + goto done; + } + + /* Jailed. */ + /* 1. Check if the iface address belongs to the jail. */ + sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; + if (prison_check_ip4(cred, &sin->sin_addr) == 0) { + ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + goto done; + } + + /* + * 2. 
Check if we have any address on the outgoing interface + * belonging to this jail. + */ + ia = NULL; + ifp = sro.ro_rt->rt_ifp; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + sa = ifa->ifa_addr; + if (sa->sa_family != AF_INET) + continue; + sin = (struct sockaddr_in *)sa; + if (prison_check_ip4(cred, &sin->sin_addr) == 0) { + ia = (struct in_ifaddr *)ifa; + break; + } + } + if (ia != NULL) { + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + IF_ADDR_UNLOCK(ifp); + goto done; + } + IF_ADDR_UNLOCK(ifp); + + /* 3. As a last resort return the 'default' jail address. */ + error = prison_get_ip4(cred, laddr); + goto done; + } + + /* + * The outgoing interface is marked with 'loopback net', so a route + * to ourselves is here. + * Try to find the interface of the destination address and then + * take the address from there. That interface is not necessarily + * a loopback interface. + * In case of jails, check that it is an address of the jail + * and if we cannot find, fall back to the 'default' jail address. + */ + if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { + struct sockaddr_in sain; + struct in_ifaddr *ia; + + bzero(&sain, sizeof(struct sockaddr_in)); + sain.sin_family = AF_INET; + sain.sin_len = sizeof(struct sockaddr_in); + sain.sin_addr.s_addr = faddr->s_addr; + + ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); + if (ia == NULL) + ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0)); + if (ia == NULL) + ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); + + if (cred == NULL || !prison_flag(cred, PR_IP4)) { + if (ia == NULL) { + error = ENETUNREACH; + goto done; + } + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + ifa_free(&ia->ia_ifa); + goto done; + } + + /* Jailed. */ + if (ia != NULL) { + struct ifnet *ifp; + + ifp = ia->ia_ifp; + ifa_free(&ia->ia_ifa); + ia = NULL; + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + + sa = ifa->ifa_addr; + if (sa->sa_family != AF_INET) + continue; + sin = (struct sockaddr_in *)sa; + if (prison_check_ip4(cred, + &sin->sin_addr) == 0) { + ia = (struct in_ifaddr *)ifa; + break; + } + } + if (ia != NULL) { + laddr->s_addr = ia->ia_addr.sin_addr.s_addr; + IF_ADDR_UNLOCK(ifp); + goto done; + } + IF_ADDR_UNLOCK(ifp); + } + + /* 3. As a last resort return the 'default' jail address. */ + error = prison_get_ip4(cred, laddr); + goto done; + } + +done: + if (sro.ro_rt != NULL) + RTFREE(sro.ro_rt); + return (error); +} + +/* + * Set up for a connect from a socket to the specified address. + * On entry, *laddrp and *lportp should contain the current local + * address and port for the PCB; these are updated to the values + * that should be placed in inp_laddr and inp_lport to complete + * the connect. + * + * On success, *faddrp and *fportp will be set to the remote address + * and port. These are not updated in the error case. + * + * If the operation fails because the connection already exists, + * *oinpp will be set to the PCB of that connection so that the + * caller can decide to override it. In all other cases, *oinpp + * is set to NULL. + */ +int +in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, + in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, + struct inpcb **oinpp, struct ucred *cred) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)nam; + struct in_ifaddr *ia; + struct inpcb *oinp; + struct in_addr laddr, faddr; + u_short lport, fport; + int error; + + /* + * Because a global state change doesn't actually occur here, a read + * lock is sufficient. 
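+	 *
+	 * As with in_pcbbind_setup(), this routine only computes the
+	 * 4-tuple; in_pcbconnect() above commits the result and calls
+	 * in_pcbinshash() or in_pcbrehash() as appropriate.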
+	 */
+	INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
+	INP_LOCK_ASSERT(inp);
+
+	if (oinpp != NULL)
+		*oinpp = NULL;
+	if (nam->sa_len != sizeof (*sin))
+		return (EINVAL);
+	if (sin->sin_family != AF_INET)
+		return (EAFNOSUPPORT);
+	if (sin->sin_port == 0)
+		return (EADDRNOTAVAIL);
+	laddr.s_addr = *laddrp;
+	lport = *lportp;
+	faddr = sin->sin_addr;
+	fport = sin->sin_port;
+
+	if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
+		/*
+		 * If the destination address is INADDR_ANY,
+		 * use the primary local address.
+		 * If the supplied address is INADDR_BROADCAST,
+		 * and the primary interface supports broadcast,
+		 * choose the broadcast address for that interface.
+		 */
+		if (faddr.s_addr == INADDR_ANY) {
+			IN_IFADDR_RLOCK();
+			faddr =
+			    IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
+			IN_IFADDR_RUNLOCK();
+			if (cred != NULL &&
+			    (error = prison_get_ip4(cred, &faddr)) != 0)
+				return (error);
+		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
+			IN_IFADDR_RLOCK();
+			if (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
+			    IFF_BROADCAST)
+				faddr = satosin(&TAILQ_FIRST(
+				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
+			IN_IFADDR_RUNLOCK();
+		}
+	}
+	if (laddr.s_addr == INADDR_ANY) {
+		error = in_pcbladdr(inp, &faddr, &laddr, cred);
+		/*
+		 * If the destination address is multicast and an outgoing
+		 * interface has been set as a multicast option, prefer the
+		 * address of that interface as our source address.
+		 */
+		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
+		    inp->inp_moptions != NULL) {
+			struct ip_moptions *imo;
+			struct ifnet *ifp;
+
+			imo = inp->inp_moptions;
+			if (imo->imo_multicast_ifp != NULL) {
+				ifp = imo->imo_multicast_ifp;
+				IN_IFADDR_RLOCK();
+				TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
+					if (ia->ia_ifp == ifp)
+						break;
+				if (ia == NULL) {
+					IN_IFADDR_RUNLOCK();
+					error = EADDRNOTAVAIL;
+				} else {
+					laddr = ia->ia_addr.sin_addr;
+					IN_IFADDR_RUNLOCK();
+					error = 0;
+				}
+			}
+		}
+		if (error)
+			return (error);
+	}
+	oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
+	    0, NULL);
+	if (oinp != NULL) {
+		if (oinpp != NULL)
+			*oinpp = oinp;
+		return (EADDRINUSE);
+	}
+	if (lport == 0) {
+		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
+		    cred);
+		if (error)
+			return (error);
+	}
+	*laddrp = laddr.s_addr;
+	*lportp = lport;
+	*faddrp = faddr.s_addr;
+	*fportp = fport;
+	return (0);
+}
+
+void
+in_pcbdisconnect(struct inpcb *inp)
+{
+
+	INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
+	INP_WLOCK_ASSERT(inp);
+
+	inp->inp_faddr.s_addr = INADDR_ANY;
+	inp->inp_fport = 0;
+	in_pcbrehash(inp);
+}
+
+/*
+ * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
+ * For most protocols, this will be invoked immediately prior to calling
+ * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
+ * socket, in which case in_pcbfree() is deferred.
+ */
+void
+in_pcbdetach(struct inpcb *inp)
+{
+
+	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
+
+	inp->inp_socket->so_pcb = NULL;
+	inp->inp_socket = NULL;
+}
+
+/*
+ * in_pcbfree_internal() frees an inpcb that has been detached from its
+ * socket, and whose reference count has reached 0.  It will also remove the
+ * inpcb from any global lists it might remain on.
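+ *
+ * The usual teardown order is therefore (sketch):
+ *
+ *	in_pcbdetach(inp);	sever the socket <-> inpcb linkage
+ *	in_pcbfree(inp);	drop the inpcbinfo reference; the pcb is
+ *				freed now, or later via in_pcbrele() if
+ *				another thread still holds a reference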
+ */ +static void +in_pcbfree_internal(struct inpcb *inp) +{ + struct inpcbinfo *ipi = inp->inp_pcbinfo; + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); + KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__)); + + INP_INFO_WLOCK_ASSERT(ipi); + INP_WLOCK_ASSERT(inp); + +#ifdef IPSEC + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif /* IPSEC */ + inp->inp_gencnt = ++ipi->ipi_gencnt; + in_pcbremlists(inp); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + if (inp->in6p_moptions != NULL) + ip6_freemoptions(inp->in6p_moptions); + } +#endif + if (inp->inp_options) + (void)m_free(inp->inp_options); + if (inp->inp_moptions != NULL) + inp_freemoptions(inp->inp_moptions); + inp->inp_vflag = 0; + crfree(inp->inp_cred); + +#ifdef MAC + mac_inpcb_destroy(inp); +#endif + INP_WUNLOCK(inp); + uma_zfree(ipi->ipi_zone, inp); +} + +/* + * in_pcbref() bumps the reference count on an inpcb in order to maintain + * stability of an inpcb pointer despite the inpcb lock being released. This + * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, + * but where the inpcb lock is already held. + * + * While the inpcb will not be freed, releasing the inpcb lock means that the + * connection's state may change, so the caller should be careful to + * revalidate any cached state on reacquiring the lock. Drop the reference + * using in_pcbrele(). + */ +void +in_pcbref(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + inp->inp_refcount++; +} + +/* + * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to + * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we + * return a flag indicating whether or not the inpcb remains valid. If it is + * valid, we return with the inpcb lock held. + */ +int +in_pcbrele(struct inpcb *inp) +{ +#ifdef INVARIANTS + struct inpcbinfo *ipi = inp->inp_pcbinfo; +#endif + + KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + + INP_INFO_WLOCK_ASSERT(ipi); + INP_WLOCK_ASSERT(inp); + + inp->inp_refcount--; + if (inp->inp_refcount > 0) + return (0); + in_pcbfree_internal(inp); + return (1); +} + +/* + * Unconditionally schedule an inpcb to be freed by decrementing its + * reference count, which should occur only after the inpcb has been detached + * from its socket. If another thread holds a temporary reference (acquired + * using in_pcbref()) then the free is deferred until that reference is + * released using in_pcbrele(), but the inpcb is still unlocked. + */ +void +in_pcbfree(struct inpcb *inp) +{ +#ifdef INVARIANTS + struct inpcbinfo *ipi = inp->inp_pcbinfo; +#endif + + KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", + __func__)); + + INP_INFO_WLOCK_ASSERT(ipi); + INP_WLOCK_ASSERT(inp); + + if (!in_pcbrele(inp)) + INP_WUNLOCK(inp); +} + +/* + * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and + * port reservation, and preventing it from being returned by inpcb lookups. + * + * It is used by TCP to mark an inpcb as unused and avoid future packet + * delivery or event notification when a socket remains open but TCP has + * closed. This might occur as a result of a shutdown()-initiated TCP close + * or a RST on the wire, and allows the port binding to be reused while still + * maintaining the invariant that so_pcb always points to a valid inpcb until + * in_pcbdetach(). 
+ * + * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash + * lists, but can lead to confusing netstat output, as open sockets with + * closed TCP connections will no longer appear to have their bound port + * number. An explicit flag would be better, as it would allow us to leave + * the port number intact after the connection is dropped. + * + * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by + * in_pcbnotifyall() and in_pcbpurgeif0()? + */ +void +in_pcbdrop(struct inpcb *inp) +{ + + INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + INP_WLOCK_ASSERT(inp); + + inp->inp_flags |= INP_DROPPED; + if (inp->inp_flags & INP_INHASHLIST) { + struct inpcbport *phd = inp->inp_phd; + + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_portlist); + if (LIST_FIRST(&phd->phd_pcblist) == NULL) { + LIST_REMOVE(phd, phd_hash); + free(phd, M_PCB); + } + inp->inp_flags &= ~INP_INHASHLIST; + } +} + +/* + * Common routines to return the socket addresses associated with inpcbs. + */ +struct sockaddr * +in_sockaddr(in_port_t port, struct in_addr *addr_p) +{ + struct sockaddr_in *sin; + + sin = malloc(sizeof *sin, M_SONAME, + M_WAITOK | M_ZERO); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = *addr_p; + sin->sin_port = port; + + return (struct sockaddr *)sin; +} + +int +in_getsockaddr(struct socket *so, struct sockaddr **nam) +{ + struct inpcb *inp; + struct in_addr addr; + in_port_t port; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); + + INP_RLOCK(inp); + port = inp->inp_lport; + addr = inp->inp_laddr; + INP_RUNLOCK(inp); + + *nam = in_sockaddr(port, &addr); + return 0; +} + +int +in_getpeeraddr(struct socket *so, struct sockaddr **nam) +{ + struct inpcb *inp; + struct in_addr addr; + in_port_t port; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); + + INP_RLOCK(inp); + port = inp->inp_fport; + addr = inp->inp_faddr; + INP_RUNLOCK(inp); + + *nam = in_sockaddr(port, &addr); + return 0; +} + +void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, + struct inpcb *(*notify)(struct inpcb *, int)) +{ + struct inpcb *inp, *inp_temp; + + INP_INFO_WLOCK(pcbinfo); + LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { + INP_WLOCK(inp); +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) { + INP_WUNLOCK(inp); + continue; + } +#endif + if (inp->inp_faddr.s_addr != faddr.s_addr || + inp->inp_socket == NULL) { + INP_WUNLOCK(inp); + continue; + } + if ((*notify)(inp, errno)) + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(pcbinfo); +} + +void +in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) +{ + struct inpcb *inp; + struct ip_moptions *imo; + int i, gap; + + INP_INFO_RLOCK(pcbinfo); + LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { + INP_WLOCK(inp); + imo = inp->inp_moptions; + if ((inp->inp_vflag & INP_IPV4) && + imo != NULL) { + /* + * Unselect the outgoing interface if it is being + * detached. + */ + if (imo->imo_multicast_ifp == ifp) + imo->imo_multicast_ifp = NULL; + + /* + * Drop multicast group membership if we joined + * through the interface being detached. 
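+			 *
+			 * The loop compacts imo_membership[] in place;
+			 * e.g. (illustrative) with memberships {a, b, c}
+			 * where b was joined through the dying ifp, b is
+			 * deleted and c shifts left, leaving {a, c}.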
+ */ + for (i = 0, gap = 0; i < imo->imo_num_memberships; + i++) { + if (imo->imo_membership[i]->inm_ifp == ifp) { + in_delmulti(imo->imo_membership[i]); + gap++; + } else if (gap != 0) + imo->imo_membership[i - gap] = + imo->imo_membership[i]; + } + imo->imo_num_memberships -= gap; + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(pcbinfo); +} + +/* + * Lookup a PCB based on the local address and port. + */ +#define INP_LOOKUP_MAPPED_PCB_COST 3 +struct inpcb * +in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, + u_short lport, int wild_okay, struct ucred *cred) +{ + struct inpcb *inp; +#ifdef INET6 + int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; +#else + int matchwild = 3; +#endif + int wildcard; + + INP_INFO_LOCK_ASSERT(pcbinfo); + + if (!wild_okay) { + struct inpcbhead *head; + /* + * Look for an unconnected (wildcard foreign addr) PCB that + * matches the local address and port we're looking for. + */ + head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_lport == lport) { + /* + * Found? + */ + if (cred == NULL || + prison_equal_ip4(cred->cr_prison, + inp->inp_cred->cr_prison)) + return (inp); + } + } + /* + * Not found. + */ + return (NULL); + } else { + struct inpcbporthead *porthash; + struct inpcbport *phd; + struct inpcb *match = NULL; + /* + * Best fit PCB lookup. + * + * First see if this local port is in use by looking on the + * port hash list. + */ + porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, + pcbinfo->ipi_porthashmask)]; + LIST_FOREACH(phd, porthash, phd_hash) { + if (phd->phd_port == lport) + break; + } + if (phd != NULL) { + /* + * Port is in use by one or more PCBs. Look for best + * fit. + */ + LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { + wildcard = 0; + if (cred != NULL && + !prison_equal_ip4(inp->inp_cred->cr_prison, + cred->cr_prison)) + continue; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; + /* + * We never select the PCB that has + * INP_IPV6 flag and is bound to :: if + * we have another PCB which is bound + * to 0.0.0.0. If a PCB has the + * INP_IPV6 flag, then we set its cost + * higher than IPv4 only PCBs. + * + * Note that the case only happens + * when a socket is bound to ::, under + * the condition that the use of the + * mapped address is allowed. + */ + if ((inp->inp_vflag & INP_IPV6) != 0) + wildcard += INP_LOOKUP_MAPPED_PCB_COST; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY) + wildcard++; + if (inp->inp_laddr.s_addr != INADDR_ANY) { + if (laddr.s_addr == INADDR_ANY) + wildcard++; + else if (inp->inp_laddr.s_addr != laddr.s_addr) + continue; + } else { + if (laddr.s_addr != INADDR_ANY) + wildcard++; + } + if (wildcard < matchwild) { + match = inp; + matchwild = wildcard; + if (matchwild == 0) + break; + } + } + } + return (match); + } +} +#undef INP_LOOKUP_MAPPED_PCB_COST + +/* + * Lookup PCB in hash list. + */ +struct inpcb * +in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, + u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, + struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp, *tmpinp; + u_short fport = fport_arg, lport = lport_arg; + + INP_INFO_LOCK_ASSERT(pcbinfo); + + /* + * First look for an exact match. 
+ */ + tmpinp = NULL; + head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbinfo->ipi_hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + /* + * XXX We should be able to directly return + * the inp here, without any checks. + * Well unless both bound with SO_REUSEPORT? + */ + if (prison_flag(inp->inp_cred, PR_IP4)) + return (inp); + if (tmpinp == NULL) + tmpinp = inp; + } + } + if (tmpinp != NULL) + return (tmpinp); + + /* + * Then look for a wildcard match, if requested. + */ + if (wildcard == INPLOOKUP_WILDCARD) { + struct inpcb *local_wild = NULL, *local_exact = NULL; +#ifdef INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + struct inpcb *jail_wild = NULL; + int injail; + + /* + * Order of socket selection - we always prefer jails. + * 1. jailed, non-wild. + * 2. jailed, wild. + * 3. non-jailed, non-wild. + * 4. non-jailed, wild. + */ + + head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, + 0, pcbinfo->ipi_hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr != INADDR_ANY || + inp->inp_lport != lport) + continue; + + /* XXX inp locking */ + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; + + injail = prison_flag(inp->inp_cred, PR_IP4); + if (injail) { + if (prison_check_ip4(inp->inp_cred, + &laddr) != 0) + continue; + } else { + if (local_exact != NULL) + continue; + } + + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if (injail) + return (inp); + else + local_exact = inp; + } else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#ifdef INET6 + /* XXX inp locking, NULL check */ + if (inp->inp_vflag & INP_IPV6PROTO) + local_wild_mapped = inp; + else +#endif /* INET6 */ + if (injail) + jail_wild = inp; + else + local_wild = inp; + } + } /* LIST_FOREACH */ + if (jail_wild != NULL) + return (jail_wild); + if (local_exact != NULL) + return (local_exact); + if (local_wild != NULL) + return (local_wild); +#ifdef INET6 + if (local_wild_mapped != NULL) + return (local_wild_mapped); +#endif /* defined(INET6) */ + } /* if (wildcard == INPLOOKUP_WILDCARD) */ + + return (NULL); +} + +/* + * Insert PCB onto various hash lists. + */ +int +in_pcbinshash(struct inpcb *inp) +{ + struct inpcbhead *pcbhash; + struct inpcbporthead *pcbporthash; + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbport *phd; + u_int32_t hashkey_faddr; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, + ("in_pcbinshash: INP_INHASHLIST")); + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; + + pcbporthash = &pcbinfo->ipi_porthashbase[ + INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; + + /* + * Go through port list and look for a head for this lport. + */ + LIST_FOREACH(phd, pcbporthash, phd_hash) { + if (phd->phd_port == inp->inp_lport) + break; + } + /* + * If none exists, malloc one and tack it on. 
+ */ + if (phd == NULL) { + phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); + if (phd == NULL) { + return (ENOBUFS); /* XXX */ + } + phd->phd_port = inp->inp_lport; + LIST_INIT(&phd->phd_pcblist); + LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); + } + inp->inp_phd = phd; + LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); + LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + inp->inp_flags |= INP_INHASHLIST; + return (0); +} + +/* + * Move PCB to the proper hash bucket when { faddr, fport } have been + * changed. NOTE: This does not handle the case of the lport changing (the + * hashed port list would have to be updated as well), so the lport must + * not change after in_pcbinshash() has been called. + */ +void +in_pcbrehash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbhead *head; + u_int32_t hashkey_faddr; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + KASSERT(inp->inp_flags & INP_INHASHLIST, + ("in_pcbrehash: !INP_INHASHLIST")); + +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; + else +#endif /* INET6 */ + hashkey_faddr = inp->inp_faddr.s_addr; + + head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, + inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; + + LIST_REMOVE(inp, inp_hash); + LIST_INSERT_HEAD(head, inp, inp_hash); +} + +/* + * Remove PCB from various lists. + */ +static void +in_pcbremlists(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + if (inp->inp_flags & INP_INHASHLIST) { + struct inpcbport *phd = inp->inp_phd; + + LIST_REMOVE(inp, inp_hash); + LIST_REMOVE(inp, inp_portlist); + if (LIST_FIRST(&phd->phd_pcblist) == NULL) { + LIST_REMOVE(phd, phd_hash); + free(phd, M_PCB); + } + inp->inp_flags &= ~INP_INHASHLIST; + } + LIST_REMOVE(inp, inp_list); + pcbinfo->ipi_count--; +} + +/* + * A set label operation has occurred at the socket layer, propagate the + * label change into the in_pcb for the socket. + */ +void +in_pcbsosetlabel(struct socket *so) +{ +#ifdef MAC + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); + + INP_WLOCK(inp); + SOCK_LOCK(so); + mac_inpcb_sosetlabel(so, inp); + SOCK_UNLOCK(so); + INP_WUNLOCK(inp); +#endif +} + +/* + * ipport_tick runs once per second, determining if random port allocation + * should be continued. If more than ipport_randomcps ports have been + * allocated in the last second, then we return to sequential port + * allocation. We return to random allocation only once we drop below + * ipport_randomcps for at least ipport_randomtime seconds. 
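+ *
+ * Worked example with the defaults above (randomcps 10, randomtime 45):
+ * a second in which more than 10 TCP ports are allocated sets
+ * ipport_stoprandom to 45; every subsequent second at or below the
+ * threshold decrements it, so random allocation resumes only after 45
+ * consecutive quiet seconds.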
+ */ +void +ipport_tick(void *xtp) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ + if (V_ipport_tcpallocs <= + V_ipport_tcplastcount + V_ipport_randomcps) { + if (V_ipport_stoprandom > 0) + V_ipport_stoprandom--; + } else + V_ipport_stoprandom = V_ipport_randomtime; + V_ipport_tcplastcount = V_ipport_tcpallocs; + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); + callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); +} + +void +inp_wlock(struct inpcb *inp) +{ + + INP_WLOCK(inp); +} + +void +inp_wunlock(struct inpcb *inp) +{ + + INP_WUNLOCK(inp); +} + +void +inp_rlock(struct inpcb *inp) +{ + + INP_RLOCK(inp); +} + +void +inp_runlock(struct inpcb *inp) +{ + + INP_RUNLOCK(inp); +} + +#ifdef INVARIANTS +void +inp_lock_assert(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); +} + +void +inp_unlock_assert(struct inpcb *inp) +{ + + INP_UNLOCK_ASSERT(inp); +} +#endif + +void +inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) +{ + struct inpcb *inp; + + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { + INP_WLOCK(inp); + func(inp, arg); + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_tcbinfo); +} + +struct socket * +inp_inpcbtosocket(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); + return (inp->inp_socket); +} + +struct tcpcb * +inp_inpcbtotcpcb(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); + return ((struct tcpcb *)inp->inp_ppcb); +} + +int +inp_ip_tos_get(const struct inpcb *inp) +{ + + return (inp->inp_ip_tos); +} + +void +inp_ip_tos_set(struct inpcb *inp, int val) +{ + + inp->inp_ip_tos = val; +} + +void +inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, + uint32_t *faddr, uint16_t *fp) +{ + + INP_LOCK_ASSERT(inp); + *laddr = inp->inp_laddr.s_addr; + *faddr = inp->inp_faddr.s_addr; + *lp = inp->inp_lport; + *fp = inp->inp_fport; +} + +struct inpcb * +so_sotoinpcb(struct socket *so) +{ + + return (sotoinpcb(so)); +} + +struct tcpcb * +so_sototcpcb(struct socket *so) +{ + + return (sototcpcb(so)); +} + +#ifdef DDB +static void +db_print_indent(int indent) +{ + int i; + + for (i = 0; i < indent; i++) + db_printf(" "); +} + +static void +db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) +{ + char faddr_str[48], laddr_str[48]; + + db_print_indent(indent); + db_printf("%s at %p\n", name, inc); + + indent += 2; + +#ifdef INET6 + if (inc->inc_flags & INC_ISIPV6) { + /* IPv6. */ + ip6_sprintf(laddr_str, &inc->inc6_laddr); + ip6_sprintf(faddr_str, &inc->inc6_faddr); + } else { +#endif + /* IPv4. */ + inet_ntoa_r(inc->inc_laddr, laddr_str); + inet_ntoa_r(inc->inc_faddr, faddr_str); +#ifdef INET6 + } +#endif + db_print_indent(indent); + db_printf("inc_laddr %s inc_lport %u\n", laddr_str, + ntohs(inc->inc_lport)); + db_print_indent(indent); + db_printf("inc_faddr %s inc_fport %u\n", faddr_str, + ntohs(inc->inc_fport)); +} + +static void +db_print_inpflags(int inp_flags) +{ + int comma; + + comma = 0; + if (inp_flags & INP_RECVOPTS) { + db_printf("%sINP_RECVOPTS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_RECVRETOPTS) { + db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_RECVDSTADDR) { + db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_HDRINCL) { + db_printf("%sINP_HDRINCL", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_HIGHPORT) { + db_printf("%sINP_HIGHPORT", comma ? 
", " : ""); + comma = 1; + } + if (inp_flags & INP_LOWPORT) { + db_printf("%sINP_LOWPORT", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_ANONPORT) { + db_printf("%sINP_ANONPORT", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_RECVIF) { + db_printf("%sINP_RECVIF", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_MTUDISC) { + db_printf("%sINP_MTUDISC", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_FAITH) { + db_printf("%sINP_FAITH", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_RECVTTL) { + db_printf("%sINP_RECVTTL", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_DONTFRAG) { + db_printf("%sINP_DONTFRAG", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_IPV6_V6ONLY) { + db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_PKTINFO) { + db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_HOPLIMIT) { + db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_HOPOPTS) { + db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_DSTOPTS) { + db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_RTHDR) { + db_printf("%sIN6P_RTHDR", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_RTHDRDSTOPTS) { + db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_TCLASS) { + db_printf("%sIN6P_TCLASS", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_AUTOFLOWLABEL) { + db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_TIMEWAIT) { + db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_ONESBCAST) { + db_printf("%sINP_ONESBCAST", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_DROPPED) { + db_printf("%sINP_DROPPED", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & INP_SOCKREF) { + db_printf("%sINP_SOCKREF", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_RFC2292) { + db_printf("%sIN6P_RFC2292", comma ? ", " : ""); + comma = 1; + } + if (inp_flags & IN6P_MTU) { + db_printf("IN6P_MTU%s", comma ? ", " : ""); + comma = 1; + } +} + +static void +db_print_inpvflag(u_char inp_vflag) +{ + int comma; + + comma = 0; + if (inp_vflag & INP_IPV4) { + db_printf("%sINP_IPV4", comma ? ", " : ""); + comma = 1; + } + if (inp_vflag & INP_IPV6) { + db_printf("%sINP_IPV6", comma ? ", " : ""); + comma = 1; + } + if (inp_vflag & INP_IPV6PROTO) { + db_printf("%sINP_IPV6PROTO", comma ? 
", " : ""); + comma = 1; + } +} + +static void +db_print_inpcb(struct inpcb *inp, const char *name, int indent) +{ + + db_print_indent(indent); + db_printf("%s at %p\n", name, inp); + + indent += 2; + + db_print_indent(indent); + db_printf("inp_flow: 0x%x\n", inp->inp_flow); + + db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); + + db_print_indent(indent); + db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", + inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); + + db_print_indent(indent); + db_printf("inp_label: %p inp_flags: 0x%x (", + inp->inp_label, inp->inp_flags); + db_print_inpflags(inp->inp_flags); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, + inp->inp_vflag); + db_print_inpvflag(inp->inp_vflag); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", + inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); + + db_print_indent(indent); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + db_printf("in6p_options: %p in6p_outputopts: %p " + "in6p_moptions: %p\n", inp->in6p_options, + inp->in6p_outputopts, inp->in6p_moptions); + db_printf("in6p_icmp6filt: %p in6p_cksum %d " + "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, + inp->in6p_hops); + } else +#endif + { + db_printf("inp_ip_tos: %d inp_ip_options: %p " + "inp_ip_moptions: %p\n", inp->inp_ip_tos, + inp->inp_options, inp->inp_moptions); + } + + db_print_indent(indent); + db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, + (uintmax_t)inp->inp_gencnt); +} + +DB_SHOW_COMMAND(inpcb, db_show_inpcb) +{ + struct inpcb *inp; + + if (!have_addr) { + db_printf("usage: show inpcb \n"); + return; + } + inp = (struct inpcb *)addr; + + db_print_inpcb(inp, "inpcb", 0); +} +#endif diff --git a/freebsd/sys/netinet/in_pcb.h b/freebsd/sys/netinet/in_pcb.h new file mode 100644 index 00000000..8cd4a5f8 --- /dev/null +++ b/freebsd/sys/netinet/in_pcb.h @@ -0,0 +1,525 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_pcb.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_PCB_HH_ +#define _NETINET_IN_PCB_HH_ + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#endif + +#define in6pcb inpcb /* for KAME src sync over BSD*'s */ +#define in6p_sp inp_sp /* for KAME src sync over BSD*'s */ +struct inpcbpolicy; + +/* + * struct inpcb is the common protocol control block structure used in most + * IP transport protocols. + * + * Pointers to local and foreign host table entries, local and foreign socket + * numbers, and pointers up (to a socket structure) and down (to a + * protocol-specific control block) are stored here. + */ +LIST_HEAD(inpcbhead, inpcb); +LIST_HEAD(inpcbporthead, inpcbport); +typedef u_quad_t inp_gen_t; + +/* + * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. + * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing + * the following structure. + */ +struct in_addr_4in6 { + u_int32_t ia46_pad32[3]; + struct in_addr ia46_addr4; +}; + +/* + * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has + * some extra padding to accomplish this. + */ +struct in_endpoints { + u_int16_t ie_fport; /* foreign port */ + u_int16_t ie_lport; /* local port */ + /* protocol dependent part, local and foreign addr */ + union { + /* foreign host table entry */ + struct in_addr_4in6 ie46_foreign; + struct in6_addr ie6_foreign; + } ie_dependfaddr; + union { + /* local host table entry */ + struct in_addr_4in6 ie46_local; + struct in6_addr ie6_local; + } ie_dependladdr; +}; +#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 +#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 +#define ie6_faddr ie_dependfaddr.ie6_foreign +#define ie6_laddr ie_dependladdr.ie6_local + +/* + * XXX The defines for inc_* are hacks and should be changed to direct + * references. + */ +struct in_conninfo { + u_int8_t inc_flags; + u_int8_t inc_len; + u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ + /* protocol dependent part */ + struct in_endpoints inc_ie; +}; + +/* + * Flags for inc_flags. + */ +#define INC_ISIPV6 0x01 + +#define inc_isipv6 inc_flags /* temp compatibility */ +#define inc_fport inc_ie.ie_fport +#define inc_lport inc_ie.ie_lport +#define inc_faddr inc_ie.ie_faddr +#define inc_laddr inc_ie.ie_laddr +#define inc6_faddr inc_ie.ie6_faddr +#define inc6_laddr inc_ie.ie6_laddr + +struct icmp6_filter; + +/*- + * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 + * and IPv6 sockets. In the case of TCP, further per-connection state is + * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb + * are static after creation or protected by a per-inpcb rwlock, inp_lock. A + * few fields also require the global pcbinfo lock for the inpcb to be held, + * when modified, such as the global connection lists and hashes, as well as + * binding information (which affects which hash a connection is on).
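+ * (In sketch form: a hash-affecting update takes the pcbinfo write lock
+ * first via INP_INFO_WLOCK(), then INP_WLOCK() on the inpcb, modifies
+ * the binding and calls in_pcbrehash(), and releases the locks in
+ * reverse order; in_pcbrehash() asserts exactly these two write locks.)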
This + * model means that connections can be looked up without holding the + * per-connection lock, which is important for performance when attempting to + * find the connection for a packet given its IP and port tuple. Writing to + * these fields requires that write locks be held on both the inpcb and + * global locks. + * + * Key: + * (c) - Constant after initialization + * (i) - Protected by the inpcb lock + * (p) - Protected by the pcbinfo lock for the inpcb + * (s) - Protected by another subsystem's locks + * (x) - Undefined locking + * + * A few other notes: + * + * When a read lock is held, stability of the field is guaranteed; to write + * to a field, a write lock must generally be held. + * + * netinet/netinet6-layer code should not assume that the inp_socket pointer + * is safe to dereference without inp_lock being held, even for protocols + * other than TCP (where the inpcb persists during TIMEWAIT even after the + * socket has been freed), or there may be close(2)-related races. + * + * The inp_vflag field is overloaded, and would otherwise ideally be (c). + */ +struct inpcb { + LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ + void *inp_ppcb; /* (i) pointer to per-protocol pcb */ + struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ + struct socket *inp_socket; /* (i) back pointer to socket */ + struct ucred *inp_cred; /* (c) cache of socket cred */ + u_int32_t inp_flow; /* (i) IPv6 flow information */ + int inp_flags; /* (i) generic IP/datagram flags */ + int inp_flags2; /* (i) generic IP/datagram flags #2*/ + u_char inp_vflag; /* (i) IP version flag (v4/v6) */ + u_char inp_ip_ttl; /* (i) time to live proto */ + u_char inp_ip_p; /* (c) protocol proto */ + u_char inp_ip_minttl; /* (i) minimum TTL or drop */ + uint32_t inp_flowid; /* (x) flow id / queue id */ + u_int inp_refcount; /* (i) refcount */ + void *inp_pspare[4]; /* (x) rtentry / general use */ + u_int inp_ispare[4]; /* general use */ + + /* Local and foreign ports, local and foreign addr. */ + struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ + + /* MAC and IPSEC policy information. */ + struct label *inp_label; /* (i) MAC label */ + struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ + + /* Protocol-dependent part; options.
*/ + struct { + u_char inp4_ip_tos; /* (i) type of service proto */ + struct mbuf *inp4_options; /* (i) IP options */ + struct ip_moptions *inp4_moptions; /* (i) IP mcast options */ + } inp_depend4; + struct { + /* (i) IP options */ + struct mbuf *inp6_options; + /* (i) IP6 options for outgoing packets */ + struct ip6_pktopts *inp6_outputopts; + /* (i) IP multicast options */ + struct ip6_moptions *inp6_moptions; + /* (i) ICMPv6 code type filter */ + struct icmp6_filter *inp6_icmp6filt; + /* (i) IPV6_CHECKSUM setsockopt */ + int inp6_cksum; + short inp6_hops; + } inp_depend6; + LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ + struct inpcbport *inp_phd; /* (i/p) head of this list */ +#define inp_zero_size offsetof(struct inpcb, inp_gencnt) + inp_gen_t inp_gencnt; /* (c) generation count */ + struct llentry *inp_lle; /* cached L2 information */ + struct rtentry *inp_rt; /* cached L3 information */ + struct rwlock inp_lock; +}; +#define inp_fport inp_inc.inc_fport +#define inp_lport inp_inc.inc_lport +#define inp_faddr inp_inc.inc_faddr +#define inp_laddr inp_inc.inc_laddr +#define inp_ip_tos inp_depend4.inp4_ip_tos +#define inp_options inp_depend4.inp4_options +#define inp_moptions inp_depend4.inp4_moptions + +#define in6p_faddr inp_inc.inc6_faddr +#define in6p_laddr inp_inc.inc6_laddr +#define in6p_hops inp_depend6.inp6_hops /* default hop limit */ +#define in6p_flowinfo inp_flow +#define in6p_options inp_depend6.inp6_options +#define in6p_outputopts inp_depend6.inp6_outputopts +#define in6p_moptions inp_depend6.inp6_moptions +#define in6p_icmp6filt inp_depend6.inp6_icmp6filt +#define in6p_cksum inp_depend6.inp6_cksum + +#define inp_vnet inp_pcbinfo->ipi_vnet + +/* + * The range of the generation count, as used in this implementation, is 9e19. + * We would have to create 300 billion connections per second for this number + * to roll over in a year. This seems sufficiently unlikely that we simply + * don't concern ourselves with that possibility. + */ + +/* + * Interface exported to userland by various protocols which use inpcbs. Hack + * alert -- only define if struct xsocket is in scope. + */ +#ifdef _SYS_SOCKETVAR_HH_ +struct xinpcb { + size_t xi_len; /* length of this structure */ + struct inpcb xi_inp; + struct xsocket xi_socket; + u_quad_t xi_alignment_hack; +}; + +struct xinpgen { + size_t xig_len; /* length of this structure */ + u_int xig_count; /* number of PCBs at this time */ + inp_gen_t xig_gen; /* generation count at this time */ + so_gen_t xig_sogen; /* socket generation count at this time */ +}; +#endif /* _SYS_SOCKETVAR_HH_ */ + +struct inpcbport { + LIST_ENTRY(inpcbport) phd_hash; + struct inpcbhead phd_pcblist; + u_short phd_port; +}; + +/* + * Global data structure for each high-level protocol (UDP, TCP, ...) in both + * IPv4 and IPv6. Holds inpcb lists and information for managing them. + */ +struct inpcbinfo { + /* + * Global list of inpcbs on the protocol. + */ + struct inpcbhead *ipi_listhead; + u_int ipi_count; + + /* + * Global hash of inpcbs, hashed by local and foreign addresses and + * port numbers. + */ + struct inpcbhead *ipi_hashbase; + u_long ipi_hashmask; + + /* + * Global hash of inpcbs, hashed by only local port number. + */ + struct inpcbporthead *ipi_porthashbase; + u_long ipi_porthashmask; + + /* + * Fields associated with port lookup and allocation. + */ + u_short ipi_lastport; + u_short ipi_lastlow; + u_short ipi_lasthi; + + /* + * UMA zone from which inpcbs are allocated for this protocol. 
+ */ + struct uma_zone *ipi_zone; + + /* + * Generation count--incremented each time a connection is allocated + * or freed. + */ + u_quad_t ipi_gencnt; + struct rwlock ipi_lock; + + /* + * Pointer to network stack instance + */ + struct vnet *ipi_vnet; + + /* + * general use 2 + */ + void *ipi_pspare[2]; +}; + +#define INP_LOCK_INIT(inp, d, t) \ + rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) +#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) +#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) +#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) +#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) +#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) +#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) +#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) +#define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) +#define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) +#define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) +#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) +#define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) +#define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) +#define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) + +#ifdef _KERNEL +/* + * These locking functions are for inpcb consumers outside of sys/netinet, + * more specifically, they were added for the benefit of TOE drivers. The + * macros are reserved for use by the stack. + */ +void inp_wlock(struct inpcb *); +void inp_wunlock(struct inpcb *); +void inp_rlock(struct inpcb *); +void inp_runlock(struct inpcb *); + +#ifdef INVARIANTS +void inp_lock_assert(struct inpcb *); +void inp_unlock_assert(struct inpcb *); +#else +static __inline void +inp_lock_assert(struct inpcb *inp __unused) +{ +} + +static __inline void +inp_unlock_assert(struct inpcb *inp __unused) +{ +} + +#endif + +void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg); +int inp_ip_tos_get(const struct inpcb *inp); +void inp_ip_tos_set(struct inpcb *inp, int val); +struct socket * + inp_inpcbtosocket(struct inpcb *inp); +struct tcpcb * + inp_inpcbtotcpcb(struct inpcb *inp); +void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, + uint32_t *faddr, uint16_t *fp); + +#endif /* _KERNEL */ + +#define INP_INFO_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) +#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) +#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) +#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) +#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) +#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) +#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) +#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) +#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) + +#define INP_PCBHASH(faddr, lport, fport, mask) \ + (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) +#define INP_PCBPORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) + +/* + * Flags for inp_vflags -- historically version flags only + */ +#define INP_IPV4 0x1 +#define INP_IPV6 0x2 +#define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ + +/* 
+ * Flags for inp_flags. + */ +#define INP_RECVOPTS 0x00000001 /* receive incoming IP options */ +#define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */ +#define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */ +#define INP_HDRINCL 0x00000008 /* user supplies entire IP header */ +#define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */ +#define INP_LOWPORT 0x00000020 /* user wants "low" port binding */ +#define INP_ANONPORT 0x00000040 /* port chosen for user */ +#define INP_RECVIF 0x00000080 /* receive incoming interface */ +#define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ +#define INP_FAITH 0x00000200 /* accept FAITH'ed connections */ +#define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ +#define INP_DONTFRAG 0x00000800 /* don't fragment packet */ +#define INP_BINDANY 0x00001000 /* allow bind to any address */ +#define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */ +#define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */ +#define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */ +#define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */ +#define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */ +#define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */ +#define IN6P_RTHDR 0x00100000 /* receive routing header */ +#define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ +#define IN6P_TCLASS 0x00400000 /* receive traffic class value */ +#define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ +#define INP_TIMEWAIT 0x01000000 /* in TIMEWAIT, ppcb is tcptw */ +#define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ +#define INP_DROPPED 0x04000000 /* protocol drop flag */ +#define INP_SOCKREF 0x08000000 /* strong socket reference */ +#define INP_SW_FLOWID 0x10000000 /* software generated flow id */ +#define INP_HW_FLOWID 0x20000000 /* hardware generated flow id */ +#define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ +#define IN6P_MTU 0x80000000 /* receive path MTU */ + +#define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ + INP_RECVIF|INP_RECVTTL|\ + IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ + IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ + IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ + IN6P_MTU) + +/* + * Flags for inp_flags2. 
+ */ +#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ +#define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ + +#define INPLOOKUP_WILDCARD 1 +#define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) +#define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ + +#define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family + +#define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) + +#ifdef _KERNEL +VNET_DECLARE(int, ipport_reservedhigh); +VNET_DECLARE(int, ipport_reservedlow); +VNET_DECLARE(int, ipport_lowfirstauto); +VNET_DECLARE(int, ipport_lowlastauto); +VNET_DECLARE(int, ipport_firstauto); +VNET_DECLARE(int, ipport_lastauto); +VNET_DECLARE(int, ipport_hifirstauto); +VNET_DECLARE(int, ipport_hilastauto); +VNET_DECLARE(int, ipport_randomized); +VNET_DECLARE(int, ipport_randomcps); +VNET_DECLARE(int, ipport_randomtime); +VNET_DECLARE(int, ipport_stoprandom); +VNET_DECLARE(int, ipport_tcpallocs); + +#define V_ipport_reservedhigh VNET(ipport_reservedhigh) +#define V_ipport_reservedlow VNET(ipport_reservedlow) +#define V_ipport_lowfirstauto VNET(ipport_lowfirstauto) +#define V_ipport_lowlastauto VNET(ipport_lowlastauto) +#define V_ipport_firstauto VNET(ipport_firstauto) +#define V_ipport_lastauto VNET(ipport_lastauto) +#define V_ipport_hifirstauto VNET(ipport_hifirstauto) +#define V_ipport_hilastauto VNET(ipport_hilastauto) +#define V_ipport_randomized VNET(ipport_randomized) +#define V_ipport_randomcps VNET(ipport_randomcps) +#define V_ipport_randomtime VNET(ipport_randomtime) +#define V_ipport_stoprandom VNET(ipport_stoprandom) +#define V_ipport_tcpallocs VNET(ipport_tcpallocs) + +extern struct callout ipport_tick_callout; + +void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); +int in_pcballoc(struct socket *, struct inpcbinfo *); +int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); +int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, + u_short *, struct ucred *); +int in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *); +int in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *, + u_short *, in_addr_t *, u_short *, struct inpcb **, + struct ucred *); +void in_pcbdetach(struct inpcb *); +void in_pcbdisconnect(struct inpcb *); +void in_pcbdrop(struct inpcb *); +void in_pcbfree(struct inpcb *); +int in_pcbinshash(struct inpcb *); +struct inpcb * + in_pcblookup_local(struct inpcbinfo *, + struct in_addr, u_short, int, struct ucred *); +struct inpcb * + in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, + struct in_addr, u_int, int, struct ifnet *); +#ifndef __rtems__ +void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, + int, struct inpcb *(*)(struct inpcb *, int)); +#else +void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, + struct inpcb *(*notify)(struct inpcb *, int)); +#endif +void in_pcbref(struct inpcb *); +void in_pcbrehash(struct inpcb *); +int in_pcbrele(struct inpcb *); +void in_pcbsetsolabel(struct socket *so); +int in_getpeeraddr(struct socket *so, struct sockaddr **nam); +int in_getsockaddr(struct socket *so, struct sockaddr **nam); +struct sockaddr * + in_sockaddr(in_port_t port, struct in_addr *addr); +void in_pcbsosetlabel(struct socket *so); +void ipport_tick(void *xtp); +#endif /* _KERNEL */ + +#endif /* !_NETINET_IN_PCB_HH_ */ diff --git a/freebsd/sys/netinet/in_proto.c b/freebsd/sys/netinet/in_proto.c new file mode 100644 index 00000000..9be0b626 --- /dev/null +++ b/freebsd/sys/netinet/in_proto.c @@ -0,0 +1,400 @@ +#include + +/*- + * 
Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_proto.c 8.2 (Berkeley) 2/9/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef RADIX_MPATH +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * TCP/IP protocol family: IP, ICMP, UDP, TCP. + */ + +static struct pr_usrreqs nousrreqs; + +#ifdef IPSEC +#include +#endif /* IPSEC */ + +#ifdef SCTP +#include +#include +#include +#include +#endif /* SCTP */ + +#ifdef DEV_PFSYNC +#include +#include +#endif + +extern struct domain inetdomain; + +/* Spacer for loadable protocols. 
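+ * (For illustration: at runtime the generic pf_proto_register() scans
+ * this table for a slot whose pr_protocol is PROTO_SPACER and copies
+ * the newly loaded protosw over it.)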
*/ +#define IPPROTOSPACER \ +{ \ + .pr_domain = &inetdomain, \ + .pr_protocol = PROTO_SPACER, \ + .pr_usrreqs = &nousrreqs \ +} + +struct protosw inetsw[] = { +{ + .pr_type = 0, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IP, + .pr_init = ip_init, +#ifdef VIMAGE + .pr_destroy = ip_destroy, +#endif + .pr_slowtimo = ip_slowtimo, + .pr_drain = ip_drain, + .pr_usrreqs = &nousrreqs +}, +{ + .pr_type = SOCK_DGRAM, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_UDP, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = udp_input, + .pr_ctlinput = udp_ctlinput, + .pr_ctloutput = udp_ctloutput, + .pr_init = udp_init, +#ifdef VIMAGE + .pr_destroy = udp_destroy, +#endif + .pr_usrreqs = &udp_usrreqs +}, +{ + .pr_type = SOCK_STREAM, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_TCP, + .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, + .pr_input = tcp_input, + .pr_ctlinput = tcp_ctlinput, + .pr_ctloutput = tcp_ctloutput, + .pr_init = tcp_init, +#ifdef VIMAGE + .pr_destroy = tcp_destroy, +#endif + .pr_slowtimo = tcp_slowtimo, + .pr_drain = tcp_drain, + .pr_usrreqs = &tcp_usrreqs +}, +#ifdef SCTP +{ + .pr_type = SOCK_DGRAM, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_SCTP, + .pr_flags = PR_WANTRCVD, + .pr_input = sctp_input, + .pr_ctlinput = sctp_ctlinput, + .pr_ctloutput = sctp_ctloutput, + .pr_init = sctp_init, +#ifdef VIMAGE + .pr_destroy = sctp_finish, +#endif + .pr_drain = sctp_drain, + .pr_usrreqs = &sctp_usrreqs +}, +{ + .pr_type = SOCK_SEQPACKET, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_SCTP, + .pr_flags = PR_WANTRCVD, + .pr_input = sctp_input, + .pr_ctlinput = sctp_ctlinput, + .pr_ctloutput = sctp_ctloutput, + .pr_drain = sctp_drain, + .pr_usrreqs = &sctp_usrreqs +}, + +{ + .pr_type = SOCK_STREAM, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_SCTP, + .pr_flags = PR_WANTRCVD, + .pr_input = sctp_input, + .pr_ctlinput = sctp_ctlinput, + .pr_ctloutput = sctp_ctloutput, + .pr_drain = sctp_drain, + .pr_usrreqs = &sctp_usrreqs +}, +#endif /* SCTP */ +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_RAW, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = rip_input, + .pr_ctlinput = rip_ctlinput, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_ICMP, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = icmp_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IGMP, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = igmp_input, + .pr_ctloutput = rip_ctloutput, + .pr_fasttimo = igmp_fasttimo, + .pr_slowtimo = igmp_slowtimo, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_RSVP, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = rsvp_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}, +#ifdef IPSEC +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_AH, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = ah4_input, + .pr_ctlinput = ah4_ctlinput, + .pr_usrreqs = &nousrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_ESP, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = esp4_input, + .pr_ctlinput = esp4_ctlinput, + .pr_usrreqs = &nousrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPCOMP, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = ipcomp4_input, 
+ .pr_usrreqs = &nousrreqs +}, +#endif /* IPSEC */ +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV4, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = encap_init, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_MOBILE, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = encap_init, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_ETHERIP, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = encap_init, + .pr_usrreqs = &rip_usrreqs +}, +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_GRE, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = encap_init, + .pr_usrreqs = &rip_usrreqs +}, +# ifdef INET6 +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_IPV6, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = encap_init, + .pr_usrreqs = &rip_usrreqs +}, +#endif +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_PIM, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = encap4_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}, +#ifdef DEV_PFSYNC +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_PFSYNC, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = pfsync_input, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}, +#endif /* DEV_PFSYNC */ +/* Spacer n-times for loadable protocols. 
*/ +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +IPPROTOSPACER, +/* raw wildcard */ +{ + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = rip_input, + .pr_ctloutput = rip_ctloutput, + .pr_init = rip_init, +#ifdef VIMAGE + .pr_destroy = rip_destroy, +#endif + .pr_usrreqs = &rip_usrreqs +}, +}; + +extern int in_inithead(void **, int); +extern int in_detachhead(void **, int); + +struct domain inetdomain = { + .dom_family = AF_INET, + .dom_name = "internet", + .dom_protosw = inetsw, + .dom_protoswNPROTOSW = &inetsw[sizeof(inetsw)/sizeof(inetsw[0])], +#ifdef RADIX_MPATH + .dom_rtattach = rn4_mpath_inithead, +#else + .dom_rtattach = in_inithead, +#endif +#ifdef VIMAGE + .dom_rtdetach = in_detachhead, +#endif + .dom_rtoffset = 32, + .dom_maxrtkey = sizeof(struct sockaddr_in), + .dom_ifattach = in_domifattach, + .dom_ifdetach = in_domifdetach +}; + +VNET_DOMAIN_SET(inet); + +SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW, 0, + "Internet Family"); + +SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW, 0, "IP"); +SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW, 0, "ICMP"); +SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); +SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); +#ifdef SCTP +SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW, 0, "SCTP"); +#endif +SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +#ifdef IPSEC +/* XXX no protocol # to use, pick something "reserved" */ +SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW, 0, "IPSEC"); +SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW, 0, "AH"); +SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW, 0, "ESP"); +SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW, 0, "IPCOMP"); +SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW, 0, "IPIP"); +#endif /* IPSEC */ +SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW, 0, "RAW"); +#ifdef DEV_PFSYNC +SYSCTL_NODE(_net_inet, IPPROTO_PFSYNC, pfsync, CTLFLAG_RW, 0, "PFSYNC"); +#endif diff --git a/freebsd/sys/netinet/in_rmx.c b/freebsd/sys/netinet/in_rmx.c new file mode 100644 index 00000000..25f99ea0 --- /dev/null +++ b/freebsd/sys/netinet/in_rmx.c @@ -0,0 +1,516 @@ +#include + +/*- + * Copyright 1994, 1995 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. 
BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This code does two things necessary for the enhanced TCP metrics to + * function in a useful manner: + * 1) It marks all non-host routes as `cloning', thus ensuring that + * every actual reference to such a route actually gets turned + * into a reference to a host route to the specific destination + * requested. + * 2) When such routes lose all their references, it arranges for them + * to be deleted in some random collection of circumstances, so that + * a large quantity of stale routing data is not kept in kernel memory + * indefinitely. See in_rtqtimo() below for the exact mechanism. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern int in_inithead(void **head, int off); +#ifdef VIMAGE +extern int in_detachhead(void **head, int off); +#endif + +#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ + +/* + * Do what we need to do when inserting a route. + */ +static struct radix_node * +in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, + struct radix_node *treenodes) +{ + struct rtentry *rt = (struct rtentry *)treenodes; + struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt); + + RADIX_NODE_HEAD_WLOCK_ASSERT(head); + /* + * A little bit of help for both IP output and input: + * For host routes, we make sure that RTF_BROADCAST + * is set for anything that looks like a broadcast address. + * This way, we can avoid an expensive call to in_broadcast() + * in ip_output() most of the time (because the route passed + * to ip_output() is almost always a host route). + * + * We also do the same for local addresses, with the thought + * that this might one day be used to speed up ip_input(). + * + * We also mark routes to multicast addresses as such, because + * it's easy to do and might be useful (but this is much more + * dubious since it's so easy to inspect the address). + */ + if (rt->rt_flags & RTF_HOST) { + if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { + rt->rt_flags |= RTF_BROADCAST; + } else if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == + sin->sin_addr.s_addr) { + rt->rt_flags |= RTF_LOCAL; + } + } + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + rt->rt_flags |= RTF_MULTICAST; + + if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; + + return (rn_addroute(v_arg, n_arg, head, treenodes)); +} + +/* + * This code is the inverse of in_clsroute: on first reference, if we + * were managing the route, stop doing so and set the expiration timer + * back off again. 
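+ * (Lifecycle sketch: a cloned RTF_DYNAMIC route drops its last
+ * reference, so in_clsroute() below marks it RTPRF_OURS and stamps
+ * rmx_expire = time_uptime + rtq_reallyold; if a lookup matches it
+ * again before in_rtqtimo() reaps it, this function clears RTPRF_OURS
+ * and zeroes rmx_expire, returning the route to normal service.)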
+ */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + struct radix_node *rn = rn_match(v_arg, head); + struct rtentry *rt = (struct rtentry *)rn; + + if (rt) { + RT_LOCK(rt); + if (rt->rt_flags & RTPRF_OURS) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + RT_UNLOCK(rt); + } + return rn; +} + +static VNET_DEFINE(int, rtq_reallyold) = 60*60; /* one hour is "really old" */ +#define V_rtq_reallyold VNET(rtq_reallyold) +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, + &VNET_NAME(rtq_reallyold), 0, + "Default expiration time on dynamically learned routes"); + +/* never automatically crank down to less */ +static VNET_DEFINE(int, rtq_minreallyold) = 10; +#define V_rtq_minreallyold VNET(rtq_minreallyold) +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, + &VNET_NAME(rtq_minreallyold), 0, + "Minimum time to attempt to hold onto dynamically learned routes"); + +/* 128 cached routes is "too many" */ +static VNET_DEFINE(int, rtq_toomany) = 128; +#define V_rtq_toomany VNET(rtq_toomany) +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, + &VNET_NAME(rtq_toomany), 0, + "Upper limit on dynamically learned routes"); + +/* + * On last reference drop, mark the route as belonging to us so that it can be + * timed out. + */ +static void +in_clsroute(struct radix_node *rn, struct radix_node_head *head) +{ + struct rtentry *rt = (struct rtentry *)rn; + + RT_LOCK_ASSERT(rt); + + if (!(rt->rt_flags & RTF_UP)) + return; /* prophylactic measures */ + + if (rt->rt_flags & RTPRF_OURS) + return; + + if (!(rt->rt_flags & RTF_DYNAMIC)) + return; + + /* + * If rtq_reallyold is 0, just delete the route without + * waiting for a timeout cycle to kill it. + */ + if (V_rtq_reallyold != 0) { + rt->rt_flags |= RTPRF_OURS; + rt->rt_rmx.rmx_expire = time_uptime + V_rtq_reallyold; + } else { + rtexpunge(rt); + } +} + +struct rtqk_arg { + struct radix_node_head *rnh; + int draining; + int killed; + int found; + int updating; + time_t nextstop; +}; + +/* + * Get rid of old routes. When draining, this deletes everything, even when + * the timeout is not expired yet. When updating, this makes sure that + * nothing has a timeout longer than the current value of rtq_reallyold.
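+ * (Concretely: with ap->draining set, every RTPRF_OURS route is deleted
+ * regardless of rmx_expire; with ap->updating set, a route whose
+ * rmx_expire lies more than rtq_reallyold seconds in the future is
+ * clamped back to time_uptime + rtq_reallyold.)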
+ */ +static int +in_rtqkill(struct radix_node *rn, void *rock) +{ + struct rtqk_arg *ap = rock; + struct rtentry *rt = (struct rtentry *)rn; + int err; + + RADIX_NODE_HEAD_WLOCK_ASSERT(ap->rnh); + + if (rt->rt_flags & RTPRF_OURS) { + ap->found++; + + if (ap->draining || rt->rt_rmx.rmx_expire <= time_uptime) { + if (rt->rt_refcnt > 0) + panic("rtqkill route really not free"); + + err = in_rtrequest(RTM_DELETE, + (struct sockaddr *)rt_key(rt), + rt->rt_gateway, rt_mask(rt), + rt->rt_flags | RTF_RNH_LOCKED, 0, + rt->rt_fibnum); + if (err) { + log(LOG_WARNING, "in_rtqkill: error %d\n", err); + } else { + ap->killed++; + } + } else { + if (ap->updating && + (rt->rt_rmx.rmx_expire - time_uptime > + V_rtq_reallyold)) { + rt->rt_rmx.rmx_expire = + time_uptime + V_rtq_reallyold; + } + ap->nextstop = lmin(ap->nextstop, + rt->rt_rmx.rmx_expire); + } + } + + return 0; +} + +#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ +static VNET_DEFINE(int, rtq_timeout) = RTQ_TIMEOUT; +static VNET_DEFINE(struct callout, rtq_timer); + +#define V_rtq_timeout VNET(rtq_timeout) +#define V_rtq_timer VNET(rtq_timer) + +static void in_rtqtimo_one(void *rock); + +static void +in_rtqtimo(void *rock) +{ + CURVNET_SET((struct vnet *) rock); + int fibnum; + void *newrock; + struct timeval atv; + + for (fibnum = 0; fibnum < rt_numfibs; fibnum++) { + newrock = rt_tables_get_rnh(fibnum, AF_INET); + if (newrock != NULL) + in_rtqtimo_one(newrock); + } + atv.tv_usec = 0; + atv.tv_sec = V_rtq_timeout; + callout_reset(&V_rtq_timer, tvtohz(&atv), in_rtqtimo, rock); + CURVNET_RESTORE(); +} + +static void +in_rtqtimo_one(void *rock) +{ + struct radix_node_head *rnh = rock; + struct rtqk_arg arg; + static time_t last_adjusted_timeout = 0; + + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = time_uptime + V_rtq_timeout; + arg.draining = arg.updating = 0; + RADIX_NODE_HEAD_LOCK(rnh); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + RADIX_NODE_HEAD_UNLOCK(rnh); + + /* + * Attempt to be somewhat dynamic about this: + * If there are ``too many'' routes sitting around taking up space, + * then crank down the timeout, and see if we can't make some more + * go away. However, we make sure that we will never adjust more + * than once in rtq_timeout seconds, to keep from cranking down too + * hard. 
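+ * (Worked example with the defaults above, rtq_toomany = 128 and
+ * rtq_reallyold = 3600: whenever a sweep leaves more than 128 managed
+ * routes alive, rtq_reallyold is cut to two thirds of its value,
+ * 3600 -> 2400 -> 1600 -> ..., at most once per RTQ_TIMEOUT (600 s)
+ * window and never below rtq_minreallyold (10 s).)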
+ */ + if ((arg.found - arg.killed > V_rtq_toomany) && + (time_uptime - last_adjusted_timeout >= V_rtq_timeout) && + V_rtq_reallyold > V_rtq_minreallyold) { + V_rtq_reallyold = 2 * V_rtq_reallyold / 3; + if (V_rtq_reallyold < V_rtq_minreallyold) { + V_rtq_reallyold = V_rtq_minreallyold; + } + + last_adjusted_timeout = time_uptime; +#ifdef DIAGNOSTIC + log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", + V_rtq_reallyold); +#endif + arg.found = arg.killed = 0; + arg.updating = 1; + RADIX_NODE_HEAD_LOCK(rnh); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + RADIX_NODE_HEAD_UNLOCK(rnh); + } + +} + +void +in_rtqdrain(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct radix_node_head *rnh; + struct rtqk_arg arg; + int fibnum; + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + + for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { + rnh = rt_tables_get_rnh(fibnum, AF_INET); + arg.found = arg.killed = 0; + arg.rnh = rnh; + arg.nextstop = 0; + arg.draining = 1; + arg.updating = 0; + RADIX_NODE_HEAD_LOCK(rnh); + rnh->rnh_walktree(rnh, in_rtqkill, &arg); + RADIX_NODE_HEAD_UNLOCK(rnh); + } + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +static int _in_rt_was_here; +/* + * Initialize our routing tree. + */ +int +in_inithead(void **head, int off) +{ + struct radix_node_head *rnh; + + /* XXX MRT + * This can be called from vfs_export.c too in which case 'off' + * will be 0. We know the correct value so just use that and + * return directly if it was 0. + * This is a hack that replaces an even worse hack on a bad hack + * on a bad design. After RELENG_7 this should be fixed but that + * will change the ABI, so for now do it this way. + */ + if (!rn_inithead(head, 32)) + return 0; + + if (off == 0) /* XXX MRT see above */ + return 1; /* only do the rest for a real routing table */ + + rnh = *head; + rnh->rnh_addaddr = in_addroute; + rnh->rnh_matchaddr = in_matroute; + rnh->rnh_close = in_clsroute; + if (_in_rt_was_here == 0 ) { + callout_init(&V_rtq_timer, CALLOUT_MPSAFE); + callout_reset(&V_rtq_timer, 1, in_rtqtimo, curvnet); + _in_rt_was_here = 1; + } + return 1; +} + +#ifdef VIMAGE +int +in_detachhead(void **head, int off) +{ + + callout_drain(&V_rtq_timer); + return (1); +} +#endif + +/* + * This zaps old routes when the interface goes down or interface + * address is deleted. In the latter case, it deletes static routes + * that point to this address. If we don't do this, we may end up + * using the old address in the future. The ones we always want to + * get rid of are things like ARP entries, since the user might down + * the interface, walk over to a completely different network, and + * plug back in. + */ +struct in_ifadown_arg { + struct ifaddr *ifa; + int del; +}; + +static int +in_ifadownkill(struct radix_node *rn, void *xap) +{ + struct in_ifadown_arg *ap = xap; + struct rtentry *rt = (struct rtentry *)rn; + + RT_LOCK(rt); + if (rt->rt_ifa == ap->ifa && + (ap->del || !(rt->rt_flags & RTF_STATIC))) { + /* + * Acquire a reference so that it can later be freed + * as the refcount would be 0 here in case of at least + * ap->del. + */ + RT_ADDREF(rt); + /* + * Disconnect it from the tree and permit protocols + * to clean up. + */ + rtexpunge(rt); + /* + * At this point it is an rttrash node, and in case + * the above is the only reference we must free it. + * If we do not, no one will have a pointer and the + * rtentry will be leaked forever. + * In case someone else holds a reference, we are + * fine as we only decrement the refcount.
In that + * case if the other entity calls RT_REMREF, we + * will still be leaking but at least we tried. + */ + RTFREE_LOCKED(rt); + return (0); + } + RT_UNLOCK(rt); + return 0; +} + +int +in_ifadown(struct ifaddr *ifa, int delete) +{ + struct in_ifadown_arg arg; + struct radix_node_head *rnh; + int fibnum; + + if (ifa->ifa_addr->sa_family != AF_INET) + return 1; + + for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { + rnh = rt_tables_get_rnh(fibnum, AF_INET); + arg.ifa = ifa; + arg.del = delete; + RADIX_NODE_HEAD_LOCK(rnh); + rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + RADIX_NODE_HEAD_UNLOCK(rnh); + ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ + } + return 0; +} + +/* + * inet versions of rt functions. These have fib extensions and + * for now will just reference the _fib variants. + * eventually this order will be reversed, + */ +void +in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) +{ + rtalloc_ign_fib(ro, ignflags, fibnum); +} + +int +in_rtrequest( int req, + struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct rtentry **ret_nrt, + u_int fibnum) +{ + return (rtrequest_fib(req, dst, gateway, netmask, + flags, ret_nrt, fibnum)); +} + +struct rtentry * +in_rtalloc1(struct sockaddr *dst, int report, u_long ignflags, u_int fibnum) +{ + return (rtalloc1_fib(dst, report, ignflags, fibnum)); +} + +void +in_rtredirect(struct sockaddr *dst, + struct sockaddr *gateway, + struct sockaddr *netmask, + int flags, + struct sockaddr *src, + u_int fibnum) +{ + rtredirect_fib(dst, gateway, netmask, flags, src, fibnum); +} + +void +in_rtalloc(struct route *ro, u_int fibnum) +{ + rtalloc_ign_fib(ro, 0UL, fibnum); +} + +#if 0 +int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); +int in_rtioctl(u_long, caddr_t, u_int); +int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); +#endif + + diff --git a/freebsd/sys/netinet/in_systm.h b/freebsd/sys/netinet/in_systm.h new file mode 100644 index 00000000..68bb190e --- /dev/null +++ b/freebsd/sys/netinet/in_systm.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/in_var.h b/freebsd/sys/netinet/in_var.h new file mode 100644 index 00000000..c921ad31 --- /dev/null +++ b/freebsd/sys/netinet/in_var.h @@ -0,0 +1,475 @@ +/*- + * Copyright (c) 1985, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)in_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IN_VAR_HH_ +#define _NETINET_IN_VAR_HH_ + +#include +#include +#include + +struct igmp_ifinfo; +struct in_multi; +struct lltable; + +/* + * IPv4 per-interface state. + */ +struct in_ifinfo { + struct lltable *ii_llt; /* ARP state */ + struct igmp_ifinfo *ii_igmp; /* IGMP state */ + struct in_multi *ii_allhosts; /* 224.0.0.1 membership */ +}; + +/* + * Interface address, Internet version. One of these structures + * is allocated for each Internet address on an interface. + * The ifaddr structure contains the protocol-independent part + * of the structure and is assumed to be first. + */ +struct in_ifaddr { + struct ifaddr ia_ifa; /* protocol-independent info */ +#define ia_ifp ia_ifa.ifa_ifp +#define ia_flags ia_ifa.ifa_flags + /* ia_{,sub}net{,mask} in host order */ + u_long ia_net; /* network number of interface */ + u_long ia_netmask; /* mask of net part */ + u_long ia_subnet; /* subnet number, including net */ + u_long ia_subnetmask; /* mask of subnet part */ + struct in_addr ia_netbroadcast; /* to recognize net broadcasts */ + LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */ + TAILQ_ENTRY(in_ifaddr) ia_link; /* list of internet addresses */ + struct sockaddr_in ia_addr; /* reserve space for interface name */ + struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ +#define ia_broadaddr ia_dstaddr + struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ +}; + +struct in_aliasreq { + char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ + struct sockaddr_in ifra_addr; + struct sockaddr_in ifra_broadaddr; +#define ifra_dstaddr ifra_broadaddr + struct sockaddr_in ifra_mask; +}; +/* + * Given a pointer to an in_ifaddr (ifaddr), + * return a pointer to the addr as a sockaddr_in. + */ +#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr)) +#define IA_DSTSIN(ia) (&(((struct in_ifaddr *)(ia))->ia_dstaddr)) + +#define IN_LNAOF(in, ifa) \ + ((ntohl((in).s_addr) & ~((struct in_ifaddr *)(ifa)->ia_subnetmask)) + + +#ifdef _KERNEL +extern u_char inetctlerrmap[]; + +#define LLTABLE(ifp) \ + ((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt +/* + * Hash table for IP addresses. 
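+ * (For illustration: INADDR_HASHVAL() runs fnv_32_buf() over the four
+ * address bytes and INADDR_HASH() masks the result with
+ * in_ifaddrhmask, so with the default INADDR_NHASH_LOG2 of 9 an
+ * address selects one of 512 buckets, whose ia_hash chain
+ * INADDR_TO_IFADDR() below then walks.)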
+ */ +TAILQ_HEAD(in_ifaddrhead, in_ifaddr); +LIST_HEAD(in_ifaddrhashhead, in_ifaddr); + +VNET_DECLARE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); +VNET_DECLARE(struct in_ifaddrhead, in_ifaddrhead); +VNET_DECLARE(u_long, in_ifaddrhmask); /* mask for hash table */ + +#define V_in_ifaddrhashtbl VNET(in_ifaddrhashtbl) +#define V_in_ifaddrhead VNET(in_ifaddrhead) +#define V_in_ifaddrhmask VNET(in_ifaddrhmask) + +#define INADDR_NHASH_LOG2 9 +#define INADDR_NHASH (1 << INADDR_NHASH_LOG2) +#define INADDR_HASHVAL(x) fnv_32_buf((&(x)), sizeof(x), FNV1_32_INIT) +#define INADDR_HASH(x) \ + (&V_in_ifaddrhashtbl[INADDR_HASHVAL(x) & V_in_ifaddrhmask]) + +extern struct rwlock in_ifaddr_lock; + +#define IN_IFADDR_LOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_LOCKED) +#define IN_IFADDR_RLOCK() rw_rlock(&in_ifaddr_lock) +#define IN_IFADDR_RLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_RLOCKED) +#define IN_IFADDR_RUNLOCK() rw_runlock(&in_ifaddr_lock) +#define IN_IFADDR_WLOCK() rw_wlock(&in_ifaddr_lock) +#define IN_IFADDR_WLOCK_ASSERT() rw_assert(&in_ifaddr_lock, RA_WLOCKED) +#define IN_IFADDR_WUNLOCK() rw_wunlock(&in_ifaddr_lock) + +/* + * Macro for finding the internet address structure (in_ifaddr) + * corresponding to one of our IP addresses (in_addr). + */ +#define INADDR_TO_IFADDR(addr, ia) \ + /* struct in_addr addr; */ \ + /* struct in_ifaddr *ia; */ \ +do { \ +\ + LIST_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ + if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + break; \ +} while (0) + +/* + * Macro for finding the interface (ifnet structure) corresponding to one + * of our IP addresses. + */ +#define INADDR_TO_IFP(addr, ifp) \ + /* struct in_addr addr; */ \ + /* struct ifnet *ifp; */ \ +{ \ + struct in_ifaddr *ia; \ +\ + INADDR_TO_IFADDR(addr, ia); \ + (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ +} + +/* + * Macro for finding the internet address structure (in_ifaddr) corresponding + * to a given interface (ifnet structure). + */ +#define IFP_TO_IA(ifp, ia) \ + /* struct ifnet *ifp; */ \ + /* struct in_ifaddr *ia; */ \ +{ \ + for ((ia) = TAILQ_FIRST(&V_in_ifaddrhead); \ + (ia) != NULL && (ia)->ia_ifp != (ifp); \ + (ia) = TAILQ_NEXT((ia), ia_link)) \ + continue; \ + if ((ia) != NULL) \ + ifa_ref(&(ia)->ia_ifa); \ +} +#endif + +/* + * IP datagram reassembly. + */ +#define IPREASS_NHASH_LOG2 6 +#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2) +#define IPREASS_HMASK (IPREASS_NHASH - 1) +#define IPREASS_HASH(x,y) \ + (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK) + +/* + * Legacy IPv4 IGMP per-link structure. + */ +struct router_info { + struct ifnet *rti_ifp; + int rti_type; /* type of router which is querier on this interface */ + int rti_time; /* # of slow timeouts since last old query */ + SLIST_ENTRY(router_info) rti_list; +}; + +/* + * Per-interface IGMP router version information. 
+ */ +struct igmp_ifinfo { + LIST_ENTRY(igmp_ifinfo) igi_link; + struct ifnet *igi_ifp; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ + struct ifqueue igi_gq; /* queue of general query responses */ +}; + +#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ +#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ + +/* + * IPv4 multicast IGMP-layer source entry. + */ +struct ip_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + struct ims_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } ims_st[2]; /* state at t0, t1 */ + uint8_t ims_stp; /* pending query */ +}; + +/* + * IPv4 multicast PCB-layer source entry. + */ +struct in_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + uint8_t imsl_st[2]; /* state before/at commit */ +}; + +RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ + +static __inline int +ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) +{ + + if (a->ims_haddr < b->ims_haddr) + return (-1); + if (a->ims_haddr == b->ims_haddr) + return (0); + return (1); +} +RB_PROTOTYPE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); + +/* + * IPv4 multicast PCB-layer group filter descriptor. + */ +struct in_mfilter { + struct ip_msource_tree imf_sources; /* source list for (S,G) */ + u_long imf_nsrc; /* # of source entries */ + uint8_t imf_st[2]; /* state before/at commit */ +}; + +/* + * IPv4 group descriptor. + * + * For every entry on an ifnet's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. + * + * If any source filters are present, then a node will exist in the RB-tree + * to permit fast lookup by source whenever an operation takes place. + * This permits pre-order traversal when we issue reports. + * Source filter trees are kept separately from the socket layer to + * greatly simplify locking. + * + * When IGMPv3 is active, inm_timer is the response to group query timer. + * The state-change timer inm_sctimer is separate; whenever state changes + * for the group the state change record is generated and transmitted, + * and kept if retransmissions are necessary. + * + * FUTURE: inm_link is now only used when groups are being purged + * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but + * because it is at the very start of the struct, we can't do this + * w/o breaking the ABI for ifmcstat. 
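+ *
+ * Worked example for the T0/T1 counters below (illustration only):
+ * if one socket joins the group in EXCLUDE mode and another in
+ * INCLUDE mode, then iss_ex == 1 and iss_in == 1, and the group
+ * filter mode resolves to exclude, since a single excluding listener
+ * forces MCAST_EXCLUDE for the whole group.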
+ */ +struct in_multi { + LIST_ENTRY(in_multi) inm_link; /* to-be-released by in_ifdetach */ + struct in_addr inm_addr; /* IP multicast address, convenience */ + struct ifnet *inm_ifp; /* back pointer to ifnet */ + struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ + u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ + u_int inm_state; /* state of the membership */ + void *inm_rti; /* unused, legacy field */ + u_int inm_refcount; /* reference count */ + + /* New fields for IGMPv3 follow. */ + struct igmp_ifinfo *inm_igi; /* IGMP info */ + SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ + struct ip_msource_tree inm_srcs; /* tree of sources */ + u_long inm_nsrc; /* # of tree entries */ + + struct ifqueue inm_scq; /* queue of pending + * state-change packets */ + struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ + uint16_t inm_sctimer; /* state-change timer */ + uint16_t inm_scrv; /* state-change rexmit count */ + + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing IGMPv3 state-change reports. Several refcounts + * are maintained here to optimize for common use-cases. + */ + struct inm_st { + uint16_t iss_fmode; /* IGMP filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } inm_st[2]; /* state at t0, t1 */ +}; + +/* + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). + */ +static __inline uint8_t +ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, + uint8_t t) +{ + + t = !!t; + if (inm->inm_st[t].iss_ex > 0 && + inm->inm_st[t].iss_ex == ims->ims_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} + +#ifdef _KERNEL + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_raw); +#endif + +/* + * Lock macros for IPv4 layer multicast address lists. IPv4 lock goes + * before link layer multicast locks in the lock order. In most cases, + * consumers of IN_*_MULTI() macros should acquire the locks before + * calling them; users of the in_{add,del}multi() functions should not. + */ +extern struct mtx in_multi_mtx; +#define IN_MULTI_LOCK() mtx_lock(&in_multi_mtx) +#define IN_MULTI_UNLOCK() mtx_unlock(&in_multi_mtx) +#define IN_MULTI_LOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_OWNED) +#define IN_MULTI_UNLOCK_ASSERT() mtx_assert(&in_multi_mtx, MA_NOTOWNED) + +/* + * Function for looking up an in_multi record for an IPv4 multicast address + * on a given interface. ifp must be valid. If no record found, return NULL. + * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held. 
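+ *
+ * A minimal caller sketch (illustration only; `ifp' and `grp' are
+ * assumed to be a valid interface and group address):
+ *
+ *	IN_MULTI_LOCK();
+ *	inm = inm_lookup(ifp, grp);	/* takes IF_ADDR_LOCK itself */
+ *	if (inm != NULL)
+ *		inm_acquire_locked(inm);	/* hold a reference */
+ *	IN_MULTI_UNLOCK();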
+ */ +static __inline struct in_multi * +inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina) +{ + struct ifmultiaddr *ifma; + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_LOCK_ASSERT(ifp); + + inm = NULL; + TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { + if (ifma->ifma_addr->sa_family == AF_INET) { + inm = (struct in_multi *)ifma->ifma_protospec; + if (inm->inm_addr.s_addr == ina.s_addr) + break; + inm = NULL; + } + } + return (inm); +} + +/* + * Wrapper for inm_lookup_locked(). + * The IF_ADDR_LOCK will be taken on ifp and released on return. + */ +static __inline struct in_multi * +inm_lookup(struct ifnet *ifp, const struct in_addr ina) +{ + struct in_multi *inm; + + IN_MULTI_LOCK_ASSERT(); + IF_ADDR_LOCK(ifp); + inm = inm_lookup_locked(ifp, ina); + IF_ADDR_UNLOCK(ifp); + + return (inm); +} + +/* Acquire an in_multi record. */ +static __inline void +inm_acquire_locked(struct in_multi *inm) +{ + + IN_MULTI_LOCK_ASSERT(); + ++inm->inm_refcount; +} + +/* + * Return values for imo_multi_filter(). + */ +#define MCAST_PASS 0 /* Pass */ +#define MCAST_NOTGMEMBER 1 /* This host not a member of group */ +#define MCAST_NOTSMEMBER 2 /* This host excluded source */ +#define MCAST_MUTED 3 /* [deprecated] */ + +struct rtentry; +struct route; +struct ip_moptions; + +int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +void inm_commit(struct in_multi *); +void inm_clear_recorded(struct in_multi *); +void inm_print(const struct in_multi *); +int inm_record_source(struct in_multi *inm, const in_addr_t); +void inm_release(struct in_multi *); +void inm_release_locked(struct in_multi *); +struct in_multi * + in_addmulti(struct in_addr *, struct ifnet *); +void in_delmulti(struct in_multi *); +int in_joingroup(struct ifnet *, const struct in_addr *, + /*const*/ struct in_mfilter *, struct in_multi **); +int in_joingroup_locked(struct ifnet *, const struct in_addr *, + /*const*/ struct in_mfilter *, struct in_multi **); +int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); +int in_leavegroup_locked(struct in_multi *, + /*const*/ struct in_mfilter *); +int in_control(struct socket *, u_long, caddr_t, struct ifnet *, + struct thread *); +void in_rtqdrain(void); +void ip_input(struct mbuf *); +int in_ifadown(struct ifaddr *ifa, int); +void in_ifscrub(struct ifnet *, struct in_ifaddr *); +struct mbuf *ip_fastforward(struct mbuf *); +void *in_domifattach(struct ifnet *); +void in_domifdetach(struct ifnet *, void *); + + +/* XXX */ +void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); +void in_rtalloc(struct route *ro, u_int fibnum); +struct rtentry *in_rtalloc1(struct sockaddr *, int, u_long, u_int); +void in_rtredirect(struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct sockaddr *, u_int); +int in_rtrequest(int, struct sockaddr *, + struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int); + +#if 0 +int in_rt_getifa(struct rt_addrinfo *, u_int fibnum); +int in_rtioctl(u_long, caddr_t, u_int); +int in_rtrequest1(int, struct rt_addrinfo *, struct rtentry **, u_int); +#endif +#endif /* _KERNEL */ + +/* INET6 stuff */ +#include + +#endif /* _NETINET_IN_VAR_HH_ */ diff --git a/freebsd/sys/netinet/ip.h b/freebsd/sys/netinet/ip.h new file mode 100644 index 00000000..9d5d8a9c --- /dev/null +++ b/freebsd/sys/netinet/ip.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/ip6.h b/freebsd/sys/netinet/ip6.h new file mode 100644 index 
00000000..f30da6d1 --- /dev/null +++ b/freebsd/sys/netinet/ip6.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/ip_carp.c b/freebsd/sys/netinet/ip_carp.c new file mode 100644 index 00000000..25b20895 --- /dev/null +++ b/freebsd/sys/netinet/ip_carp.c @@ -0,0 +1,2427 @@ +#include + +/* + * Copyright (c) 2002 Michael Shalayeff. All rights reserved. + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef __rtems__ +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#include +#include +#include +#endif + +#ifdef INET6 +#include +#include +#include +#include +#include +#include +#endif + +#include +#include + +#define CARP_IFNAME "carp" +static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces"); +SYSCTL_DECL(_net_inet_carp); + +struct carp_softc { + struct ifnet *sc_ifp; /* Interface clue */ + struct ifnet *sc_carpdev; /* Pointer to parent interface */ + struct in_ifaddr *sc_ia; /* primary iface address */ + struct ip_moptions sc_imo; +#ifdef INET6 + struct in6_ifaddr *sc_ia6; /* primary iface address v6 */ + struct ip6_moptions sc_im6o; +#endif /* INET6 */ + TAILQ_ENTRY(carp_softc) sc_list; + + enum { INIT = 0, BACKUP, MASTER } sc_state; + + int sc_flags_backup; + int sc_suppress; + + int sc_sendad_errors; +#define CARP_SENDAD_MAX_ERRORS 3 + int sc_sendad_success; +#define CARP_SENDAD_MIN_SUCCESS 3 + + int sc_vhid; + int sc_advskew; + int sc_naddrs; + int sc_naddrs6; + int sc_advbase; /* seconds */ + int sc_init_counter; + u_int64_t sc_counter; + + /* authentication */ +#define CARP_HMAC_PAD 64 + unsigned char sc_key[CARP_KEY_LEN]; + unsigned char sc_pad[CARP_HMAC_PAD]; + SHA1_CTX sc_sha1; + + struct callout sc_ad_tmo; /* advertisement timeout */ + struct callout sc_md_tmo; /* master down timeout */ + struct callout sc_md6_tmo; /* master down timeout */ + + LIST_ENTRY(carp_softc) sc_next; /* Interface clue 
*/ +}; +#define SC2IFP(sc) ((sc)->sc_ifp) + +int carp_suppress_preempt = 0; +int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 }; /* XXX for now */ +SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW, 0, "CARP"); +SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW, + &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets"); +SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW, + &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode"); +SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW, + &carp_opts[CARPCTL_LOG], 0, "log bad carp packets"); +SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW, + &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses"); +SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD, + &carp_suppress_preempt, 0, "Preemption is suppressed"); + +struct carpstats carpstats; +SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW, + &carpstats, carpstats, + "CARP statistics (struct carpstats, netinet/ip_carp.h)"); + +struct carp_if { + TAILQ_HEAD(, carp_softc) vhif_vrs; + int vhif_nvrs; + + struct ifnet *vhif_ifp; + struct mtx vhif_mtx; +}; + +#define CARP_INET 0 +#define CARP_INET6 1 +static int proto_reg[] = {-1, -1}; + +/* Get carp_if from softc. Valid after carp_set_addr{,6}. */ +#define SC2CIF(sc) ((struct carp_if *)(sc)->sc_carpdev->if_carp) + +/* lock per carp_if queue */ +#define CARP_LOCK_INIT(cif) mtx_init(&(cif)->vhif_mtx, "carp_if", \ + NULL, MTX_DEF) +#define CARP_LOCK_DESTROY(cif) mtx_destroy(&(cif)->vhif_mtx) +#define CARP_LOCK_ASSERT(cif) mtx_assert(&(cif)->vhif_mtx, MA_OWNED) +#define CARP_LOCK(cif) mtx_lock(&(cif)->vhif_mtx) +#define CARP_UNLOCK(cif) mtx_unlock(&(cif)->vhif_mtx) + +#define CARP_SCLOCK(sc) mtx_lock(&SC2CIF(sc)->vhif_mtx) +#define CARP_SCUNLOCK(sc) mtx_unlock(&SC2CIF(sc)->vhif_mtx) +#define CARP_SCLOCK_ASSERT(sc) mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED) + +#define CARP_LOG(...) do { \ + if (carp_opts[CARPCTL_LOG] > 0) \ + log(LOG_INFO, __VA_ARGS__); \ +} while (0) + +#define CARP_DEBUG(...) 
do { \ + if (carp_opts[CARPCTL_LOG] > 1) \ + log(LOG_DEBUG, __VA_ARGS__); \ +} while (0) + +static void carp_hmac_prepare(struct carp_softc *); +static void carp_hmac_generate(struct carp_softc *, u_int32_t *, + unsigned char *); +static int carp_hmac_verify(struct carp_softc *, u_int32_t *, + unsigned char *); +static void carp_setroute(struct carp_softc *, int); +static void carp_input_c(struct mbuf *, struct carp_header *, sa_family_t); +static int carp_clone_create(struct if_clone *, int, caddr_t); +static void carp_clone_destroy(struct ifnet *); +static void carpdetach(struct carp_softc *, int); +static int carp_prepare_ad(struct mbuf *, struct carp_softc *, + struct carp_header *); +static void carp_send_ad_all(void); +static void carp_send_ad(void *); +static void carp_send_ad_locked(struct carp_softc *); +static void carp_send_arp(struct carp_softc *); +static void carp_master_down(void *); +static void carp_master_down_locked(struct carp_softc *); +static int carp_ioctl(struct ifnet *, u_long, caddr_t); +static int carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *, + struct route *); +static void carp_start(struct ifnet *); +static void carp_setrun(struct carp_softc *, sa_family_t); +static void carp_set_state(struct carp_softc *, int); +static int carp_addrcount(struct carp_if *, struct in_ifaddr *, int); +enum { CARP_COUNT_MASTER, CARP_COUNT_RUNNING }; + +static void carp_multicast_cleanup(struct carp_softc *); +static int carp_set_addr(struct carp_softc *, struct sockaddr_in *); +static int carp_del_addr(struct carp_softc *, struct sockaddr_in *); +static void carp_carpdev_state_locked(struct carp_if *); +static void carp_sc_state_locked(struct carp_softc *); +#ifdef INET6 +static void carp_send_na(struct carp_softc *); +static int carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *); +static int carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *); +static void carp_multicast6_cleanup(struct carp_softc *); +#endif + +static LIST_HEAD(, carp_softc) carpif_list; +static struct mtx carp_mtx; +IFC_SIMPLE_DECLARE(carp, 0); + +static eventhandler_tag if_detach_event_tag; + +static __inline u_int16_t +carp_cksum(struct mbuf *m, int len) +{ + return (in_cksum(m, len)); +} + +static void +carp_hmac_prepare(struct carp_softc *sc) +{ + u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT; + u_int8_t vhid = sc->sc_vhid & 0xff; + struct ifaddr *ifa; + int i, found; +#ifdef INET + struct in_addr last, cur, in; +#endif +#ifdef INET6 + struct in6_addr last6, cur6, in6; +#endif + + if (sc->sc_carpdev) + CARP_SCLOCK(sc); + + /* XXX: possible race here */ + + /* compute ipad from key */ + bzero(sc->sc_pad, sizeof(sc->sc_pad)); + bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key)); + for (i = 0; i < sizeof(sc->sc_pad); i++) + sc->sc_pad[i] ^= 0x36; + + /* precompute first part of inner hash */ + SHA1Init(&sc->sc_sha1); + SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad)); + SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version)); + SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type)); + SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid)); +#ifdef INET + cur.s_addr = 0; + do { + found = 0; + last = cur; + cur.s_addr = 0xffffffff; + IF_ADDR_LOCK(SC2IFP(sc)); + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr; + if (ifa->ifa_addr->sa_family == AF_INET && + ntohl(in.s_addr) > ntohl(last.s_addr) && + ntohl(in.s_addr) < ntohl(cur.s_addr)) { + cur.s_addr = in.s_addr; + found++; + } + } + 
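+		/*
+		 * Each pass of the enclosing do/while selects the
+		 * numerically smallest address strictly greater than
+		 * `last', so SHA1Update() sees the interface addresses
+		 * in ascending order regardless of how the address
+		 * list happens to be ordered.
+		 */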
IF_ADDR_UNLOCK(SC2IFP(sc)); + if (found) + SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur)); + } while (found); +#endif /* INET */ +#ifdef INET6 + memset(&cur6, 0, sizeof(cur6)); + do { + found = 0; + last6 = cur6; + memset(&cur6, 0xff, sizeof(cur6)); + IF_ADDR_LOCK(SC2IFP(sc)); + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + in6 = ifatoia6(ifa)->ia_addr.sin6_addr; + if (IN6_IS_SCOPE_EMBED(&in6)) + in6.s6_addr16[1] = 0; + if (ifa->ifa_addr->sa_family == AF_INET6 && + memcmp(&in6, &last6, sizeof(in6)) > 0 && + memcmp(&in6, &cur6, sizeof(in6)) < 0) { + cur6 = in6; + found++; + } + } + IF_ADDR_UNLOCK(SC2IFP(sc)); + if (found) + SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6)); + } while (found); +#endif /* INET6 */ + + /* convert ipad to opad */ + for (i = 0; i < sizeof(sc->sc_pad); i++) + sc->sc_pad[i] ^= 0x36 ^ 0x5c; + + if (sc->sc_carpdev) + CARP_SCUNLOCK(sc); +} + +static void +carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2], + unsigned char md[20]) +{ + SHA1_CTX sha1ctx; + + /* fetch first half of inner hash */ + bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx)); + + SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter)); + SHA1Final(md, &sha1ctx); + + /* outer hash */ + SHA1Init(&sha1ctx); + SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad)); + SHA1Update(&sha1ctx, md, 20); + SHA1Final(md, &sha1ctx); +} + +static int +carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2], + unsigned char md[20]) +{ + unsigned char md2[20]; + + CARP_SCLOCK_ASSERT(sc); + + carp_hmac_generate(sc, counter, md2); + + return (bcmp(md, md2, sizeof(md2))); +} + +static void +carp_setroute(struct carp_softc *sc, int cmd) +{ + struct ifaddr *ifa; + int s; + + if (sc->sc_carpdev) + CARP_SCLOCK_ASSERT(sc); + + s = splnet(); + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET && + sc->sc_carpdev != NULL) { + int count = carp_addrcount( + (struct carp_if *)sc->sc_carpdev->if_carp, + ifatoia(ifa), CARP_COUNT_MASTER); + + if ((cmd == RTM_ADD && count == 1) || + (cmd == RTM_DELETE && count == 0)) + rtinit(ifa, cmd, RTF_UP | RTF_HOST); + } + } + splx(s); +} + +static int +carp_clone_create(struct if_clone *ifc, int unit, caddr_t params) +{ + + struct carp_softc *sc; + struct ifnet *ifp; + + sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO); + ifp = SC2IFP(sc) = if_alloc(IFT_ETHER); + if (ifp == NULL) { + free(sc, M_CARP); + return (ENOSPC); + } + + sc->sc_flags_backup = 0; + sc->sc_suppress = 0; + sc->sc_advbase = CARP_DFLTINTV; + sc->sc_vhid = -1; /* required setting */ + sc->sc_advskew = 0; + sc->sc_init_counter = 1; + sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? 
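+					    (the softc was allocated with
+					    M_WAITOK|M_ZERO just above, so
+					    these explicit zero stores are
+					    already-zero no-ops)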
*/ + sc->sc_imo.imo_membership = (struct in_multi **)malloc( + (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP, + M_WAITOK); + sc->sc_imo.imo_mfilters = NULL; + sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS; + sc->sc_imo.imo_multicast_vif = -1; +#ifdef INET6 + sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc( + (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP, + M_WAITOK); + sc->sc_im6o.im6o_mfilters = NULL; + sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL; +#endif + + callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE); + callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE); + callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE); + + ifp->if_softc = sc; + if_initname(ifp, CARP_IFNAME, unit); + ifp->if_mtu = ETHERMTU; + ifp->if_flags = IFF_LOOPBACK; + ifp->if_ioctl = carp_ioctl; + ifp->if_output = carp_looutput; + ifp->if_start = carp_start; + ifp->if_type = IFT_CARP; + ifp->if_snd.ifq_maxlen = ifqmaxlen; + ifp->if_hdrlen = 0; + if_attach(ifp); + bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t)); + mtx_lock(&carp_mtx); + LIST_INSERT_HEAD(&carpif_list, sc, sc_next); + mtx_unlock(&carp_mtx); + return (0); +} + +static void +carp_clone_destroy(struct ifnet *ifp) +{ + struct carp_softc *sc = ifp->if_softc; + + if (sc->sc_carpdev) + CARP_SCLOCK(sc); + carpdetach(sc, 1); /* Returns unlocked. */ + + mtx_lock(&carp_mtx); + LIST_REMOVE(sc, sc_next); + mtx_unlock(&carp_mtx); + bpfdetach(ifp); + if_detach(ifp); + if_free_type(ifp, IFT_ETHER); + free(sc->sc_imo.imo_membership, M_CARP); +#ifdef INET6 + free(sc->sc_im6o.im6o_membership, M_CARP); +#endif + free(sc, M_CARP); +} + +/* + * This function can be called on CARP interface destroy path, + * and in case of the removal of the underlying interface as + * well. We differentiate these two cases. In the latter case + * we do not cleanup our multicast memberships, since they + * are already freed. Also, in the latter case we do not + * release the lock on return, because the function will be + * called once more, for another CARP instance on the same + * interface. + */ +static void +carpdetach(struct carp_softc *sc, int unlock) +{ + struct carp_if *cif; + + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + + if (sc->sc_suppress) + carp_suppress_preempt--; + sc->sc_suppress = 0; + + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) + carp_suppress_preempt--; + sc->sc_sendad_errors = 0; + + carp_set_state(sc, INIT); + SC2IFP(sc)->if_flags &= ~IFF_UP; + carp_setrun(sc, 0); + if (unlock) + carp_multicast_cleanup(sc); +#ifdef INET6 + carp_multicast6_cleanup(sc); +#endif + + if (sc->sc_carpdev != NULL) { + cif = (struct carp_if *)sc->sc_carpdev->if_carp; + CARP_LOCK_ASSERT(cif); + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + ifpromisc(sc->sc_carpdev, 0); + sc->sc_carpdev->if_carp = NULL; + CARP_LOCK_DESTROY(cif); + free(cif, M_CARP); + } else if (unlock) + CARP_UNLOCK(cif); + sc->sc_carpdev = NULL; + } +} + +/* Detach an interface from the carp. */ +static void +carp_ifdetach(void *arg __unused, struct ifnet *ifp) +{ + struct carp_if *cif = (struct carp_if *)ifp->if_carp; + struct carp_softc *sc, *nextsc; + + if (cif == NULL) + return; + + /* + * XXX: At the end of for() cycle the lock will be destroyed. + */ + CARP_LOCK(cif); + for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) { + nextsc = TAILQ_NEXT(sc, sc_list); + carpdetach(sc, 0); + } +} + +/* + * process input packet. 
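+ * validation runs in order: the net.inet.carp.allow sysctl, a carp
+ * attachment on the receiving interface, TTL == 255, length and
+ * pullup checks, then the checksum, before handing the advertisement
+ * to carp_input_c().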
+ * we have rearranged checks order compared to the rfc, + * but it seems more efficient this way or not possible otherwise. + */ +void +carp_input(struct mbuf *m, int hlen) +{ + struct ip *ip = mtod(m, struct ip *); + struct carp_header *ch; + int iplen, len; + + CARPSTATS_INC(carps_ipackets); + + if (!carp_opts[CARPCTL_ALLOW]) { + m_freem(m); + return; + } + + /* check if received on a valid carp interface */ + if (m->m_pkthdr.rcvif->if_carp == NULL) { + CARPSTATS_INC(carps_badif); + CARP_DEBUG("carp_input: packet received on non-carp " + "interface: %s\n", + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return; + } + + /* verify that the IP TTL is 255. */ + if (ip->ip_ttl != CARP_DFLTTL) { + CARPSTATS_INC(carps_badttl); + CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n", + ip->ip_ttl, + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return; + } + + iplen = ip->ip_hl << 2; + + if (m->m_pkthdr.len < iplen + sizeof(*ch)) { + CARPSTATS_INC(carps_badlen); + CARP_DEBUG("carp_input: received len %zd < " + "sizeof(struct carp_header) on %s\n", + m->m_len - sizeof(struct ip), + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return; + } + + if (iplen + sizeof(*ch) < m->m_len) { + if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) { + CARPSTATS_INC(carps_hdrops); + CARP_DEBUG("carp_input: pullup failed\n"); + return; + } + ip = mtod(m, struct ip *); + } + ch = (struct carp_header *)((char *)ip + iplen); + + /* + * verify that the received packet length is + * equal to the CARP header + */ + len = iplen + sizeof(*ch); + if (len > m->m_pkthdr.len) { + CARPSTATS_INC(carps_badlen); + CARP_DEBUG("carp_input: packet too short %d on %s\n", + m->m_pkthdr.len, + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return; + } + + if ((m = m_pullup(m, len)) == NULL) { + CARPSTATS_INC(carps_hdrops); + return; + } + ip = mtod(m, struct ip *); + ch = (struct carp_header *)((char *)ip + iplen); + + /* verify the CARP checksum */ + m->m_data += iplen; + if (carp_cksum(m, len - iplen)) { + CARPSTATS_INC(carps_badsum); + CARP_DEBUG("carp_input: checksum failed on %s\n", + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return; + } + m->m_data -= iplen; + + carp_input_c(m, ch, AF_INET); +} + +#ifdef INET6 +int +carp6_input(struct mbuf **mp, int *offp, int proto) +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct carp_header *ch; + u_int len; + + CARPSTATS_INC(carps_ipackets6); + + if (!carp_opts[CARPCTL_ALLOW]) { + m_freem(m); + return (IPPROTO_DONE); + } + + /* check if received on a valid carp interface */ + if (m->m_pkthdr.rcvif->if_carp == NULL) { + CARPSTATS_INC(carps_badif); + CARP_DEBUG("carp6_input: packet received on non-carp " + "interface: %s\n", + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return (IPPROTO_DONE); + } + + /* verify that the IP TTL is 255 */ + if (ip6->ip6_hlim != CARP_DFLTTL) { + CARPSTATS_INC(carps_badttl); + CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n", + ip6->ip6_hlim, + m->m_pkthdr.rcvif->if_xname); + m_freem(m); + return (IPPROTO_DONE); + } + + /* verify that we have a complete carp packet */ + len = m->m_len; + IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch)); + if (ch == NULL) { + CARPSTATS_INC(carps_badlen); + CARP_DEBUG("carp6_input: packet size %u too small\n", len); + return (IPPROTO_DONE); + } + + + /* verify the CARP checksum */ + m->m_data += *offp; + if (carp_cksum(m, sizeof(*ch))) { + CARPSTATS_INC(carps_badsum); + CARP_DEBUG("carp6_input: checksum failed, on %s\n", + m->m_pkthdr.rcvif->if_xname); + m_freem(m); 
+ return (IPPROTO_DONE); + } + m->m_data -= *offp; + + carp_input_c(m, ch, AF_INET6); + return (IPPROTO_DONE); +} +#endif /* INET6 */ + +static void +carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af) +{ + struct ifnet *ifp = m->m_pkthdr.rcvif; + struct carp_softc *sc; + u_int64_t tmp_counter; + struct timeval sc_tv, ch_tv; + + /* verify that the VHID is valid on the receiving interface */ + CARP_LOCK(ifp->if_carp); + TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list) + if (sc->sc_vhid == ch->carp_vhid) + break; + + if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) && + (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { + CARPSTATS_INC(carps_badvhid); + CARP_UNLOCK(ifp->if_carp); + m_freem(m); + return; + } + + getmicrotime(&SC2IFP(sc)->if_lastchange); + SC2IFP(sc)->if_ipackets++; + SC2IFP(sc)->if_ibytes += m->m_pkthdr.len; + + if (bpf_peers_present(SC2IFP(sc)->if_bpf)) { + struct ip *ip = mtod(m, struct ip *); + uint32_t af1 = af; + + /* BPF wants net byte order */ + ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2)); + ip->ip_off = htons(ip->ip_off); + bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m); + } + + /* verify the CARP version. */ + if (ch->carp_version != CARP_VERSION) { + CARPSTATS_INC(carps_badver); + SC2IFP(sc)->if_ierrors++; + CARP_UNLOCK(ifp->if_carp); + CARP_DEBUG("%s; invalid version %d\n", + SC2IFP(sc)->if_xname, + ch->carp_version); + m_freem(m); + return; + } + + /* verify the hash */ + if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) { + CARPSTATS_INC(carps_badauth); + SC2IFP(sc)->if_ierrors++; + CARP_UNLOCK(ifp->if_carp); + CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname); + m_freem(m); + return; + } + + tmp_counter = ntohl(ch->carp_counter[0]); + tmp_counter = tmp_counter<<32; + tmp_counter += ntohl(ch->carp_counter[1]); + + /* XXX Replay protection goes here */ + + sc->sc_init_counter = 0; + sc->sc_counter = tmp_counter; + + sc_tv.tv_sec = sc->sc_advbase; + if (carp_suppress_preempt && sc->sc_advskew < 240) + sc_tv.tv_usec = 240 * 1000000 / 256; + else + sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256; + ch_tv.tv_sec = ch->carp_advbase; + ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256; + + switch (sc->sc_state) { + case INIT: + break; + case MASTER: + /* + * If we receive an advertisement from a master who's going to + * be more frequent than us, go into BACKUP state. + */ + if (timevalcmp(&sc_tv, &ch_tv, >) || + timevalcmp(&sc_tv, &ch_tv, ==)) { + callout_stop(&sc->sc_ad_tmo); + CARP_LOG("%s: MASTER -> BACKUP " + "(more frequent advertisement received)\n", + SC2IFP(sc)->if_xname); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_setroute(sc, RTM_DELETE); + } + break; + case BACKUP: + /* + * If we're pre-empting masters who advertise slower than us, + * and this one claims to be slower, treat him as down. + */ + if (carp_opts[CARPCTL_PREEMPT] && + timevalcmp(&sc_tv, &ch_tv, <)) { + CARP_LOG("%s: BACKUP -> MASTER " + "(preempting a slower master)\n", + SC2IFP(sc)->if_xname); + carp_master_down_locked(sc); + break; + } + + /* + * If the master is going to advertise at such a low frequency + * that he's guaranteed to time out, we'd might as well just + * treat him as timed out now. + */ + sc_tv.tv_sec = sc->sc_advbase * 3; + if (timevalcmp(&sc_tv, &ch_tv, <)) { + CARP_LOG("%s: BACKUP -> MASTER " + "(master timed out)\n", + SC2IFP(sc)->if_xname); + carp_master_down_locked(sc); + break; + } + + /* + * Otherwise, we reset the counter and wait for the next + * advertisement. 
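+ *
+ * (The interval compared here is advbase seconds plus advskew/256 of
+ * a second: e.g. advbase = 1, advskew = 100 gives
+ * 1 s + 100 * 1000000 / 256 usec, about 1.39 s, so a master
+ * advertising every 1.00 s counts as more frequent and keeps us in
+ * BACKUP.)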
+ */ + carp_setrun(sc, af); + break; + } + + CARP_UNLOCK(ifp->if_carp); + + m_freem(m); + return; +} + +static int +carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch) +{ + struct m_tag *mtag; + struct ifnet *ifp = SC2IFP(sc); + + if (sc->sc_init_counter) { + /* this could also be seconds since unix epoch */ + sc->sc_counter = arc4random(); + sc->sc_counter = sc->sc_counter << 32; + sc->sc_counter += arc4random(); + } else + sc->sc_counter++; + + ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff); + ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff); + + carp_hmac_generate(sc, ch->carp_counter, ch->carp_md); + + /* Tag packet for carp_output */ + mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) { + m_freem(m); + SC2IFP(sc)->if_oerrors++; + return (ENOMEM); + } + bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + return (0); +} + +static void +carp_send_ad_all(void) +{ + struct carp_softc *sc; + + mtx_lock(&carp_mtx); + LIST_FOREACH(sc, &carpif_list, sc_next) { + if (sc->sc_carpdev == NULL) + continue; + CARP_SCLOCK(sc); + if ((SC2IFP(sc)->if_flags & IFF_UP) && + (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) && + sc->sc_state == MASTER) + carp_send_ad_locked(sc); + CARP_SCUNLOCK(sc); + } + mtx_unlock(&carp_mtx); +} + +static void +carp_send_ad(void *v) +{ + struct carp_softc *sc = v; + + CARP_SCLOCK(sc); + carp_send_ad_locked(sc); + CARP_SCUNLOCK(sc); +} + +static void +carp_send_ad_locked(struct carp_softc *sc) +{ + struct carp_header ch; + struct timeval tv; + struct carp_header *ch_ptr; + struct mbuf *m; + int len, advbase, advskew; + + CARP_SCLOCK_ASSERT(sc); + + /* bow out if we've lost our UPness or RUNNINGuiness */ + if (!((SC2IFP(sc)->if_flags & IFF_UP) && + (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) { + advbase = 255; + advskew = 255; + } else { + advbase = sc->sc_advbase; + if (!carp_suppress_preempt || sc->sc_advskew > 240) + advskew = sc->sc_advskew; + else + advskew = 240; + tv.tv_sec = advbase; + tv.tv_usec = advskew * 1000000 / 256; + } + + ch.carp_version = CARP_VERSION; + ch.carp_type = CARP_ADVERTISEMENT; + ch.carp_vhid = sc->sc_vhid; + ch.carp_advbase = advbase; + ch.carp_advskew = advskew; + ch.carp_authlen = 7; /* XXX DEFINE */ + ch.carp_pad1 = 0; /* must be zero */ + ch.carp_cksum = 0; + +#ifdef INET + if (sc->sc_ia) { + struct ip *ip; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + SC2IFP(sc)->if_oerrors++; + CARPSTATS_INC(carps_onomem); + /* XXX maybe less ? 
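+			 * (allocation failed, so we simply re-arm the
+			 * full advertisement interval below; the XXX
+			 * hints that a shorter retry delay might be
+			 * preferable)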
*/ + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + return; + } + len = sizeof(*ip) + sizeof(ch); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + MH_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + ip->ip_tos = IPTOS_LOWDELAY; + ip->ip_len = len; + ip->ip_id = ip_newid(); + ip->ip_off = IP_DF; + ip->ip_ttl = CARP_DFLTTL; + ip->ip_p = IPPROTO_CARP; + ip->ip_sum = 0; + ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr; + ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP); + + ch_ptr = (struct carp_header *)(&ip[1]); + bcopy(&ch, ch_ptr, sizeof(ch)); + if (carp_prepare_ad(m, sc, ch_ptr)) + return; + + m->m_data += sizeof(*ip); + ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip)); + m->m_data -= sizeof(*ip); + + getmicrotime(&SC2IFP(sc)->if_lastchange); + SC2IFP(sc)->if_opackets++; + SC2IFP(sc)->if_obytes += len; + CARPSTATS_INC(carps_opackets); + + if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) { + SC2IFP(sc)->if_oerrors++; + if (sc->sc_sendad_errors < INT_MAX) + sc->sc_sendad_errors++; + if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { + carp_suppress_preempt++; + if (carp_suppress_preempt == 1) { + CARP_SCUNLOCK(sc); + carp_send_ad_all(); + CARP_SCLOCK(sc); + } + } + sc->sc_sendad_success = 0; + } else { + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { + if (++sc->sc_sendad_success >= + CARP_SENDAD_MIN_SUCCESS) { + carp_suppress_preempt--; + sc->sc_sendad_errors = 0; + } + } else + sc->sc_sendad_errors = 0; + } + } +#endif /* INET */ +#ifdef INET6 + if (sc->sc_ia6) { + struct ip6_hdr *ip6; + + MGETHDR(m, M_DONTWAIT, MT_HEADER); + if (m == NULL) { + SC2IFP(sc)->if_oerrors++; + CARPSTATS_INC(carps_onomem); + /* XXX maybe less ? 
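+			 * (same retry consideration as in the INET
+			 * path above)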
*/ + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + return; + } + len = sizeof(*ip6) + sizeof(ch); + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = NULL; + m->m_len = len; + MH_ALIGN(m, m->m_len); + m->m_flags |= M_MCAST; + ip6 = mtod(m, struct ip6_hdr *); + bzero(ip6, sizeof(*ip6)); + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_hlim = CARP_DFLTTL; + ip6->ip6_nxt = IPPROTO_CARP; + bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src, + sizeof(struct in6_addr)); + /* set the multicast destination */ + + ip6->ip6_dst.s6_addr16[0] = htons(0xff02); + ip6->ip6_dst.s6_addr8[15] = 0x12; + if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) { + SC2IFP(sc)->if_oerrors++; + m_freem(m); + CARP_DEBUG("%s: in6_setscope failed\n", __func__); + return; + } + + ch_ptr = (struct carp_header *)(&ip6[1]); + bcopy(&ch, ch_ptr, sizeof(ch)); + if (carp_prepare_ad(m, sc, ch_ptr)) + return; + + m->m_data += sizeof(*ip6); + ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6)); + m->m_data -= sizeof(*ip6); + + getmicrotime(&SC2IFP(sc)->if_lastchange); + SC2IFP(sc)->if_opackets++; + SC2IFP(sc)->if_obytes += len; + CARPSTATS_INC(carps_opackets6); + + if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) { + SC2IFP(sc)->if_oerrors++; + if (sc->sc_sendad_errors < INT_MAX) + sc->sc_sendad_errors++; + if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) { + carp_suppress_preempt++; + if (carp_suppress_preempt == 1) { + CARP_SCUNLOCK(sc); + carp_send_ad_all(); + CARP_SCLOCK(sc); + } + } + sc->sc_sendad_success = 0; + } else { + if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) { + if (++sc->sc_sendad_success >= + CARP_SENDAD_MIN_SUCCESS) { + carp_suppress_preempt--; + sc->sc_sendad_errors = 0; + } + } else + sc->sc_sendad_errors = 0; + } + } +#endif /* INET6 */ + + if (advbase != 255 || advskew != 255) + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + +} + +/* + * Broadcast a gratuitous ARP request containing + * the virtual router MAC address for each IP address + * associated with the virtual router. 
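+ *
+ * Sent on transition to MASTER (see carp_master_down_locked()) so
+ * that neighbor caches and switch forwarding tables relearn the
+ * virtual MAC, 00:00:5e:00:01:<vhid>, behind the new master's port.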
+ */ +static void +carp_send_arp(struct carp_softc *sc) +{ + struct ifaddr *ifa; + + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + +/* arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */ + arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp)); + + DELAY(1000); /* XXX */ + } +} + +#ifdef INET6 +static void +carp_send_na(struct carp_softc *sc) +{ + struct ifaddr *ifa; + struct in6_addr *in6; + static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT; + + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + + if (ifa->ifa_addr->sa_family != AF_INET6) + continue; + + in6 = &ifatoia6(ifa)->ia_addr.sin6_addr; + nd6_na_output(sc->sc_carpdev, &mcast, in6, + ND_NA_FLAG_OVERRIDE, 1, NULL); + DELAY(1000); /* XXX */ + } +} +#endif /* INET6 */ + +static int +carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type) +{ + struct carp_softc *vh; + struct ifaddr *ifa; + int count = 0; + + CARP_LOCK_ASSERT(cif); + + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((type == CARP_COUNT_RUNNING && + (SC2IFP(vh)->if_flags & IFF_UP) && + (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) || + (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) { + IF_ADDR_LOCK(SC2IFP(vh)); + TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, + ifa_list) { + if (ifa->ifa_addr->sa_family == AF_INET && + ia->ia_addr.sin_addr.s_addr == + ifatoia(ifa)->ia_addr.sin_addr.s_addr) + count++; + } + IF_ADDR_UNLOCK(SC2IFP(vh)); + } + } + return (count); +} + +int +carp_iamatch(struct ifnet *ifp, struct in_ifaddr *ia, + struct in_addr *isaddr, u_int8_t **enaddr) +{ + struct carp_if *cif; + struct carp_softc *vh; + int index, count = 0; + struct ifaddr *ifa; + + cif = ifp->if_carp; + CARP_LOCK(cif); + + if (carp_opts[CARPCTL_ARPBALANCE]) { + /* + * XXX proof of concept implementation. + * We use the source ip to decide which virtual host should + * handle the request. If we're master of that virtual host, + * then we respond, otherwise, just drop the arp packet on + * the floor. 
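+ *
+ * Selection sketch (mirrors the code below): with `count' vhosts up
+ * and running for this address, the requester's source IP picks
+ * exactly one of them:
+ *
+ *	index = ntohl(isaddr->s_addr) % count;
+ *
+ * and that vhost answers only if it is currently MASTER.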
+ */ + count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING); + if (count == 0) { + /* should never reach this */ + CARP_UNLOCK(cif); + return (0); + } + + /* this should be a hash, like pf_hash() */ + index = ntohl(isaddr->s_addr) % count; + count = 0; + + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((SC2IFP(vh)->if_flags & IFF_UP) && + (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) { + IF_ADDR_LOCK(SC2IFP(vh)); + TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, + ifa_list) { + if (ifa->ifa_addr->sa_family == + AF_INET && + ia->ia_addr.sin_addr.s_addr == + ifatoia(ifa)->ia_addr.sin_addr.s_addr) { + if (count == index) { + if (vh->sc_state == + MASTER) { + *enaddr = IF_LLADDR(vh->sc_ifp); + IF_ADDR_UNLOCK(SC2IFP(vh)); + CARP_UNLOCK(cif); + return (1); + } else { + IF_ADDR_UNLOCK(SC2IFP(vh)); + CARP_UNLOCK(cif); + return (0); + } + } + count++; + } + } + IF_ADDR_UNLOCK(SC2IFP(vh)); + } + } + } else { + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + if ((SC2IFP(vh)->if_flags & IFF_UP) && + (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && + ia->ia_ifp == SC2IFP(vh) && + vh->sc_state == MASTER) { + *enaddr = IF_LLADDR(vh->sc_ifp); + CARP_UNLOCK(cif); + return (1); + } + } + } + CARP_UNLOCK(cif); + return (0); +} + +#ifdef INET6 +struct ifaddr * +carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr) +{ + struct carp_if *cif; + struct carp_softc *vh; + struct ifaddr *ifa; + + cif = ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) { + IF_ADDR_LOCK(SC2IFP(vh)); + TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) { + if (IN6_ARE_ADDR_EQUAL(taddr, + &ifatoia6(ifa)->ia_addr.sin6_addr) && + (SC2IFP(vh)->if_flags & IFF_UP) && + (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && + vh->sc_state == MASTER) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(SC2IFP(vh)); + CARP_UNLOCK(cif); + return (ifa); + } + } + IF_ADDR_UNLOCK(SC2IFP(vh)); + } + CARP_UNLOCK(cif); + + return (NULL); +} + +caddr_t +carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr) +{ + struct m_tag *mtag; + struct carp_if *cif; + struct carp_softc *sc; + struct ifaddr *ifa; + + cif = ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) { + IF_ADDR_LOCK(SC2IFP(sc)); + TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) { + if (IN6_ARE_ADDR_EQUAL(taddr, + &ifatoia6(ifa)->ia_addr.sin6_addr) && + (SC2IFP(sc)->if_flags & IFF_UP) && + (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) { + struct ifnet *ifp = SC2IFP(sc); + mtag = m_tag_get(PACKET_TAG_CARP, + sizeof(struct ifnet *), M_NOWAIT); + if (mtag == NULL) { + /* better a bit than nothing */ + IF_ADDR_UNLOCK(SC2IFP(sc)); + CARP_UNLOCK(cif); + return (IF_LLADDR(sc->sc_ifp)); + } + bcopy(&ifp, (caddr_t)(mtag + 1), + sizeof(struct ifnet *)); + m_tag_prepend(m, mtag); + + IF_ADDR_UNLOCK(SC2IFP(sc)); + CARP_UNLOCK(cif); + return (IF_LLADDR(sc->sc_ifp)); + } + } + IF_ADDR_UNLOCK(SC2IFP(sc)); + } + CARP_UNLOCK(cif); + + return (NULL); +} +#endif + +struct ifnet * +carp_forus(struct ifnet *ifp, u_char *dhost) +{ + struct carp_if *cif; + struct carp_softc *vh; + u_int8_t *ena = dhost; + + if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1) + return (NULL); + + cif = ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) + if ((SC2IFP(vh)->if_flags & IFF_UP) && + (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) && + vh->sc_state == MASTER && + !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) { + CARP_UNLOCK(cif); + return (SC2IFP(vh)); + } + + CARP_UNLOCK(cif); + return (NULL); +} + +static void 
+carp_master_down(void *v) +{ + struct carp_softc *sc = v; + + CARP_SCLOCK(sc); + carp_master_down_locked(sc); + CARP_SCUNLOCK(sc); +} + +static void +carp_master_down_locked(struct carp_softc *sc) +{ + if (sc->sc_carpdev) + CARP_SCLOCK_ASSERT(sc); + + switch (sc->sc_state) { + case INIT: + printf("%s: master_down event in INIT state\n", + SC2IFP(sc)->if_xname); + break; + case MASTER: + break; + case BACKUP: + carp_set_state(sc, MASTER); + carp_send_ad_locked(sc); + carp_send_arp(sc); +#ifdef INET6 + carp_send_na(sc); +#endif /* INET6 */ + carp_setrun(sc, 0); + carp_setroute(sc, RTM_ADD); + break; + } +} + +/* + * When in backup state, af indicates whether to reset the master down timer + * for v4 or v6. If it's set to zero, reset the ones which are already pending. + */ +static void +carp_setrun(struct carp_softc *sc, sa_family_t af) +{ + struct timeval tv; + + if (sc->sc_carpdev == NULL) { + SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; + carp_set_state(sc, INIT); + return; + } else + CARP_SCLOCK_ASSERT(sc); + + if (SC2IFP(sc)->if_flags & IFF_UP && + sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6) && + sc->sc_carpdev->if_link_state == LINK_STATE_UP) + SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING; + else { + SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; + carp_setroute(sc, RTM_DELETE); + return; + } + + switch (sc->sc_state) { + case INIT: + if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) { + carp_send_ad_locked(sc); + carp_send_arp(sc); +#ifdef INET6 + carp_send_na(sc); +#endif /* INET6 */ + CARP_LOG("%s: INIT -> MASTER (preempting)\n", + SC2IFP(sc)->if_xname); + carp_set_state(sc, MASTER); + carp_setroute(sc, RTM_ADD); + } else { + CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname); + carp_set_state(sc, BACKUP); + carp_setroute(sc, RTM_DELETE); + carp_setrun(sc, 0); + } + break; + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + tv.tv_sec = 3 * sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + switch (af) { +#ifdef INET + case AF_INET: + callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + carp_master_down, sc); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + carp_master_down, sc); + break; +#endif /* INET6 */ + default: + if (sc->sc_naddrs) + callout_reset(&sc->sc_md_tmo, tvtohz(&tv), + carp_master_down, sc); + if (sc->sc_naddrs6) + callout_reset(&sc->sc_md6_tmo, tvtohz(&tv), + carp_master_down, sc); + break; + } + break; + case MASTER: + tv.tv_sec = sc->sc_advbase; + tv.tv_usec = sc->sc_advskew * 1000000 / 256; + callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), + carp_send_ad, sc); + break; + } +} + +static void +carp_multicast_cleanup(struct carp_softc *sc) +{ + struct ip_moptions *imo = &sc->sc_imo; + u_int16_t n = imo->imo_num_memberships; + + /* Clean up our own multicast memberships */ + while (n-- > 0) { + if (imo->imo_membership[n] != NULL) { + in_delmulti(imo->imo_membership[n]); + imo->imo_membership[n] = NULL; + } + } + KASSERT(imo->imo_mfilters == NULL, + ("%s: imo_mfilters != NULL", __func__)); + imo->imo_num_memberships = 0; + imo->imo_multicast_ifp = NULL; +} + +#ifdef INET6 +static void +carp_multicast6_cleanup(struct carp_softc *sc) +{ + struct ip6_moptions *im6o = &sc->sc_im6o; + u_int16_t n = im6o->im6o_num_memberships; + + while (n-- > 0) { + if (im6o->im6o_membership[n] != NULL) { + in6_mc_leave(im6o->im6o_membership[n], NULL); + im6o->im6o_membership[n] = NULL; + } + } + KASSERT(im6o->im6o_mfilters == NULL, + ("%s: im6o_mfilters != NULL", __func__)); + im6o->im6o_num_memberships = 0; + 
im6o->im6o_multicast_ifp = NULL; +} +#endif + +static int +carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin) +{ + struct ifnet *ifp; + struct carp_if *cif; + struct in_ifaddr *ia, *ia_if; + struct ip_moptions *imo = &sc->sc_imo; + struct in_addr addr; + u_long iaddr = htonl(sin->sin_addr.s_addr); + int own, error; + + if (sin->sin_addr.s_addr == 0) { + if (!(SC2IFP(sc)->if_flags & IFF_UP)) + carp_set_state(sc, INIT); + if (sc->sc_naddrs) + SC2IFP(sc)->if_flags |= IFF_UP; + if (sc->sc_carpdev) + CARP_SCLOCK(sc); + carp_setrun(sc, 0); + if (sc->sc_carpdev) + CARP_SCUNLOCK(sc); + return (0); + } + + /* we have to do it by hands to check we won't match on us */ + ia_if = NULL; own = 0; + IN_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + /* and, yeah, we need a multicast-capable iface too */ + if (ia->ia_ifp != SC2IFP(sc) && + (ia->ia_ifp->if_flags & IFF_MULTICAST) && + (iaddr & ia->ia_subnetmask) == ia->ia_subnet) { + if (!ia_if) + ia_if = ia; + if (sin->sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) + own++; + } + } + + if (!ia_if) { + IN_IFADDR_RUNLOCK(); + return (EADDRNOTAVAIL); + } + + ia = ia_if; + ifa_ref(&ia->ia_ifa); + IN_IFADDR_RUNLOCK(); + + ifp = ia->ia_ifp; + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || + (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) { + ifa_free(&ia->ia_ifa); + return (EADDRNOTAVAIL); + } + + if (imo->imo_num_memberships == 0) { + addr.s_addr = htonl(INADDR_CARP_GROUP); + if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) == + NULL) { + ifa_free(&ia->ia_ifa); + return (ENOBUFS); + } + imo->imo_num_memberships++; + imo->imo_multicast_ifp = ifp; + imo->imo_multicast_ttl = CARP_DFLTTL; + imo->imo_multicast_loop = 0; + } + + if (!ifp->if_carp) { + + cif = malloc(sizeof(*cif), M_CARP, + M_WAITOK|M_ZERO); + if (!cif) { + error = ENOBUFS; + goto cleanup; + } + if ((error = ifpromisc(ifp, 1))) { + free(cif, M_CARP); + goto cleanup; + } + + CARP_LOCK_INIT(cif); + CARP_LOCK(cif); + cif->vhif_ifp = ifp; + TAILQ_INIT(&cif->vhif_vrs); + ifp->if_carp = cif; + + } else { + struct carp_softc *vr; + + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && vr->sc_vhid == sc->sc_vhid) { + CARP_UNLOCK(cif); + error = EEXIST; + goto cleanup; + } + } + sc->sc_ia = ia; + sc->sc_carpdev = ifp; + + { /* XXX prevent endless loop if already in queue */ + struct carp_softc *vr, *after = NULL; + int myself = 0; + cif = (struct carp_if *)ifp->if_carp; + + /* XXX: cif should not change, right? So we still hold the lock */ + CARP_LOCK_ASSERT(cif); + + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { + if (vr == sc) + myself = 1; + if (vr->sc_vhid < sc->sc_vhid) + after = vr; + } + + if (!myself) { + /* We're trying to keep things in order */ + if (after == NULL) { + TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); + } else { + TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); + } + cif->vhif_nvrs++; + } + } + + sc->sc_naddrs++; + SC2IFP(sc)->if_flags |= IFF_UP; + if (own) + sc->sc_advskew = 0; + carp_sc_state_locked(sc); + carp_setrun(sc, 0); + + CARP_UNLOCK(cif); + ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. 
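+				 * (sc->sc_ia6 keeps pointing at this
+				 * in6_ifaddr once the reference is
+				 * dropped, so the softc depends on the
+				 * address not being freed beneath it)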
*/ + + return (0); + +cleanup: + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + ifa_free(&ia->ia_ifa); + return (error); +} + +static int +carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin) +{ + int error = 0; + + if (!--sc->sc_naddrs) { + struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; + struct ip_moptions *imo = &sc->sc_imo; + + CARP_LOCK(cif); + callout_stop(&sc->sc_ad_tmo); + SC2IFP(sc)->if_flags &= ~IFF_UP; + SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; + sc->sc_vhid = -1; + in_delmulti(imo->imo_membership[--imo->imo_num_memberships]); + imo->imo_multicast_ifp = NULL; + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + sc->sc_carpdev->if_carp = NULL; + CARP_LOCK_DESTROY(cif); + free(cif, M_CARP); + } else { + CARP_UNLOCK(cif); + } + } + + return (error); +} + +#ifdef INET6 +static int +carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +{ + struct ifnet *ifp; + struct carp_if *cif; + struct in6_ifaddr *ia, *ia_if; + struct ip6_moptions *im6o = &sc->sc_im6o; + struct in6_addr in6; + int own, error; + + error = 0; + + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + if (!(SC2IFP(sc)->if_flags & IFF_UP)) + carp_set_state(sc, INIT); + if (sc->sc_naddrs6) + SC2IFP(sc)->if_flags |= IFF_UP; + if (sc->sc_carpdev) + CARP_SCLOCK(sc); + carp_setrun(sc, 0); + if (sc->sc_carpdev) + CARP_SCUNLOCK(sc); + return (0); + } + + /* we have to do it by hands to check we won't match on us */ + ia_if = NULL; own = 0; + IN6_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { + int i; + + for (i = 0; i < 4; i++) { + if ((sin6->sin6_addr.s6_addr32[i] & + ia->ia_prefixmask.sin6_addr.s6_addr32[i]) != + (ia->ia_addr.sin6_addr.s6_addr32[i] & + ia->ia_prefixmask.sin6_addr.s6_addr32[i])) + break; + } + /* and, yeah, we need a multicast-capable iface too */ + if (ia->ia_ifp != SC2IFP(sc) && + (ia->ia_ifp->if_flags & IFF_MULTICAST) && + (i == 4)) { + if (!ia_if) + ia_if = ia; + if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &ia->ia_addr.sin6_addr)) + own++; + } + } + + if (!ia_if) { + IN6_IFADDR_RUNLOCK(); + return (EADDRNOTAVAIL); + } + ia = ia_if; + ifa_ref(&ia->ia_ifa); + IN6_IFADDR_RUNLOCK(); + ifp = ia->ia_ifp; + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 || + (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) { + ifa_free(&ia->ia_ifa); + return (EADDRNOTAVAIL); + } + + if (!sc->sc_naddrs6) { + struct in6_multi *in6m; + + im6o->im6o_multicast_ifp = ifp; + + /* join CARP multicast address */ + bzero(&in6, sizeof(in6)); + in6.s6_addr16[0] = htons(0xff02); + in6.s6_addr8[15] = 0x12; + if (in6_setscope(&in6, ifp, NULL) != 0) + goto cleanup; + in6m = NULL; + error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); + if (error) + goto cleanup; + im6o->im6o_membership[0] = in6m; + im6o->im6o_num_memberships++; + + /* join solicited multicast address */ + bzero(&in6, sizeof(in6)); + in6.s6_addr16[0] = htons(0xff02); + in6.s6_addr32[1] = 0; + in6.s6_addr32[2] = htonl(1); + in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3]; + in6.s6_addr8[12] = 0xff; + if (in6_setscope(&in6, ifp, NULL) != 0) + goto cleanup; + in6m = NULL; + error = in6_mc_join(ifp, &in6, NULL, &in6m, 0); + if (error) + goto cleanup; + im6o->im6o_membership[1] = in6m; + im6o->im6o_num_memberships++; + } + + if (!ifp->if_carp) { + cif = malloc(sizeof(*cif), M_CARP, + M_WAITOK|M_ZERO); + if (!cif) { + error = ENOBUFS; + goto cleanup; + } + if ((error = ifpromisc(ifp, 1))) { + free(cif, M_CARP); + goto cleanup; + } + + CARP_LOCK_INIT(cif); + 
CARP_LOCK(cif); + cif->vhif_ifp = ifp; + TAILQ_INIT(&cif->vhif_vrs); + ifp->if_carp = cif; + + } else { + struct carp_softc *vr; + + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK(cif); + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && vr->sc_vhid == sc->sc_vhid) { + CARP_UNLOCK(cif); + error = EINVAL; + goto cleanup; + } + } + sc->sc_ia6 = ia; + sc->sc_carpdev = ifp; + + { /* XXX prevent endless loop if already in queue */ + struct carp_softc *vr, *after = NULL; + int myself = 0; + cif = (struct carp_if *)ifp->if_carp; + CARP_LOCK_ASSERT(cif); + + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) { + if (vr == sc) + myself = 1; + if (vr->sc_vhid < sc->sc_vhid) + after = vr; + } + + if (!myself) { + /* We're trying to keep things in order */ + if (after == NULL) { + TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list); + } else { + TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list); + } + cif->vhif_nvrs++; + } + } + + sc->sc_naddrs6++; + SC2IFP(sc)->if_flags |= IFF_UP; + if (own) + sc->sc_advskew = 0; + carp_sc_state_locked(sc); + carp_setrun(sc, 0); + + CARP_UNLOCK(cif); + ifa_free(&ia->ia_ifa); /* XXXRW: should hold reference for softc. */ + + return (0); + +cleanup: + if (!sc->sc_naddrs6) + carp_multicast6_cleanup(sc); + ifa_free(&ia->ia_ifa); + return (error); +} + +static int +carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6) +{ + int error = 0; + + if (!--sc->sc_naddrs6) { + struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp; + + CARP_LOCK(cif); + callout_stop(&sc->sc_ad_tmo); + SC2IFP(sc)->if_flags &= ~IFF_UP; + SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; + sc->sc_vhid = -1; + carp_multicast6_cleanup(sc); + TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list); + if (!--cif->vhif_nvrs) { + CARP_LOCK_DESTROY(cif); + sc->sc_carpdev->if_carp = NULL; + free(cif, M_CARP); + } else + CARP_UNLOCK(cif); + } + + return (error); +} +#endif /* INET6 */ + +static int +carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + struct carp_softc *sc = ifp->if_softc, *vr; + struct carpreq carpr; + struct ifaddr *ifa; + struct ifreq *ifr; + struct ifaliasreq *ifra; + int locked = 0, error = 0; + + ifa = (struct ifaddr *)addr; + ifra = (struct ifaliasreq *)addr; + ifr = (struct ifreq *)addr; + + switch (cmd) { + case SIOCSIFADDR: + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + SC2IFP(sc)->if_flags |= IFF_UP; + bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, + sizeof(struct sockaddr)); + error = carp_set_addr(sc, satosin(ifa->ifa_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + SC2IFP(sc)->if_flags |= IFF_UP; + error = carp_set_addr6(sc, satosin6(ifa->ifa_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCAIFADDR: + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + SC2IFP(sc)->if_flags |= IFF_UP; + bcopy(ifa->ifa_addr, ifa->ifa_dstaddr, + sizeof(struct sockaddr)); + error = carp_set_addr(sc, satosin(&ifra->ifra_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + SC2IFP(sc)->if_flags |= IFF_UP; + error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; + break; + } + break; + + case SIOCDIFADDR: + switch (ifa->ifa_addr->sa_family) { +#ifdef INET + case AF_INET: + error = carp_del_addr(sc, satosin(&ifra->ifra_addr)); + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr)); + break; +#endif /* INET6 */ + default: + error = EAFNOSUPPORT; 
+ break; + } + break; + + case SIOCSIFFLAGS: + if (sc->sc_carpdev) { + locked = 1; + CARP_SCLOCK(sc); + } + if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) { + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + if (sc->sc_state == MASTER) + carp_send_ad_locked(sc); + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) { + SC2IFP(sc)->if_flags |= IFF_UP; + carp_setrun(sc, 0); + } + break; + + case SIOCSVH: + error = priv_check(curthread, PRIV_NETINET_CARP); + if (error) + break; + if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr))) + break; + error = 1; + if (sc->sc_carpdev) { + locked = 1; + CARP_SCLOCK(sc); + } + if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) { + switch (carpr.carpr_state) { + case BACKUP: + callout_stop(&sc->sc_ad_tmo); + carp_set_state(sc, BACKUP); + carp_setrun(sc, 0); + carp_setroute(sc, RTM_DELETE); + break; + case MASTER: + carp_master_down_locked(sc); + break; + default: + break; + } + } + if (carpr.carpr_vhid > 0) { + if (carpr.carpr_vhid > 255) { + error = EINVAL; + break; + } + if (sc->sc_carpdev) { + struct carp_if *cif; + cif = (struct carp_if *)sc->sc_carpdev->if_carp; + TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) + if (vr != sc && + vr->sc_vhid == carpr.carpr_vhid) { + error = EEXIST; + break; + } + if (error == EEXIST) + break; + } + sc->sc_vhid = carpr.carpr_vhid; + IF_LLADDR(sc->sc_ifp)[0] = 0; + IF_LLADDR(sc->sc_ifp)[1] = 0; + IF_LLADDR(sc->sc_ifp)[2] = 0x5e; + IF_LLADDR(sc->sc_ifp)[3] = 0; + IF_LLADDR(sc->sc_ifp)[4] = 1; + IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid; + error--; + } + if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) { + if (carpr.carpr_advskew >= 255) { + error = EINVAL; + break; + } + if (carpr.carpr_advbase > 255) { + error = EINVAL; + break; + } + sc->sc_advbase = carpr.carpr_advbase; + sc->sc_advskew = carpr.carpr_advskew; + error--; + } + bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key)); + if (error > 0) + error = EINVAL; + else { + error = 0; + carp_setrun(sc, 0); + } + break; + + case SIOCGVH: + /* XXX: lockless read */ + bzero(&carpr, sizeof(carpr)); + carpr.carpr_state = sc->sc_state; + carpr.carpr_vhid = sc->sc_vhid; + carpr.carpr_advbase = sc->sc_advbase; + carpr.carpr_advskew = sc->sc_advskew; + error = priv_check(curthread, PRIV_NETINET_CARP); + if (error == 0) + bcopy(sc->sc_key, carpr.carpr_key, + sizeof(carpr.carpr_key)); + error = copyout(&carpr, ifr->ifr_data, sizeof(carpr)); + break; + + default: + error = EINVAL; + } + + if (locked) + CARP_SCUNLOCK(sc); + + carp_hmac_prepare(sc); + + return (error); +} + +/* + * XXX: this is looutput. We should eventually use it from there. + */ +static int +carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, + struct route *ro) +{ + u_int32_t af; + struct rtentry *rt = NULL; + + M_ASSERTPKTHDR(m); /* check if we have the packet header */ + + if (ro != NULL) + rt = ro->ro_rt; + if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + m_freem(m); + return (rt->rt_flags & RTF_BLACKHOLE ? 0 : + rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + } + + ifp->if_opackets++; + ifp->if_obytes += m->m_pkthdr.len; + + /* BPF writes need to be handled specially. 
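
The SIOCSVH handler above reads a struct carpreq through ifr_data (and requires PRIV_NETINET_CARP). A minimal userland sketch of driving it, hypothetical tool code rather than part of this change, assuming a carp0 interface already exists:

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <netinet/ip_carp.h>
    #include <string.h>

    /* Set vhid 5, advbase 1 s, advskew 100 on carp0. */
    static int
    carp_configure(int s)
    {
        struct carpreq carpr;
        struct ifreq ifr;

        memset(&carpr, 0, sizeof(carpr));
        carpr.carpr_vhid = 5;      /* must be 1..255, see the checks above */
        carpr.carpr_advbase = 1;   /* seconds; must be <= 255 */
        carpr.carpr_advskew = 100; /* must be < 255 */
        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, "carp0", sizeof(ifr.ifr_name));
        ifr.ifr_data = (caddr_t)&carpr;
        return (ioctl(s, SIOCSVH, &ifr));
    }
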
*/ + if (dst->sa_family == AF_UNSPEC) { + bcopy(dst->sa_data, &af, sizeof(af)); + dst->sa_family = af; + } + +#if 1 /* XXX */ + switch (dst->sa_family) { + case AF_INET: + case AF_INET6: + case AF_IPX: + case AF_APPLETALK: + break; + default: + printf("carp_looutput: af=%d unexpected\n", dst->sa_family); + m_freem(m); + return (EAFNOSUPPORT); + } +#endif + return(if_simloop(ifp, m, dst->sa_family, 0)); +} + +/* + * Start output on carp interface. This function should never be called. + */ +static void +carp_start(struct ifnet *ifp) +{ +#ifdef DEBUG + printf("%s: start called\n", ifp->if_xname); +#endif +} + +int +carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa, + struct rtentry *rt) +{ + struct m_tag *mtag; + struct carp_softc *sc; + struct ifnet *carp_ifp; + + if (!sa) + return (0); + + switch (sa->sa_family) { +#ifdef INET + case AF_INET: + break; +#endif /* INET */ +#ifdef INET6 + case AF_INET6: + break; +#endif /* INET6 */ + default: + return (0); + } + + mtag = m_tag_find(m, PACKET_TAG_CARP, NULL); + if (mtag == NULL) + return (0); + + bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *)); + sc = carp_ifp->if_softc; + + /* Set the source MAC address to Virtual Router MAC Address */ + switch (ifp->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: { + struct ether_header *eh; + + eh = mtod(m, struct ether_header *); + eh->ether_shost[0] = 0; + eh->ether_shost[1] = 0; + eh->ether_shost[2] = 0x5e; + eh->ether_shost[3] = 0; + eh->ether_shost[4] = 1; + eh->ether_shost[5] = sc->sc_vhid; + } + break; + case IFT_FDDI: { + struct fddi_header *fh; + + fh = mtod(m, struct fddi_header *); + fh->fddi_shost[0] = 0; + fh->fddi_shost[1] = 0; + fh->fddi_shost[2] = 0x5e; + fh->fddi_shost[3] = 0; + fh->fddi_shost[4] = 1; + fh->fddi_shost[5] = sc->sc_vhid; + } + break; + case IFT_ISO88025: { + struct iso88025_header *th; + th = mtod(m, struct iso88025_header *); + th->iso88025_shost[0] = 3; + th->iso88025_shost[1] = 0; + th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1); + th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1); + th->iso88025_shost[4] = 0; + th->iso88025_shost[5] = 0; + } + break; + default: + printf("%s: carp is not supported for this interface type\n", + ifp->if_xname); + return (EOPNOTSUPP); + } + + return (0); +} + +static void +carp_set_state(struct carp_softc *sc, int state) +{ + int link_state; + + if (sc->sc_carpdev) + CARP_SCLOCK_ASSERT(sc); + + if (sc->sc_state == state) + return; + + sc->sc_state = state; + switch (state) { + case BACKUP: + link_state = LINK_STATE_DOWN; + break; + case MASTER: + link_state = LINK_STATE_UP; + break; + default: + link_state = LINK_STATE_UNKNOWN; + break; + } + if_link_state_change(SC2IFP(sc), link_state); +} + +void +carp_carpdev_state(struct ifnet *ifp) +{ + struct carp_if *cif; + + cif = ifp->if_carp; + CARP_LOCK(cif); + carp_carpdev_state_locked(cif); + CARP_UNLOCK(cif); +} + +static void +carp_carpdev_state_locked(struct carp_if *cif) +{ + struct carp_softc *sc; + + TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) + carp_sc_state_locked(sc); +} + +static void +carp_sc_state_locked(struct carp_softc *sc) +{ + CARP_SCLOCK_ASSERT(sc); + + if (sc->sc_carpdev->if_link_state != LINK_STATE_UP || + !(sc->sc_carpdev->if_flags & IFF_UP)) { + sc->sc_flags_backup = SC2IFP(sc)->if_flags; + SC2IFP(sc)->if_flags &= ~IFF_UP; + SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING; + callout_stop(&sc->sc_ad_tmo); + callout_stop(&sc->sc_md_tmo); + callout_stop(&sc->sc_md6_tmo); + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + if (!sc->sc_suppress) { + 
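
For Ethernet and VLAN interfaces, carp_output() above rewrites the source address to the IANA-assigned virtual router MAC, 00:00:5e:00:01:&lt;vhid&gt;. The derivation in isolation, as an illustrative sketch:

    #include <stdint.h>

    /* CARP virtual MAC for the Ethernet/VLAN case rewritten above. */
    static void
    carp_vmac(uint8_t vhid, uint8_t mac[6])
    {
        mac[0] = 0x00; mac[1] = 0x00; mac[2] = 0x5e;
        mac[3] = 0x00; mac[4] = 0x01; mac[5] = vhid;
        /* vhid 5 yields 00:00:5e:00:01:05 */
    }
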
carp_suppress_preempt++; + if (carp_suppress_preempt == 1) { + CARP_SCUNLOCK(sc); + carp_send_ad_all(); + CARP_SCLOCK(sc); + } + } + sc->sc_suppress = 1; + } else { + SC2IFP(sc)->if_flags |= sc->sc_flags_backup; + carp_set_state(sc, INIT); + carp_setrun(sc, 0); + if (sc->sc_suppress) + carp_suppress_preempt--; + sc->sc_suppress = 0; + } + + return; +} + +#ifdef INET +extern struct domain inetdomain; +static struct protosw in_carp_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_CARP, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = carp_input, + .pr_output = (pr_output_t *)rip_output, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +#endif + +#ifdef INET6 +extern struct domain inet6domain; +static struct ip6protosw in6_carp_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inet6domain, + .pr_protocol = IPPROTO_CARP, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = carp6_input, + .pr_output = rip6_output, + .pr_ctloutput = rip6_ctloutput, + .pr_usrreqs = &rip6_usrreqs +}; +#endif + +static void +carp_mod_cleanup(void) +{ + + if (if_detach_event_tag == NULL) + return; + EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); + if_clone_detach(&carp_cloner); +#ifdef INET + if (proto_reg[CARP_INET] == 0) { + (void)ipproto_unregister(IPPROTO_CARP); + pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW); + proto_reg[CARP_INET] = -1; + } + carp_iamatch_p = NULL; +#endif +#ifdef INET6 + if (proto_reg[CARP_INET6] == 0) { + (void)ip6proto_unregister(IPPROTO_CARP); + pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW); + proto_reg[CARP_INET6] = -1; + } + carp_iamatch6_p = NULL; + carp_macmatch6_p = NULL; +#endif + carp_linkstate_p = NULL; + carp_forus_p = NULL; + carp_output_p = NULL; + mtx_destroy(&carp_mtx); +} + +static int +carp_mod_load(void) +{ + int err; + + if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, + carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY); + if (if_detach_event_tag == NULL) + return (ENOMEM); + mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF); + LIST_INIT(&carpif_list); + if_clone_attach(&carp_cloner); + carp_linkstate_p = carp_carpdev_state; + carp_forus_p = carp_forus; + carp_output_p = carp_output; +#ifdef INET6 + carp_iamatch6_p = carp_iamatch6; + carp_macmatch6_p = carp_macmatch6; + proto_reg[CARP_INET6] = pf_proto_register(PF_INET6, + (struct protosw *)&in6_carp_protosw); + if (proto_reg[CARP_INET6] != 0) { + printf("carp: error %d attaching to PF_INET6\n", + proto_reg[CARP_INET6]); + carp_mod_cleanup(); + return (EINVAL); + } + err = ip6proto_register(IPPROTO_CARP); + if (err) { + printf("carp: error %d registering with INET6\n", err); + carp_mod_cleanup(); + return (EINVAL); + } +#endif +#ifdef INET + carp_iamatch_p = carp_iamatch; + proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw); + if (proto_reg[CARP_INET] != 0) { + printf("carp: error %d attaching to PF_INET\n", + proto_reg[CARP_INET]); + carp_mod_cleanup(); + return (EINVAL); + } + err = ipproto_register(IPPROTO_CARP); + if (err) { + printf("carp: error %d registering with INET\n", err); + carp_mod_cleanup(); + return (EINVAL); + } +#endif + return 0; +} + +static int +carp_modevent(module_t mod, int type, void *data) +{ + switch (type) { + case MOD_LOAD: + return carp_mod_load(); + /* NOTREACHED */ + case MOD_UNLOAD: + /* + * XXX: For now, disallow module unloading by default due to + * a race condition where a thread may dereference one of the + * function pointer hooks after the module has been + * unloaded, 
during processing of a packet, causing a panic. + */ +#ifdef CARPMOD_CAN_UNLOAD + carp_mod_cleanup(); +#else + return (EBUSY); +#endif + break; + + default: + return (EINVAL); + } + + return (0); +} + +static moduledata_t carp_mod = { + "carp", + carp_modevent, + 0 +}; + +DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); diff --git a/freebsd/sys/netinet/ip_carp.h b/freebsd/sys/netinet/ip_carp.h new file mode 100644 index 00000000..2f2b4f28 --- /dev/null +++ b/freebsd/sys/netinet/ip_carp.h @@ -0,0 +1,191 @@ +/* $FreeBSD$ */ +/* $OpenBSD: ip_carp.h,v 1.8 2004/07/29 22:12:15 mcbride Exp $ */ + +/* + * Copyright (c) 2002 Michael Shalayeff. All rights reserved. + * Copyright (c) 2003 Ryan McBride. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _IP_CARP_H +#define _IP_CARP_H + +/* + * The CARP header layout is as follows: + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |Version| Type | VirtualHostID | AdvSkew | Auth Len | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Reserved | AdvBase | Checksum | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Counter (1) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Counter (2) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (1) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (2) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (3) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (4) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SHA-1 HMAC (5) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ + +struct carp_header { +#if BYTE_ORDER == LITTLE_ENDIAN + u_int8_t carp_type:4, + carp_version:4; +#endif +#if BYTE_ORDER == BIG_ENDIAN + u_int8_t carp_version:4, + carp_type:4; +#endif + u_int8_t carp_vhid; /* virtual host id */ + u_int8_t carp_advskew; /* advertisement skew */ + u_int8_t carp_authlen; /* size of counter+md, 32bit chunks */ + u_int8_t carp_pad1; /* reserved */ + u_int8_t carp_advbase; /* advertisement interval */ + u_int16_t carp_cksum; + u_int32_t carp_counter[2]; + unsigned char carp_md[20]; /* SHA1 HMAC */ +} __packed; + +#ifdef CTASSERT +CTASSERT(sizeof(struct carp_header) == 36); +#endif + +#define CARP_DFLTTL 255 + +/* carp_version */ +#define CARP_VERSION 2 + +/* carp_type */ +#define CARP_ADVERTISEMENT 0x01 + +#define CARP_KEY_LEN 20 /* a sha1 hash of a passphrase */ + +/* carp_advbase */ +#define CARP_DFLTINTV 1 + +/* + * Statistics. 
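
Two derived numbers follow from the fields above: carp_authlen counts the counter plus the HMAC in 32-bit words, so (8 + 20) / 4 = 7; and the advertisement period is carp_advbase whole seconds plus carp_advskew scaled by 1/256 of a second, so a backup with a larger skew advertises later and defers to the master. A sketch of that timing arithmetic (the 1/256 scaling is the conventional CARP interpretation of these fields, not spelled out in this header):

    #include <sys/time.h>

    static void
    carp_ad_interval(int advbase, int advskew, struct timeval *tv)
    {
        tv->tv_sec = advbase;
        tv->tv_usec = advskew * 1000000 / 256;
        /* advbase=1, advskew=100 gives one ad every 1.390625 s */
    }
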
+ */ +struct carpstats { + uint64_t carps_ipackets; /* total input packets, IPv4 */ + uint64_t carps_ipackets6; /* total input packets, IPv6 */ + uint64_t carps_badif; /* wrong interface */ + uint64_t carps_badttl; /* TTL is not CARP_DFLTTL */ + uint64_t carps_hdrops; /* packets shorter than hdr */ + uint64_t carps_badsum; /* bad checksum */ + uint64_t carps_badver; /* bad (incl unsupp) version */ + uint64_t carps_badlen; /* data length does not match */ + uint64_t carps_badauth; /* bad authentication */ + uint64_t carps_badvhid; /* bad VHID */ + uint64_t carps_badaddrs; /* bad address list */ + + uint64_t carps_opackets; /* total output packets, IPv4 */ + uint64_t carps_opackets6; /* total output packets, IPv6 */ + uint64_t carps_onomem; /* no memory for an mbuf */ + uint64_t carps_ostates; /* total state updates sent */ + + uint64_t carps_preempt; /* if enabled, preemptions */ +}; + +#ifdef _KERNEL +#define CARPSTATS_ADD(name, val) carpstats.name += (val) +#define CARPSTATS_INC(name) CARPSTATS_ADD(name, 1) +#endif + +/* + * Configuration structure for SIOCSVH SIOCGVH + */ +struct carpreq { + int carpr_state; +#define CARP_STATES "INIT", "BACKUP", "MASTER" +#define CARP_MAXSTATE 2 + int carpr_vhid; + int carpr_advskew; + int carpr_advbase; + unsigned char carpr_key[CARP_KEY_LEN]; +}; +#define SIOCSVH _IOWR('i', 245, struct ifreq) +#define SIOCGVH _IOWR('i', 246, struct ifreq) + +/* + * Names for CARP sysctl objects + */ +#define CARPCTL_ALLOW 1 /* accept incoming CARP packets */ +#define CARPCTL_PREEMPT 2 /* high-pri backup preemption mode */ +#define CARPCTL_LOG 3 /* log bad packets */ +#define CARPCTL_STATS 4 /* statistics (read-only) */ +#define CARPCTL_ARPBALANCE 5 /* balance arp responses */ +#define CARPCTL_MAXID 6 + +#define CARPCTL_NAMES { \ + { 0, 0 }, \ + { "allow", CTLTYPE_INT }, \ + { "preempt", CTLTYPE_INT }, \ + { "log", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "arpbalance", CTLTYPE_INT }, \ +} + +#ifdef _KERNEL +void carp_carpdev_state(struct ifnet *); +void carp_input (struct mbuf *, int); +int carp6_input (struct mbuf **, int *, int); +int carp_output (struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +int carp_iamatch (struct ifnet *, struct in_ifaddr *, struct in_addr *, + u_int8_t **); +struct ifaddr *carp_iamatch6(struct ifnet *, struct in6_addr *); +caddr_t carp_macmatch6(struct ifnet *, struct mbuf *, const struct in6_addr *); +struct ifnet *carp_forus (struct ifnet *, u_char *); + +/* These are external networking stack hooks for CARP */ +/* net/if.c */ +extern void (*carp_linkstate_p)(struct ifnet *); +/* net/if_bridge.c net/if_ethersubr.c */ +extern struct ifnet *(*carp_forus_p)(struct ifnet *, u_char *); +/* net/if_ethersubr.c */ +extern int (*carp_output_p)(struct ifnet *, struct mbuf *, + struct sockaddr *, struct rtentry *); +#ifdef INET +/* netinet/if_ether.c */ +extern int (*carp_iamatch_p)(struct ifnet *, struct in_ifaddr *, + struct in_addr *, u_int8_t **); +#endif +#ifdef INET6 +/* netinet6/nd6_nbr.c */ +extern struct ifaddr *(*carp_iamatch6_p)(struct ifnet *, struct in6_addr *); +extern caddr_t (*carp_macmatch6_p)(struct ifnet *, struct mbuf *, + const struct in6_addr *); +#endif +#endif +#endif /* _IP_CARP_H */ diff --git a/freebsd/sys/netinet/ip_divert.c b/freebsd/sys/netinet/ip_divert.c new file mode 100644 index 00000000..13999825 --- /dev/null +++ b/freebsd/sys/netinet/ip_divert.c @@ -0,0 +1,818 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. 
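
struct carpreq also travels the other way: SIOCGVH fills it in, copying the key out only when the caller holds PRIV_NETINET_CARP. A hypothetical read-back, reusing the headers from the SIOCSVH sketch earlier:

    static int
    carp_get_state(int s, const char *ifname, struct carpreq *carpr)
    {
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        ifr.ifr_data = (caddr_t)carpr;
        /* on success carpr->carpr_state indexes CARP_STATES */
        return (ioctl(s, SIOCGVH, &ifr));
    }
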
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#if !defined(KLD_MODULE) +#include +#include +#ifndef INET +#error "IPDIVERT requires INET." +#endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef SCTP +#include +#endif + +#include + +/* + * Divert sockets + */ + +/* + * Allocate enough space to hold a full IP packet + */ +#define DIVSNDQ (65536 + 100) +#define DIVRCVQ (65536 + 100) + +/* + * Divert sockets work in conjunction with ipfw or other packet filters, + * see the divert(4) manpage for features. + * Packets are selected by the packet filter and tagged with an + * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by + * the packet filter) and information on the matching filter rule for + * subsequent reinjection. The divert_port is used to put the packet + * on the corresponding divert socket, while the rule number is passed + * up (at least partially) as the sin_port in the struct sockaddr. + * + * Packets written to the divert socket carry in sin_addr a + * destination address, and in sin_port the number of the filter rule + * after which to continue processing. + * If the destination address is INADDR_ANY, the packet is treated + * as outgoing and sent to ip_output(); otherwise it is treated as + * incoming and sent to ip_input(). + * Further, sin_zero carries some information on the interface, + * which can be used in the reinject -- see comments in the code. + * + * On reinjection, processing in ip_input() and ip_output() + * will be exactly the same as for the original packet, except that + * packet filter processing will start at the rule number after the one + * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 + * will apply the entire ruleset to the packet). + */ + +/* Internal variables.
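
The comment above is the whole userland contract; a divert daemon only needs to bind the port and echo packets back. A minimal sketch, assuming a rule such as 'ipfw add divert 8668 ip from any to any' is already installed and with error handling elided:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <unistd.h>

    static void
    divert_loop(void)
    {
        struct sockaddr_in sin;
        socklen_t sinlen;
        char pkt[65535];
        ssize_t n;
        int s;

        s = socket(PF_INET, SOCK_RAW, IPPROTO_DIVERT);
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(8668);          /* the divert port */
        sin.sin_addr.s_addr = INADDR_ANY;
        bind(s, (struct sockaddr *)&sin, sizeof(sin));
        for (;;) {
            sinlen = sizeof(sin);
            n = recvfrom(s, pkt, sizeof(pkt), 0,
                (struct sockaddr *)&sin, &sinlen);
            if (n < 0)
                break;
            /* inspect or rewrite pkt[0..n), then reinject with the
             * same sockaddr so processing resumes after the rule */
            sendto(s, pkt, n, 0, (struct sockaddr *)&sin, sinlen);
        }
        close(s);
    }
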
*/ +static VNET_DEFINE(struct inpcbhead, divcb); +static VNET_DEFINE(struct inpcbinfo, divcbinfo); + +#define V_divcb VNET(divcb) +#define V_divcbinfo VNET(divcbinfo) + +static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ +static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ + +static eventhandler_tag ip_divert_event_tag; + +/* + * Initialize divert connection block queue. + */ +static void +div_zone_change(void *tag) +{ + + uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); +} + +static int +div_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = mem; + + INP_LOCK_INIT(inp, "inp", "divinp"); + return (0); +} + +static void +div_inpcb_fini(void *mem, int size) +{ + struct inpcb *inp = mem; + + INP_LOCK_DESTROY(inp); +} + +static void +div_init(void) +{ + + INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); + LIST_INIT(&V_divcb); + V_divcbinfo.ipi_listhead = &V_divcb; +#ifdef VIMAGE + V_divcbinfo.ipi_vnet = curvnet; +#endif + /* + * XXX We don't use the hash list for divert IP, but it's easier + * to allocate a one entry hash list than it is to check all + * over the place for hashbase == NULL. + */ + V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); + V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, + &V_divcbinfo.ipi_porthashmask); + V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), + NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, + UMA_ZONE_NOFREE); + uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); +} + +static void +div_destroy(void) +{ + + INP_INFO_LOCK_DESTROY(&V_divcbinfo); + uma_zdestroy(V_divcbinfo.ipi_zone); + hashdestroy(V_divcbinfo.ipi_hashbase, M_PCB, V_divcbinfo.ipi_hashmask); + hashdestroy(V_divcbinfo.ipi_porthashbase, M_PCB, + V_divcbinfo.ipi_porthashmask); +} + +/* + * IPPROTO_DIVERT is not in the real IP protocol number space; this + * function should never be called. Just in case, drop any packets. + */ +static void +div_input(struct mbuf *m, int off) +{ + + KMOD_IPSTAT_INC(ips_noproto); + m_freem(m); +} + +/* + * Divert a packet by passing it up to the divert socket at port 'port'. + * + * Setup generic address and protocol structures for div_input routine, + * then pass them along with mbuf chain. + */ +static void +divert_packet(struct mbuf *m, int incoming) +{ + struct ip *ip; + struct inpcb *inp; + struct socket *sa; + u_int16_t nport; + struct sockaddr_in divsrc; + struct m_tag *mtag; + + mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + m_freem(m); + return; + } + /* Assure header */ + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == 0) + return; + ip = mtod(m, struct ip *); + + /* Delayed checksums are currently not compatible with divert. */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + ip->ip_len = ntohs(ip->ip_len); + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + ip->ip_len = htons(ip->ip_len); + } +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) { + ip->ip_len = ntohs(ip->ip_len); + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + m->m_pkthdr.csum_flags &= ~CSUM_SCTP; + ip->ip_len = htons(ip->ip_len); + } +#endif + bzero(&divsrc, sizeof(divsrc)); + divsrc.sin_len = sizeof(divsrc); + divsrc.sin_family = AF_INET; + /* record matching rule, in host format */ + divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; + /* + * Record receive interface address, if any. + * But only for incoming packets. 
+ */ + if (incoming) { + struct ifaddr *ifa; + struct ifnet *ifp; + + /* Sanity check */ + M_ASSERTPKTHDR(m); + + /* Find IP address for receive interface */ + ifp = m->m_pkthdr.rcvif; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + divsrc.sin_addr = + ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + break; + } + if_addr_runlock(ifp); + } + /* + * Record the incoming interface name whenever we have one. + */ + if (m->m_pkthdr.rcvif) { + /* + * Hide the actual interface name in there in the + * sin_zero array. XXX This needs to be moved to a + * different sockaddr type for divert, e.g. + * sockaddr_div with multiple fields like + * sockaddr_dl. Presently we have only 7 bytes + * but that will do for now as most interfaces + * are 4 or less + 2 or less bytes for unit. + * There is probably a faster way of doing this, + * possibly taking it from the sockaddr_dl on the iface. + * This solves the problem of a P2P link and a LAN interface + * having the same address, which can result in the wrong + * interface being assigned to the packet when fed back + * into the divert socket. Theoretically if the daemon saves + * and re-uses the sockaddr_in as suggested in the man pages, + * this iface name will come along for the ride. + * (see div_output for the other half of this.) + */ + strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, + sizeof(divsrc.sin_zero)); + } + + /* Put packet on socket queue, if any */ + sa = NULL; + nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); + INP_INFO_RLOCK(&V_divcbinfo); + LIST_FOREACH(inp, &V_divcb, inp_list) { + /* XXX why does only one socket match? */ + if (inp->inp_lport == nport) { + INP_RLOCK(inp); + sa = inp->inp_socket; + SOCKBUF_LOCK(&sa->so_rcv); + if (sbappendaddr_locked(&sa->so_rcv, + (struct sockaddr *)&divsrc, m, + (struct mbuf *)0) == 0) { + SOCKBUF_UNLOCK(&sa->so_rcv); + sa = NULL; /* force mbuf reclaim below */ + } else + sorwakeup_locked(sa); + INP_RUNLOCK(inp); + break; + } + } + INP_INFO_RUNLOCK(&V_divcbinfo); + if (sa == NULL) { + m_freem(m); + KMOD_IPSTAT_INC(ips_noproto); + KMOD_IPSTAT_DEC(ips_delivered); + } +} + +/* + * Deliver packet back into the IP processing machinery. + * + * If no address specified, or address is 0.0.0.0, send to ip_output(); + * otherwise, send to ip_input() and mark as having been received on + * the interface with that address. + */ +static int +div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, + struct mbuf *control) +{ + struct m_tag *mtag; + struct ipfw_rule_ref *dt; + int error = 0; + struct mbuf *options; + + /* + * An mbuf may not have come from userland, but we pretend + * that it has. + */ + m->m_pkthdr.rcvif = NULL; + m->m_nextpkt = NULL; + M_SETFIB(m, so->so_fibnum); + + if (control) + m_freem(control); /* XXX */ + + mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); + if (mtag == NULL) { + /* this should be normal */ + mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); + if (mtag == NULL) { + error = ENOBUFS; + goto cantsend; + } + m_tag_prepend(m, mtag); + } + dt = (struct ipfw_rule_ref *)(mtag+1); + + /* Loopback avoidance and state recovery */ + if (sin) { + int i; + + /* set the starting point. We provide a non-zero slot, + * but a non-matching chain_id to skip that info and use + * the rulenum/rule_id. + */ + dt->slot = 1; /* dummy, chain_id is invalid */ + dt->chain_id = 0; + dt->rulenum = sin->sin_port+1; /* host format ?
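
Because sin_zero offers only 7 bytes plus a NUL for the name, a consumer reading it back should force termination rather than trust it. Illustrative helper:

    #include <stdio.h>
    #include <string.h>
    #include <netinet/in.h>

    static void
    print_rcvif(const struct sockaddr_in *sin)
    {
        char name[sizeof(sin->sin_zero) + 1];

        memcpy(name, sin->sin_zero, sizeof(sin->sin_zero));
        name[sizeof(sin->sin_zero)] = '\0';  /* do not assume a NUL */
        if (name[0] != '\0')
            printf("diverted from %s\n", name);
    }
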
*/ + dt->rule_id = 0; + /* + * Find receive interface with the given name, stuffed + * (if it exists) in the sin_zero[] field. + * The name is user-supplied data so don't trust its size + * or that it is zero terminated. + */ + for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) + ; + if (i > 0 && i < sizeof(sin->sin_zero)) + m->m_pkthdr.rcvif = ifunit(sin->sin_zero); + } + + /* Reinject packet into the system as incoming or outgoing */ + if (!sin || sin->sin_addr.s_addr == 0) { + struct ip *const ip = mtod(m, struct ip *); + struct inpcb *inp; + + dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; + inp = sotoinpcb(so); + INP_RLOCK(inp); + /* + * Don't allow both user-specified and setsockopt options, + * and don't allow packet length sizes that will crash + */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || + ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { + error = EINVAL; + INP_RUNLOCK(inp); + m_freem(m); + } else { + /* Convert fields to host order for ip_output() */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + /* Send packet to output processing */ + KMOD_IPSTAT_INC(ips_rawout); /* XXX */ + +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif + /* + * Get ready to inject the packet into ip_output(). + * Just in case socket options were specified on the + * divert socket, we duplicate them. This is done + * to avoid having to hold the PCB locks over the call + * to ip_output(), as doing this results in a number of + * lock ordering complexities. + * + * Note that we set the multicast options argument for + * ip_output() to NULL since it should be invariant that + * they are not present. + */ + KASSERT(inp->inp_moptions == NULL, + ("multicast options set on a divert socket")); + options = NULL; + /* + * XXXCSJP: It is unclear to me whether or not it makes + * sense for divert sockets to have options. However, + * for now we will duplicate them with the INP locks + * held so we can use them in ip_output() without + * requiring a reference to the pcb. + */ + if (inp->inp_options != NULL) { + options = m_dup(inp->inp_options, M_DONTWAIT); + if (options == NULL) + error = ENOBUFS; + } + INP_RUNLOCK(inp); + if (error == ENOBUFS) { + m_freem(m); + return (error); + } + error = ip_output(m, options, NULL, + ((so->so_options & SO_DONTROUTE) ? + IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | + IP_RAWOUTPUT, NULL, NULL); + if (options != NULL) + m_freem(options); + } + } else { + dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; + if (m->m_pkthdr.rcvif == NULL) { + /* + * No luck with the name, check by IP address. + * Clear the port and the ifname to make sure + * there are no distractions for ifa_ifwithaddr.
+ */ + struct ifaddr *ifa; + + bzero(sin->sin_zero, sizeof(sin->sin_zero)); + sin->sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *) sin); + if (ifa == NULL) { + error = EADDRNOTAVAIL; + goto cantsend; + } + m->m_pkthdr.rcvif = ifa->ifa_ifp; + ifa_free(ifa); + } +#ifdef MAC + mac_socket_create_mbuf(so, m); +#endif + /* Send packet to input processing via netisr */ + netisr_queue_src(NETISR_IP, (uintptr_t)so, m); + } + + return error; + +cantsend: + m_freem(m); + return error; +} + +static int +div_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp == NULL, ("div_attach: inp != NULL")); + if (td != NULL) { + error = priv_check(td, PRIV_NETINET_DIVERT); + if (error) + return (error); + } + error = soreserve(so, div_sendspace, div_recvspace); + if (error) + return error; + INP_INFO_WLOCK(&V_divcbinfo); + error = in_pcballoc(so, &V_divcbinfo); + if (error) { + INP_INFO_WUNLOCK(&V_divcbinfo); + return error; + } + inp = (struct inpcb *)so->so_pcb; + INP_INFO_WUNLOCK(&V_divcbinfo); + inp->inp_ip_p = proto; + inp->inp_vflag |= INP_IPV4; + inp->inp_flags |= INP_HDRINCL; + INP_WUNLOCK(inp); + return 0; +} + +static void +div_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("div_detach: inp == NULL")); + INP_INFO_WLOCK(&V_divcbinfo); + INP_WLOCK(inp); + in_pcbdetach(inp); + in_pcbfree(inp); + INP_INFO_WUNLOCK(&V_divcbinfo); +} + +static int +div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("div_bind: inp == NULL")); + /* in_pcbbind assumes that nam is a sockaddr_in + * and in_pcbbind requires a valid address. Since divert + * sockets don't we need to make sure the address is + * filled in properly. + * XXX -- divert should not be abusing in_pcbind + * and should probably have its own family. + */ + if (nam->sa_family != AF_INET) + return EAFNOSUPPORT; + ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; + INP_INFO_WLOCK(&V_divcbinfo); + INP_WLOCK(inp); + error = in_pcbbind(inp, nam, td->td_ucred); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_divcbinfo); + return error; +} + +static int +div_shutdown(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); + INP_WLOCK(inp); + socantsendmore(so); + INP_WUNLOCK(inp); + return 0; +} + +static int +div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + + /* Packet must have a header (but that's about it) */ + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == 0) { + KMOD_IPSTAT_INC(ips_toosmall); + m_freem(m); + return EINVAL; + } + + /* Send packet */ + return div_output(so, m, (struct sockaddr_in *)nam, control); +} + +static void +div_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct in_addr faddr; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + if (PRC_IS_REDIRECT(cmd)) + return; +} + +static int +div_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. 
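
div_pcblist is exported below as net.inet.divert.pcblist (per the SYSCTL_NODE and SYSCTL_PROC declarations). A userland walker follows the usual two-call xinpgen pattern, where the first call exercises the req->oldptr == 0 sizing path above; a sketch:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdlib.h>

    static int
    fetch_divert_pcblist(char **bufp, size_t *lenp)
    {
        if (sysctlbyname("net.inet.divert.pcblist", NULL, lenp, NULL, 0) < 0)
            return (-1);                 /* sizing call */
        *bufp = malloc(*lenp);
        if (*bufp == NULL)
            return (-1);
        if (sysctlbyname("net.inet.divert.pcblist", *bufp, lenp, NULL, 0) < 0) {
            free(*bufp);
            return (-1);
        }
        /* *bufp holds an xinpgen, the xinpcb records, and a trailing
         * xinpgen whose xig_gen shows whether the list stayed stable */
        return (0);
    }
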
+ */ + if (req->oldptr == 0) { + n = V_divcbinfo.ipi_count; + n += imax(n / 8, 10); + req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + INP_INFO_RLOCK(&V_divcbinfo); + gencnt = V_divcbinfo.ipi_gencnt; + n = V_divcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_divcbinfo); + + error = sysctl_wire_old_buffer(req, + 2 * sizeof(xig) + n*sizeof(struct xinpcb)); + if (error != 0) + return (error); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return error; + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return ENOMEM; + + INP_INFO_RLOCK(&V_divcbinfo); + for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_WLOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { + in_pcbref(inp); + inp_list[i++] = inp; + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_divcbinfo); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + bzero(&xi, sizeof(xi)); + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + INP_RUNLOCK(inp); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } else + INP_RUNLOCK(inp); + } + INP_INFO_WLOCK(&V_divcbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_WLOCK(inp); + if (!in_pcbrele(inp)) + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_divcbinfo); + + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + INP_INFO_RLOCK(&V_divcbinfo); + xig.xig_gen = V_divcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = V_divcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_divcbinfo); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return error; +} + +#ifdef SYSCTL_NODE +SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); +SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, + div_pcblist, "S,xinpcb", "List of active divert sockets"); +#endif + +struct pr_usrreqs div_usrreqs = { + .pru_attach = div_attach, + .pru_bind = div_bind, + .pru_control = in_control, + .pru_detach = div_detach, + .pru_peeraddr = in_getpeeraddr, + .pru_send = div_send, + .pru_shutdown = div_shutdown, + .pru_sockaddr = in_getsockaddr, + .pru_sosetlabel = in_pcbsosetlabel +}; + +struct protosw div_protosw = { + .pr_type = SOCK_RAW, + .pr_protocol = IPPROTO_DIVERT, + .pr_flags = PR_ATOMIC|PR_ADDR, + .pr_input = div_input, + .pr_ctlinput = div_ctlinput, + .pr_ctloutput = ip_ctloutput, + .pr_init = div_init, +#ifdef VIMAGE + .pr_destroy = div_destroy, +#endif + .pr_usrreqs = &div_usrreqs +}; + +static int +div_modevent(module_t mod, int type, void *unused) +{ + int err = 0; +#ifndef VIMAGE + int n; +#endif + + switch (type) { + case MOD_LOAD: + /* + * Protocol will be initialized by pf_proto_register(). + * We don't have to register ip_protox because we are not + * a true IP protocol that goes over the wire. 
+ */ + err = pf_proto_register(PF_INET, &div_protosw); + if (err != 0) + return (err); + ip_divert_ptr = divert_packet; + ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, + div_zone_change, NULL, EVENTHANDLER_PRI_ANY); + break; + case MOD_QUIESCE: + /* + * IPDIVERT may normally not be unloaded because of the + * potential race conditions. Tell kldunload we can't be + * unloaded unless the unload is forced. + */ + err = EPERM; + break; + case MOD_UNLOAD: +#ifdef VIMAGE + err = EPERM; + break; +#else + /* + * Forced unload. + * + * Module ipdivert can only be unloaded if no sockets are + * connected. Maybe this can be changed later to forcefully + * disconnect any open sockets. + * + * XXXRW: Note that there is a slight race here, as a new + * socket open request could be spinning on the lock and then + * we destroy the lock. + */ + INP_INFO_WLOCK(&V_divcbinfo); + n = V_divcbinfo.ipi_count; + if (n != 0) { + err = EBUSY; + INP_INFO_WUNLOCK(&V_divcbinfo); + break; + } + ip_divert_ptr = NULL; + err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); + INP_INFO_WUNLOCK(&V_divcbinfo); + div_destroy(); + EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); + break; +#endif /* !VIMAGE */ + default: + err = EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipdivertmod = { + "ipdivert", + div_modevent, + 0 +}; + +DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(ipdivert, ipfw, 2, 2, 2); +MODULE_VERSION(ipdivert, 1); diff --git a/freebsd/sys/netinet/ip_divert.h b/freebsd/sys/netinet/ip_divert.h new file mode 100644 index 00000000..eb9b33d4 --- /dev/null +++ b/freebsd/sys/netinet/ip_divert.h @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2003 Sam Leffler, Errno Consulting + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any + * redistribution must be conditioned upon including a substantially + * similar Disclaimer requirement for further binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY + * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_DIVERT_HH_ +#define _NETINET_IP_DIVERT_HH_ + +/* + * divert has no custom kernel-userland API. + * + * All communication occurs through a sockaddr_in socket where + * + * kernel-->userland + * sin_port = matching rule, host format; + * sin_addr = IN: first address of the incoming interface; + * OUT: INADDR_ANY + * sin_zero = if fits, the interface name (max 7 bytes + NUL) + * + * userland->kernel + * sin_port = restart-rule - 1, host order + * (we restart at sin_port + 1) + * sin_addr = IN: address of the incoming interface; + * OUT: INADDR_ANY + */ +#endif /* _NETINET_IP_DIVERT_HH_ */ diff --git a/freebsd/sys/netinet/ip_dummynet.h b/freebsd/sys/netinet/ip_dummynet.h new file mode 100644 index 00000000..0bbc3263 --- /dev/null +++ b/freebsd/sys/netinet/ip_dummynet.h @@ -0,0 +1,263 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IP_DUMMYNET_H +#define _IP_DUMMYNET_H + +/* + * Definition of the kernel-userland API for dummynet. + * + * Setsockopt() and getsockopt() pass a batch of objects, each + * of them starting with a "struct dn_id" which should fully identify + * the object and its relation with others in the sequence. + * The first object in each request should have + * type= DN_CMD_*, id = DN_API_VERSION. + * For other objects, type and subtype specify the object, len indicates + * the total length including the header, and 'id' identifies the specific + * object. + * + * Most objects are numbered with an identifier in the range 1..65535. + * DN_MAX_ID indicates the first value outside the range. + */ + +#define DN_API_VERSION 12500000 +#define DN_MAX_ID 0x10000 + +struct dn_id { + uint16_t len; /* total obj len including this header */ + uint8_t type; + uint8_t subtype; + uint32_t id; /* generic id */ +}; + +/* + * These values are in the type field of struct dn_id. 
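
So the smallest well-formed request is one dn_id naming a command and carrying DN_API_VERSION. A hypothetical flush request; the IP_DUMMYNET3 socket option and the raw-socket transport are assumptions from the ipfw/dummynet userland of this vintage, not defined in this header:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/ip_dummynet.h>
    #include <string.h>

    static int
    dn_flush(int s)   /* s: raw IP socket */
    {
        struct dn_id oid;

        memset(&oid, 0, sizeof(oid));
        oid.len = sizeof(oid);
        oid.type = DN_CMD_FLUSH;
        oid.id = DN_API_VERSION;   /* version is checked first */
        return (setsockopt(s, IPPROTO_IP, IP_DUMMYNET3,
            &oid, sizeof(oid)));
    }
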
+ * To preserve the ABI, never rearrange the list or delete + * entries with the exception of DN_LAST + */ +enum { + DN_NONE = 0, + DN_LINK = 1, + DN_FS, + DN_SCH, + DN_SCH_I, + DN_QUEUE, + DN_DELAY_LINE, + DN_PROFILE, + DN_FLOW, /* struct dn_flow */ + DN_TEXT, /* opaque text is the object */ + + DN_CMD_CONFIG = 0x80, /* objects follow */ + DN_CMD_DELETE, /* subtype + list of entries */ + DN_CMD_GET, /* subtype + list of entries */ + DN_CMD_FLUSH, + /* for compatibility with FreeBSD 7.2/8 */ + DN_COMPAT_PIPE, + DN_COMPAT_QUEUE, + DN_GET_COMPAT, + + /* special commands for emulation of sysctl variables */ + DN_SYSCTL_GET, + DN_SYSCTL_SET, + + DN_LAST, +} ; + +enum { /* subtype for schedulers, flowset and the like */ + DN_SCHED_UNKNOWN = 0, + DN_SCHED_FIFO = 1, + DN_SCHED_WF2QP = 2, + /* others are in individual modules */ +} ; + +enum { /* user flags */ + DN_HAVE_MASK = 0x0001, /* fs or sched has a mask */ + DN_NOERROR = 0x0002, /* do not report errors */ + DN_QHT_HASH = 0x0004, /* qht is a hash table */ + DN_QSIZE_BYTES = 0x0008, /* queue size is in bytes */ + DN_HAS_PROFILE = 0x0010, /* a link has a profile */ + DN_IS_RED = 0x0020, + DN_IS_GENTLE_RED= 0x0040, + DN_PIPE_CMD = 0x1000, /* pipe config... */ +}; + +/* + * link template. + */ +struct dn_link { + struct dn_id oid; + + /* + * Userland sets bw and delay in bits/s and milliseconds. + * The kernel converts this back and forth to bits/tick and ticks. + * XXX what about burst ? + */ + int32_t link_nr; + int bandwidth; /* bit/s or bits/tick. */ + int delay; /* ms and ticks */ + uint64_t burst; /* scaled. bits*Hz XXX */ +} ; + +/* + * A flowset, which is a template for flows. Contains parameters + * from the command line: id, target scheduler, queue sizes, plr, + * flow masks, buckets for the flow hash, and possibly scheduler- + * specific parameters (weight, quantum and so on). + */ +struct dn_fs { + struct dn_id oid; + uint32_t fs_nr; /* the flowset number */ + uint32_t flags; /* userland flags */ + int qsize ; /* queue size in slots or bytes */ + int32_t plr; /* PLR, pkt loss rate (2^31-1 means 100%) */ + uint32_t buckets; /* buckets used for the queue hash table */ + + struct ipfw_flow_id flow_mask ; + uint32_t sched_nr; /* the scheduler we attach to */ + /* generic scheduler parameters. Leave them at -1 if unset. + * Now we use 0: weight, 1: lmax, 2: priority + */ + int par[4]; + + /* RED/GRED parameters. + * weight and probabilities are in the range 0..1 represented + * in fixed point arithmetic with SCALE_RED decimal bits. + */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + +}; + +/* + * dn_flow collects flow_id and stats for queues and scheduler + * instances, and is used to pass this info to userland. + * oid.type/oid.subtype describe the object, oid.id is the number + * of the parent object. + */ +struct dn_flow { + struct dn_id oid; + struct ipfw_flow_id fid; + uint64_t tot_pkts; /* statistics counters */ + uint64_t tot_bytes; + uint32_t length; /* Queue length, in packets */ + uint32_t len_bytes; /* Queue length, in bytes */ + uint32_t drops; +}; + + /* + * Scheduler template, mostly indicating the name, number, + * sched_mask and buckets.
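
The fixed point deserves a worked number: with SCALE_RED = 16 fractional bits, a RED queue weight of 0.002 is stored as 0.002 * 65536, roughly 131. SCALE() only shifts integers, so fractional constants are converted by explicit multiplication:

    /* Worked examples of the SCALE_RED encoding, using the macros above. */
    int w_q    = (int)(0.002 * 65536);     /* weight 0.002, stored as ~131 */
    int min_th = SCALE(5);                 /* threshold 5.0 -> 5 << 16 = 327680 */
    int avg    = SCALE_MUL(w_q, SCALE(1)); /* 0.002 * 1.0 stays ~131 */
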
+ */ +struct dn_sch { + struct dn_id oid; + uint32_t sched_nr; /* N, scheduler number */ + uint32_t buckets; /* number of buckets for the instances */ + uint32_t flags; /* have_mask, ... */ + + char name[16]; /* null terminated */ + /* mask to select the appropriate scheduler instance */ + struct ipfw_flow_id sched_mask; /* M */ +}; + + +/* A delay profile is attached to a link. + * Note that a profile, as any other object, cannot be longer than 2^16 + */ +#define ED_MAX_SAMPLES_NO 1024 +struct dn_profile { + struct dn_id oid; + /* fields to simulate a delay profile */ +#define ED_MAX_NAME_LEN 32 + char name[ED_MAX_NAME_LEN]; + int link_nr; + int loss_level; + int bandwidth; // XXX use link bandwidth? + int samples_no; /* actual length of samples[] */ + int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ +}; + + + +/* + * Overall structure of dummynet + +In dummynet, packets are selected with the firewall rules, and passed +to two different objects: PIPE or QUEUE (bad name). + +A QUEUE defines a classifier, which groups packets into flows +according to a 'mask', puts them into independent queues (one +per flow) with configurable size and queue management policy, +and passes flows to a scheduler: + + (flow_mask|sched_mask) sched_mask + +---------+ weight Wx +-------------+ + | |->-[flow]-->--| |-+ + -->--| QUEUE x | ... | | | + | |->-[flow]-->--| SCHEDuler N | | + +---------+ | | | + ... | +--[LINK N]-->-- + +---------+ weight Wy | | +--[LINK N]-->-- + | |->-[flow]-->--| | | + -->--| QUEUE y | ... | | | + | |->-[flow]-->--| | | + +---------+ +-------------+ | + +-------------+ + +Many QUEUE objects can connect to the same scheduler, each +QUEUE object can have its own set of parameters. + +In turn, the SCHEDuler 'forks' multiple instances according +to a 'sched_mask', each instance manages its own set of queues +and transmits on a private instance of a configurable LINK. + +A PIPE is a simplified version of the above, where there +is no flow_mask, and each scheduler instance handles a single queue. + +The following data structures (visible from userland) describe +the objects used by dummynet: + + + dn_link, contains the main configuration parameters related + to delay and bandwidth; + + dn_profile describes a delay profile; + + dn_flow describes the flow status (flow id, statistics) + + + dn_sch describes a scheduler + + dn_fs describes a flowset (msk, weight, queue parameters) + + * + */ + +#endif /* _IP_DUMMYNET_H */ diff --git a/freebsd/sys/netinet/ip_ecn.c b/freebsd/sys/netinet/ip_ecn.c new file mode 100644 index 00000000..97b32b2c --- /dev/null +++ b/freebsd/sys/netinet/ip_ecn.c @@ -0,0 +1,194 @@ +#include + +/* $KAME: ip_ecn.c,v 1.12 2002/01/07 11:34:47 kjc Exp $ */ + +/*- + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
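
To turn that picture into objects: a configuration is a DN_CMD_CONFIG dn_id followed by the objects it creates, here a WF2Q+ scheduler and a flowset feeding it. The packed-struct layout is a shape illustration under the rules stated at the top of the header, not a guaranteed wire format; delivery would use the same socket option as in the dn_id sketch above:

    #include <string.h>

    struct dn_batch {
        struct dn_id hdr;   /* DN_CMD_CONFIG, DN_API_VERSION */
        struct dn_sch sch;  /* scheduler template */
        struct dn_fs fs;    /* flowset attached to it */
    };

    static void
    dn_build_batch(struct dn_batch *b)
    {
        memset(b, 0, sizeof(*b));
        b->hdr.len = sizeof(b->hdr);
        b->hdr.type = DN_CMD_CONFIG;
        b->hdr.id = DN_API_VERSION;

        b->sch.oid.len = sizeof(b->sch);
        b->sch.oid.type = DN_SCH;
        b->sch.oid.subtype = DN_SCHED_WF2QP;
        b->sch.sched_nr = 10;

        b->fs.oid.len = sizeof(b->fs);
        b->fs.oid.type = DN_FS;
        b->fs.fs_nr = 20;
        b->fs.sched_nr = 10;  /* attach to scheduler 10 */
        b->fs.qsize = 50;     /* slots, since DN_QSIZE_BYTES is unset */
    }
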
+ * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. + * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include +#ifdef INET6 +#include +#endif + +/* + * ECN and TOS (or TCLASS) processing rules at tunnel encapsulation and + * decapsulation from RFC3168: + * + * Outer Hdr at Inner Hdr at + * Encapsulator Decapsulator + * Header fields: -------------------- ------------ + * DS Field copied from inner hdr no change + * ECN Field constructed by (I) constructed by (E) + * + * ECN_ALLOWED (full functionality): + * (I) if the ECN field in the inner header is set to CE, then set the + * ECN field in the outer header to ECT(0). + * otherwise, copy the ECN field to the outer header. + * + * (E) if the ECN field in the outer header is set to CE and the ECN + * field of the inner header is not-ECT, drop the packet. + * if the ECN field in the inner header is set to ECT(0) or ECT(1) + * and the ECN field in the outer header is set to CE, then copy CE to + * the inner header. otherwise, make no change to the inner header. + * + * ECN_FORBIDDEN (limited functionality): + * (I) set the ECN field to not-ECT in the outer header. + * + * (E) if the ECN field in the outer header is set to CE, drop the packet. + * otherwise, make no change to the ECN field in the inner header. + * + * the drop rule is for backward compatibility and protection against + * erasure of CE. + */ + +/* + * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation). + */ +void +ip_ecn_ingress(int mode, u_int8_t *outer, const u_int8_t *inner) +{ + + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_ingress"); + + *outer = *inner; + switch (mode) { + case ECN_ALLOWED: /* ECN allowed */ + /* + * full-functionality: if the inner is CE, set ECT(0) + * to the outer. otherwise, copy the ECN field. + */ + if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + *outer &= ~IPTOS_ECN_ECT1; + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + /* + * limited-functionality: set not-ECT to the outer + */ + *outer &= ~IPTOS_ECN_MASK; + break; + case ECN_NOCARE: /* no consideration to ECN */ + break; + } +} + +/* + * modify inner ECN (TOS) field on egress operation (tunnel decapsulation). + * the caller should drop the packet if the return value is 0. + */ +int +ip_ecn_egress(int mode, const u_int8_t *outer, u_int8_t *inner) +{ + + if (!outer || !inner) + panic("NULL pointer passed to ip_ecn_egress"); + + switch (mode) { + case ECN_ALLOWED: + /* + * full-functionality: if the outer is CE and the inner is + * not-ECT, should drop it. otherwise, copy CE. 
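
Concretely, under ECN_ALLOWED the ingress rule maps an inner CE to an outer ECT(0): copy 0x03, then clear the ECT1 bit, leaving 0x02. A tiny standalone check of that arithmetic, using the IPTOS_ECN_* values from netinet/ip.h:

    #include <assert.h>
    #include <stdint.h>
    #include <netinet/ip.h>  /* IPTOS_ECN_CE 0x03, IPTOS_ECN_ECT0 0x02, ... */

    static void
    ecn_ingress_example(void)
    {
        uint8_t inner = IPTOS_ECN_CE, outer;

        outer = inner;
        if ((inner & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
            outer &= ~IPTOS_ECN_ECT1;   /* CE becomes ECT(0) */
        assert(outer == IPTOS_ECN_ECT0);
    }
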
+ */ + if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { + if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); + *inner |= IPTOS_ECN_CE; + } + break; + case ECN_FORBIDDEN: /* ECN forbidden */ + /* + * limited-functionality: if the outer is CE, should drop it. + * otherwise, leave the inner. + */ + if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (0); + break; + case ECN_NOCARE: /* no consideration to ECN */ + break; + } + return (1); +} + +#ifdef INET6 +void +ip6_ecn_ingress(int mode, u_int32_t *outer, const u_int32_t *inner) +{ + u_int8_t outer8, inner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_ingress"); + + inner8 = (ntohl(*inner) >> 20) & 0xff; + ip_ecn_ingress(mode, &outer8, &inner8); + *outer &= ~htonl(0xff << 20); + *outer |= htonl((u_int32_t)outer8 << 20); +} + +int +ip6_ecn_egress(int mode, const u_int32_t *outer, u_int32_t *inner) +{ + u_int8_t outer8, inner8, oinner8; + + if (!outer || !inner) + panic("NULL pointer passed to ip6_ecn_egress"); + + outer8 = (ntohl(*outer) >> 20) & 0xff; + inner8 = oinner8 = (ntohl(*inner) >> 20) & 0xff; + if (ip_ecn_egress(mode, &outer8, &inner8) == 0) + return (0); + if (inner8 != oinner8) { + *inner &= ~htonl(0xff << 20); + *inner |= htonl((u_int32_t)inner8 << 20); + } + return (1); +} +#endif diff --git a/freebsd/sys/netinet/ip_ecn.h b/freebsd/sys/netinet/ip_ecn.h new file mode 100644 index 00000000..271c8a47 --- /dev/null +++ b/freebsd/sys/netinet/ip_ecn.h @@ -0,0 +1,53 @@ +/* $FreeBSD$ */ +/* $KAME: ip_ecn.h,v 1.8 2002/01/07 11:34:47 kjc Exp $ */ + +/*- + * Copyright (C) 1999 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * ECN consideration on tunnel ingress/egress operation. 
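
The >> 20 in the IPv6 variants selects the 8-bit Traffic Class from the first 32 bits of the header (4 bits version, 8 bits class, 20 bits flow label). The same extraction and reinsertion, isolated as a sketch:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* ver_tc_fl is the first 32 bits of an IPv6 header, network order. */
    static uint8_t
    ip6_get_tclass(uint32_t ver_tc_fl)
    {
        return ((ntohl(ver_tc_fl) >> 20) & 0xff);
    }

    static uint32_t
    ip6_set_tclass(uint32_t ver_tc_fl, uint8_t tc)
    {
        uint32_t h = ntohl(ver_tc_fl) & ~(0xffU << 20);

        return (htonl(h | ((uint32_t)tc << 20)));
    }
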
+ * http://www.aciri.org/floyd/papers/draft-ipsec-ecn-00.txt
+ */
+
+#ifndef _NETINET_IP_ECN_HH_
+#define _NETINET_IP_ECN_HH_
+
+#if defined(_KERNEL) && !defined(_LKM)
+#include
+#endif
+
+#define ECN_ALLOWED	1	/* ECN allowed */
+#define ECN_FORBIDDEN	0	/* ECN forbidden */
+#define ECN_NOCARE	(-1)	/* no consideration to ECN */
+
+#ifdef _KERNEL
+extern void ip_ecn_ingress(int, u_int8_t *, const u_int8_t *);
+extern int ip_ecn_egress(int, const u_int8_t *, u_int8_t *);
+#endif
+#endif
diff --git a/freebsd/sys/netinet/ip_encap.c b/freebsd/sys/netinet/ip_encap.c
new file mode 100644
index 00000000..45b0593c
--- /dev/null
+++ b/freebsd/sys/netinet/ip_encap.c
@@ -0,0 +1,465 @@
+#include
+
+/* $KAME: ip_encap.c,v 1.41 2001/03/15 08:35:08 itojun Exp $ */
+
+/*-
+ * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the project nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * My grandfather said that there's a devil inside tunnelling technology...
+ *
+ * We have surprisingly many protocols that want packets with IP protocol
+ * #4 or #41.  Here's a list of protocols that want protocol #41:
+ *	RFC1933 configured tunnel
+ *	RFC1933 automatic tunnel
+ *	RFC2401 IPsec tunnel
+ *	RFC2473 IPv6 generic packet tunnelling
+ *	RFC2529 6over4 tunnel
+ *	mobile-ip6 (uses RFC2473)
+ *	RFC3056 6to4 tunnel
+ *	isatap tunnel
+ * Here's a list of protocols that want protocol #4:
+ *	RFC1853 IPv4-in-IPv4 tunnelling
+ *	RFC2003 IPv4 encapsulation within IPv4
+ *	RFC2344 reverse tunnelling for mobile-ip4
+ *	RFC2401 IPsec tunnel
+ * Well, what can I say.  They impose different en/decapsulation mechanisms
+ * from each other, so they need separate protocol handlers.  The only one
+ * we can easily determine by protocol # is IPsec, which always has
+ * AH/ESP/IPComp header right after outer IP header.
+ *
+ * So, clearly good old protosw does not work for protocol #4 and #41.
+ * The code will let you match protocol via src/dst address pair.
+ */
+/* XXX is M_NETADDR correct? */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef INET6
+#include
+#include
+#include
+#endif
+
+#include
+
+#include
+#include
+static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure");
+
+static void encap_add(struct encaptab *);
+static int mask_match(const struct encaptab *, const struct sockaddr *,
+		const struct sockaddr *);
+static void encap_fillarg(struct mbuf *, const struct encaptab *);
+
+/*
+ * All global variables in ip_encap.c are locked using encapmtx.
+ */
+static struct mtx encapmtx;
+MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF);
+LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
+
+/*
+ * We currently keep encap_init() for source code compatibility reasons --
+ * it's referenced by KAME pieces in netinet6.
+ */
+void
+encap_init(void)
+{
+}
+
+#ifdef INET
+void
+encap4_input(struct mbuf *m, int off)
+{
+	struct ip *ip;
+	int proto;
+	struct sockaddr_in s, d;
+	const struct protosw *psw;
+	struct encaptab *ep, *match;
+	int prio, matchprio;
+
+	ip = mtod(m, struct ip *);
+	proto = ip->ip_p;
+
+	bzero(&s, sizeof(s));
+	s.sin_family = AF_INET;
+	s.sin_len = sizeof(struct sockaddr_in);
+	s.sin_addr = ip->ip_src;
+	bzero(&d, sizeof(d));
+	d.sin_family = AF_INET;
+	d.sin_len = sizeof(struct sockaddr_in);
+	d.sin_addr = ip->ip_dst;
+
+	match = NULL;
+	matchprio = 0;
+	mtx_lock(&encapmtx);
+	LIST_FOREACH(ep, &encaptab, chain) {
+		if (ep->af != AF_INET)
+			continue;
+		if (ep->proto >= 0 && ep->proto != proto)
+			continue;
+		if (ep->func)
+			prio = (*ep->func)(m, off, proto, ep->arg);
+		else {
+			/*
+			 * it's inbound traffic, we need to match in reverse
+			 * order
+			 */
+			prio = mask_match(ep, (struct sockaddr *)&d,
+			    (struct sockaddr *)&s);
+		}
+
+		/*
+		 * We prioritize the matches by using bit length of the
+		 * matches.  mask_match() and user-supplied matching
+		 * functions should return the bit length of the matches
+		 * (for example, if both src/dst are matched for IPv4,
+		 * 64 should be returned).  0 or a negative return value
+		 * means "it did not match".
+		 *
+		 * The question is, since we have two "mask" portions, we
+		 * cannot really define a total order between entries.
+		 * For example, which of these should be preferred?
+		 * mask_match() returns 48 (32 + 16) for both of them.
+		 *	src=3ffe::/16, dst=3ffe:501::/32
+		 *	src=3ffe:501::/32, dst=3ffe::/16
+		 *
+		 * We need to loop through all the possible candidates
+		 * to get the best match - the search takes O(n) for
+		 * n attachments (i.e. interfaces).
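+		 *
+		 * as a further illustration (not from the original
+		 * comment): an attachment matching a /32 src and a /32
+		 * dst scores 32 + 32 = 64 and is preferred over one
+		 * matching /32 + /16 (48), so the most specific
+		 * attachment wins.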
+ */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + mtx_unlock(&encapmtx); + + if (match) { + /* found a match, "match" has the best one */ + psw = match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + (*psw->pr_input)(m, off); + } else + m_freem(m); + return; + } + + /* last resort: inject to raw socket */ + rip_input(m, off); +} +#endif + +#ifdef INET6 +int +encap6_input(struct mbuf **mp, int *offp, int proto) +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6; + struct sockaddr_in6 s, d; + const struct ip6protosw *psw; + struct encaptab *ep, *match; + int prio, matchprio; + + ip6 = mtod(m, struct ip6_hdr *); + + bzero(&s, sizeof(s)); + s.sin6_family = AF_INET6; + s.sin6_len = sizeof(struct sockaddr_in6); + s.sin6_addr = ip6->ip6_src; + bzero(&d, sizeof(d)); + d.sin6_family = AF_INET6; + d.sin6_len = sizeof(struct sockaddr_in6); + d.sin6_addr = ip6->ip6_dst; + + match = NULL; + matchprio = 0; + mtx_lock(&encapmtx); + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != AF_INET6) + continue; + if (ep->proto >= 0 && ep->proto != proto) + continue; + if (ep->func) + prio = (*ep->func)(m, *offp, proto, ep->arg); + else { + /* + * it's inbound traffic, we need to match in reverse + * order + */ + prio = mask_match(ep, (struct sockaddr *)&d, + (struct sockaddr *)&s); + } + + /* see encap4_input() for issues here */ + if (prio <= 0) + continue; + if (prio > matchprio) { + matchprio = prio; + match = ep; + } + } + mtx_unlock(&encapmtx); + + if (match) { + /* found a match */ + psw = (const struct ip6protosw *)match->psw; + if (psw && psw->pr_input) { + encap_fillarg(m, match); + return (*psw->pr_input)(mp, offp, proto); + } else { + m_freem(m); + return IPPROTO_DONE; + } + } + + /* last resort: inject to raw socket */ + return rip6_input(mp, offp, proto); +} +#endif + +/*lint -sem(encap_add, custodial(1)) */ +static void +encap_add(struct encaptab *ep) +{ + + mtx_assert(&encapmtx, MA_OWNED); + LIST_INSERT_HEAD(&encaptab, ep, chain); +} + +/* + * sp (src ptr) is always my side, and dp (dst ptr) is always remote side. + * length of mask (sm and dm) is assumed to be same as sp/dp. + * Return value will be necessary as input (cookie) for encap_detach(). 
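+ *
+ * a minimal usage sketch (hypothetical endpoints and protosw, error
+ * handling elided; this example is not part of the original comment):
+ *
+ *	const struct encaptab *cookie;
+ *
+ *	cookie = encap_attach(AF_INET, IPPROTO_IPV4,
+ *	    (struct sockaddr *)&src, (struct sockaddr *)&srcmask,
+ *	    (struct sockaddr *)&dst, (struct sockaddr *)&dstmask,
+ *	    &my_protosw, sc);
+ *	...
+ *	if (cookie != NULL)
+ *		encap_detach(cookie);
+ *
+ * encap_attach() returns NULL if the arguments are inconsistent or an
+ * identical attachment already exists.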
+ */ +const struct encaptab * +encap_attach(int af, int proto, const struct sockaddr *sp, + const struct sockaddr *sm, const struct sockaddr *dp, + const struct sockaddr *dm, const struct protosw *psw, void *arg) +{ + struct encaptab *ep; + + /* sanity check on args */ + if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst)) + return (NULL); + if (sp->sa_len != dp->sa_len) + return (NULL); + if (af != sp->sa_family || af != dp->sa_family) + return (NULL); + + /* check if anyone have already attached with exactly same config */ + mtx_lock(&encapmtx); + LIST_FOREACH(ep, &encaptab, chain) { + if (ep->af != af) + continue; + if (ep->proto != proto) + continue; + if (ep->src.ss_len != sp->sa_len || + bcmp(&ep->src, sp, sp->sa_len) != 0 || + bcmp(&ep->srcmask, sm, sp->sa_len) != 0) + continue; + if (ep->dst.ss_len != dp->sa_len || + bcmp(&ep->dst, dp, dp->sa_len) != 0 || + bcmp(&ep->dstmask, dm, dp->sa_len) != 0) + continue; + + mtx_unlock(&encapmtx); + return (NULL); + } + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) { + mtx_unlock(&encapmtx); + return (NULL); + } + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + bcopy(sp, &ep->src, sp->sa_len); + bcopy(sm, &ep->srcmask, sp->sa_len); + bcopy(dp, &ep->dst, dp->sa_len); + bcopy(dm, &ep->dstmask, dp->sa_len); + ep->psw = psw; + ep->arg = arg; + + encap_add(ep); + mtx_unlock(&encapmtx); + return (ep); +} + +const struct encaptab * +encap_attach_func(int af, int proto, + int (*func)(const struct mbuf *, int, int, void *), + const struct protosw *psw, void *arg) +{ + struct encaptab *ep; + + /* sanity check on args */ + if (!func) + return (NULL); + + ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT); /*XXX*/ + if (ep == NULL) + return (NULL); + bzero(ep, sizeof(*ep)); + + ep->af = af; + ep->proto = proto; + ep->func = func; + ep->psw = psw; + ep->arg = arg; + + mtx_lock(&encapmtx); + encap_add(ep); + mtx_unlock(&encapmtx); + return (ep); +} + +int +encap_detach(const struct encaptab *cookie) +{ + const struct encaptab *ep = cookie; + struct encaptab *p; + + mtx_lock(&encapmtx); + LIST_FOREACH(p, &encaptab, chain) { + if (p == ep) { + LIST_REMOVE(p, chain); + mtx_unlock(&encapmtx); + free(p, M_NETADDR); /*XXX*/ + return 0; + } + } + mtx_unlock(&encapmtx); + + return EINVAL; +} + +static int +mask_match(const struct encaptab *ep, const struct sockaddr *sp, + const struct sockaddr *dp) +{ + struct sockaddr_storage s; + struct sockaddr_storage d; + int i; + const u_int8_t *p, *q; + u_int8_t *r; + int matchlen; + + if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d)) + return 0; + if (sp->sa_family != ep->af || dp->sa_family != ep->af) + return 0; + if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len) + return 0; + + matchlen = 0; + + p = (const u_int8_t *)sp; + q = (const u_int8_t *)&ep->srcmask; + r = (u_int8_t *)&s; + for (i = 0 ; i < sp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX estimate */ + matchlen += (q[i] ? 8 : 0); + } + + p = (const u_int8_t *)dp; + q = (const u_int8_t *)&ep->dstmask; + r = (u_int8_t *)&d; + for (i = 0 ; i < dp->sa_len; i++) { + r[i] = p[i] & q[i]; + /* XXX rough estimate */ + matchlen += (q[i] ? 
8 : 0); + } + + /* need to overwrite len/family portion as we don't compare them */ + s.ss_len = sp->sa_len; + s.ss_family = sp->sa_family; + d.ss_len = dp->sa_len; + d.ss_family = dp->sa_family; + + if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 && + bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) { + return matchlen; + } else + return 0; +} + +static void +encap_fillarg(struct mbuf *m, const struct encaptab *ep) +{ + struct m_tag *tag; + + tag = m_tag_get(PACKET_TAG_ENCAP, sizeof (void*), M_NOWAIT); + if (tag) { + *(void**)(tag+1) = ep->arg; + m_tag_prepend(m, tag); + } +} + +void * +encap_getarg(struct mbuf *m) +{ + void *p = NULL; + struct m_tag *tag; + + tag = m_tag_find(m, PACKET_TAG_ENCAP, NULL); + if (tag) { + p = *(void**)(tag+1); + m_tag_delete(m, tag); + } + return p; +} diff --git a/freebsd/sys/netinet/ip_encap.h b/freebsd/sys/netinet/ip_encap.h new file mode 100644 index 00000000..44dd1a0d --- /dev/null +++ b/freebsd/sys/netinet/ip_encap.h @@ -0,0 +1,64 @@ +/* $FreeBSD$ */ +/* $KAME: ip_encap.h,v 1.7 2000/03/25 07:23:37 sumikawa Exp $ */ + +/*- + * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NETINET_IP_ENCAP_HH_ +#define _NETINET_IP_ENCAP_HH_ + +#ifdef _KERNEL + +struct encaptab { + LIST_ENTRY(encaptab) chain; + int af; + int proto; /* -1: don't care, I'll check myself */ + struct sockaddr_storage src; /* my addr */ + struct sockaddr_storage srcmask; + struct sockaddr_storage dst; /* remote addr */ + struct sockaddr_storage dstmask; + int (*func)(const struct mbuf *, int, int, void *); + const struct protosw *psw; /* only pr_input will be used */ + void *arg; /* passed via m->m_pkthdr.aux */ +}; + +void encap_init(void); +void encap4_input(struct mbuf *, int); +int encap6_input(struct mbuf **, int *, int); +const struct encaptab *encap_attach(int, int, const struct sockaddr *, + const struct sockaddr *, const struct sockaddr *, + const struct sockaddr *, const struct protosw *, void *); +const struct encaptab *encap_attach_func(int, int, + int (*)(const struct mbuf *, int, int, void *), + const struct protosw *, void *); +int encap_detach(const struct encaptab *); +void *encap_getarg(struct mbuf *); +#endif + +#endif /*_NETINET_IP_ENCAP_HH_*/ diff --git a/freebsd/sys/netinet/ip_fastfwd.c b/freebsd/sys/netinet/ip_fastfwd.c new file mode 100644 index 00000000..6d406b2b --- /dev/null +++ b/freebsd/sys/netinet/ip_fastfwd.c @@ -0,0 +1,619 @@ +#include + +/*- + * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * ip_fastforward gets its speed from processing the forwarded packet to + * completion (if_output on the other side) without any queues or netisr's. + * The receiving interface DMAs the packet into memory, the upper half of + * driver calls ip_fastforward, we do our routing table lookup and directly + * send it off to the outgoing interface, which DMAs the packet to the + * network card. The only part of the packet we touch with the CPU is the + * IP header (unless there are complex firewall rules touching other parts + * of the packet, but that is up to you). 
+ * We are essentially limited by bus
+ * bandwidth and how fast the network card/driver can set up receives and
+ * transmits.
+ *
+ * We handle basic errors, IP header errors, checksum errors,
+ * destination unreachable, fragmentation and fragmentation needed and
+ * report them via ICMP to the sender.
+ *
+ * Else, if something is not pure IPv4 unicast forwarding, we fall back to
+ * the normal ip_input processing path.  We should only be called from
+ * interfaces connected to the outside world.
+ *
+ * Firewalling is fully supported, including divert, ipfw fwd, and
+ * ipfilter ipnat address rewrite.
+ *
+ * IPSEC is not supported if this host is a tunnel broker.  IPSEC is
+ * supported for connections to/from local host.
+ *
+ * We try to do the least expensive (in CPU ops) checks and operations
+ * first to catch junk with as little overhead as possible.
+ *
+ * We take full advantage of hardware support for IP checksum and
+ * fragmentation offloading.
+ *
+ * We don't do ICMP redirect in the fast forwarding path.  I have had my own
+ * cases where two core routers with Zebra routing suite would send millions
+ * of ICMP redirects to connected hosts if the destination router was not the
+ * default gateway.  In one case it was filling the routing table of a host
+ * with approximately 300,000 cloned redirect entries until it ran out of
+ * kernel memory.  However, the networking code proved very robust and it
+ * didn't crash or fail in other ways.
+ */
+
+/*
+ * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
+ * is being followed here.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+static VNET_DEFINE(int, ipfastforward_active);
+#define	V_ipfastforward_active	VNET(ipfastforward_active)
+
+SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fastforwarding, CTLFLAG_RW,
+    &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
+
+static struct sockaddr_in *
+ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
+{
+	struct sockaddr_in *dst;
+	struct rtentry *rt;
+
+	/*
+	 * Find route to destination.
+	 */
+	bzero(ro, sizeof(*ro));
+	dst = (struct sockaddr_in *)&ro->ro_dst;
+	dst->sin_family = AF_INET;
+	dst->sin_len = sizeof(*dst);
+	dst->sin_addr.s_addr = dest.s_addr;
+	in_rtalloc_ign(ro, 0, M_GETFIB(m));
+
+	/*
+	 * Route there and interface still up?
+	 */
+	rt = ro->ro_rt;
+	if (rt && (rt->rt_flags & RTF_UP) &&
+	    (rt->rt_ifp->if_flags & IFF_UP) &&
+	    (rt->rt_ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+		if (rt->rt_flags & RTF_GATEWAY)
+			dst = (struct sockaddr_in *)rt->rt_gateway;
+	} else {
+		IPSTAT_INC(ips_noroute);
+		IPSTAT_INC(ips_cantforward);
+		if (rt)
+			RTFREE(rt);
+		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		return NULL;
+	}
+	return dst;
+}
+
+/*
+ * Try to forward a packet based on the destination address.
+ * This is a fast path optimized for the plain forwarding case.
+ * If the packet is handled (and consumed) here then we return NULL;
+ * otherwise the mbuf is returned and the packet should be delivered
+ * to ip_input for full processing.
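+ *
+ * an illustrative call pattern for a caller on the input path (the
+ * actual hook-up is outside this file; names here are hypothetical):
+ *
+ *	m = ip_fastforward(m);
+ *	if (m == NULL)
+ *		return;			(consumed: forwarded or dropped)
+ *	ip_input(m);			(needs full slow-path processing)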
+ */
+struct mbuf *
+ip_fastforward(struct mbuf *m)
+{
+	struct ip *ip;
+	struct mbuf *m0 = NULL;
+	struct route ro;
+	struct sockaddr_in *dst = NULL;
+	struct ifnet *ifp;
+	struct in_addr odest, dest;
+	u_short sum, ip_len;
+	int error = 0;
+	int hlen, mtu;
+#ifdef IPFIREWALL_FORWARD
+	struct m_tag *fwd_tag;
+#endif
+
+	/*
+	 * Are we active and forwarding packets?
+	 */
+	if (!V_ipfastforward_active || !V_ipforwarding)
+		return m;
+
+	M_ASSERTVALID(m);
+	M_ASSERTPKTHDR(m);
+
+	bzero(&ro, sizeof(ro));
+
+	/*
+	 * Step 1: check for packet drop conditions (and sanity checks)
+	 */
+
+	/*
+	 * Is entire packet big enough?
+	 */
+	if (m->m_pkthdr.len < sizeof(struct ip)) {
+		IPSTAT_INC(ips_tooshort);
+		goto drop;
+	}
+
+	/*
+	 * Is first mbuf large enough for ip header and is header present?
+	 */
+	if (m->m_len < sizeof (struct ip) &&
+	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
+		IPSTAT_INC(ips_toosmall);
+		return NULL;	/* mbuf already free'd */
+	}
+
+	ip = mtod(m, struct ip *);
+
+	/*
+	 * Is it IPv4?
+	 */
+	if (ip->ip_v != IPVERSION) {
+		IPSTAT_INC(ips_badvers);
+		goto drop;
+	}
+
+	/*
+	 * Is IP header length correct and is it in first mbuf?
+	 */
+	hlen = ip->ip_hl << 2;
+	if (hlen < sizeof(struct ip)) {	/* minimum header length */
+		IPSTAT_INC(ips_badhlen);
+		goto drop;
+	}
+	if (hlen > m->m_len) {
+		if ((m = m_pullup(m, hlen)) == NULL) {
+			IPSTAT_INC(ips_badhlen);
+			return NULL;	/* mbuf already free'd */
+		}
+		ip = mtod(m, struct ip *);
+	}
+
+	/*
+	 * Checksum correct?
+	 */
+	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED)
+		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
+	else {
+		if (hlen == sizeof(struct ip))
+			sum = in_cksum_hdr(ip);
+		else
+			sum = in_cksum(m, hlen);
+	}
+	if (sum) {
+		IPSTAT_INC(ips_badsum);
+		goto drop;
+	}
+
+	/*
+	 * Remember that we have checked the IP header and found it valid.
+	 */
+	m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID);
+
+	ip_len = ntohs(ip->ip_len);
+
+	/*
+	 * Is IP length longer than packet we have got?
+	 */
+	if (m->m_pkthdr.len < ip_len) {
+		IPSTAT_INC(ips_tooshort);
+		goto drop;
+	}
+
+	/*
+	 * Is packet longer than IP header tells us?  If yes, truncate packet.
+	 */
+	if (m->m_pkthdr.len > ip_len) {
+		if (m->m_len == m->m_pkthdr.len) {
+			m->m_len = ip_len;
+			m->m_pkthdr.len = ip_len;
+		} else
+			m_adj(m, ip_len - m->m_pkthdr.len);
+	}
+
+	/*
+	 * Is packet from or to 127/8?
+	 */
+	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
+	    (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
+		IPSTAT_INC(ips_badaddr);
+		goto drop;
+	}
+
+#ifdef ALTQ
+	/*
+	 * Is packet dropped by traffic conditioner?
+	 */
+	if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0)
+		goto drop;
+#endif
+
+	/*
+	 * Step 2: fallback conditions to normal ip_input path processing
+	 */
+
+	/*
+	 * Only IP packets without options
+	 */
+	if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
+		if (ip_doopts == 1)
+			return m;
+		else if (ip_doopts == 2) {
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
+			    0, 0);
+			return NULL;	/* mbuf already free'd */
+		}
+		/* else ignore IP options and continue */
+	}
+
+	/*
+	 * Only unicast IP, not from loopback, no L2 or IP broadcast,
+	 * no multicast, no INADDR_ANY
+	 *
+	 * XXX: Probably some of these checks could be direct drop
+	 * conditions.  However it is not clear whether there are some
+	 * hacks or obscure behaviours which make it necessary to
+	 * let ip_input handle it.  We play safe here and let ip_input
+	 * deal with it until it is proven that we can directly drop it.
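+	 *
+	 * for example (illustrative, not from the original comment), a
+	 * packet destined to 224.0.0.1 or sourced from INADDR_ANY is
+	 * handed back to ip_input() below rather than dropped here.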
+ */ + if ((m->m_flags & (M_BCAST|M_MCAST)) || + (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) || + ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST || + ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || + IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || + ip->ip_src.s_addr == INADDR_ANY || + ip->ip_dst.s_addr == INADDR_ANY ) + return m; + + /* + * Is it for a local address on this host? + */ + if (in_localip(ip->ip_dst)) + return m; + + IPSTAT_INC(ips_total); + + /* + * Step 3: incoming packet firewall processing + */ + + /* + * Convert to host representation + */ + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + odest.s_addr = dest.s_addr = ip->ip_dst.s_addr; + + /* + * Run through list of ipfilter hooks for input packets + */ + if (!PFIL_HOOKED(&V_inet_pfil_hook)) + goto passin; + + if (pfil_run_hooks( + &V_inet_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, NULL) || + m == NULL) + goto drop; + + M_ASSERTVALID(m); + M_ASSERTPKTHDR(m); + + ip = mtod(m, struct ip *); /* m may have changed by pfil hook */ + dest.s_addr = ip->ip_dst.s_addr; + + /* + * Destination address changed? + */ + if (odest.s_addr != dest.s_addr) { + /* + * Is it now for a local address on this host? + */ + if (in_localip(dest)) + goto forwardlocal; + /* + * Go on with new destination address + */ + } +#ifdef IPFIREWALL_FORWARD + if (m->m_flags & M_FASTFWD_OURS) { + /* + * ipfw changed it for a local address on this host. + */ + goto forwardlocal; + } +#endif /* IPFIREWALL_FORWARD */ + +passin: + /* + * Step 4: decrement TTL and look up route + */ + + /* + * Check TTL + */ +#ifdef IPSTEALTH + if (!V_ipstealth) { +#endif + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); + return NULL; /* mbuf already free'd */ + } + + /* + * Decrement the TTL and incrementally change the IP header checksum. + * Don't bother doing this with hw checksum offloading, it's faster + * doing it right here. + */ + ip->ip_ttl -= IPTTLDEC; + if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8)) + ip->ip_sum -= ~htons(IPTTLDEC << 8); + else + ip->ip_sum += htons(IPTTLDEC << 8); +#ifdef IPSTEALTH + } +#endif + + /* + * Find route to destination. + */ + if ((dst = ip_findroute(&ro, dest, m)) == NULL) + return NULL; /* icmp unreach already sent */ + ifp = ro.ro_rt->rt_ifp; + + /* + * Immediately drop blackholed traffic, and directed broadcasts + * for either the all-ones or all-zero subnet addresses on + * locally attached networks. + */ + if ((ro.ro_rt->rt_flags & (RTF_BLACKHOLE|RTF_BROADCAST)) != 0) + goto drop; + + /* + * Step 5: outgoing firewall packet processing + */ + + /* + * Run through list of hooks for output packets. + */ + if (!PFIL_HOOKED(&V_inet_pfil_hook)) + goto passout; + + if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, NULL) || m == NULL) { + goto drop; + } + + M_ASSERTVALID(m); + M_ASSERTPKTHDR(m); + + ip = mtod(m, struct ip *); + dest.s_addr = ip->ip_dst.s_addr; + + /* + * Destination address changed? + */ +#ifndef IPFIREWALL_FORWARD + if (odest.s_addr != dest.s_addr) { +#else + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + if (odest.s_addr != dest.s_addr || fwd_tag != NULL) { +#endif /* IPFIREWALL_FORWARD */ + /* + * Is it now for a local address on this host? 
+		 */
+#ifndef IPFIREWALL_FORWARD
+		if (in_localip(dest)) {
+#else
+		if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
+#endif /* IPFIREWALL_FORWARD */
+forwardlocal:
+			/*
+			 * Return packet for processing by ip_input().
+			 * Keep host byte order as expected at ip_input's
+			 * "ours"-label.
+			 */
+			m->m_flags |= M_FASTFWD_OURS;
+			if (ro.ro_rt)
+				RTFREE(ro.ro_rt);
+			return m;
+		}
+		/*
+		 * Redo route lookup with new destination address
+		 */
+#ifdef IPFIREWALL_FORWARD
+		if (fwd_tag) {
+			dest.s_addr = ((struct sockaddr_in *)
+			    (fwd_tag + 1))->sin_addr.s_addr;
+			m_tag_delete(m, fwd_tag);
+		}
+#endif /* IPFIREWALL_FORWARD */
+		RTFREE(ro.ro_rt);
+		if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+			return NULL;	/* icmp unreach already sent */
+		ifp = ro.ro_rt->rt_ifp;
+	}
+
+passout:
+	/*
+	 * Step 6: send off the packet
+	 */
+
+	/*
+	 * Check if route is dampened (when ARP is unable to resolve)
+	 */
+	if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
+	    (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
+	    time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		goto consumed;
+	}
+
+#ifndef ALTQ
+	/*
+	 * Check if there is enough space in the interface queue
+	 */
+	if ((ifp->if_snd.ifq_len + ip->ip_len / ifp->if_mtu + 1) >=
+	    ifp->if_snd.ifq_maxlen) {
+		IPSTAT_INC(ips_odropped);
+		/* would send source quench here but that is deprecated */
+		goto drop;
+	}
+#endif
+
+	/*
+	 * Check if media link state of interface is not down
+	 */
+	if (ifp->if_link_state == LINK_STATE_DOWN) {
+		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		goto consumed;
+	}
+
+	/*
+	 * Check if packet fits MTU or if hardware will fragment for us
+	 */
+	if (ro.ro_rt->rt_rmx.rmx_mtu)
+		mtu = min(ro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+	else
+		mtu = ifp->if_mtu;
+
+	if (ip->ip_len <= mtu ||
+	    (ifp->if_hwassist & CSUM_FRAGMENT && (ip->ip_off & IP_DF) == 0)) {
+		/*
+		 * Restore packet header fields to original values
+		 */
+		ip->ip_len = htons(ip->ip_len);
+		ip->ip_off = htons(ip->ip_off);
+		/*
+		 * Send off the packet via outgoing interface
+		 */
+		error = (*ifp->if_output)(ifp, m,
+		    (struct sockaddr *)dst, &ro);
+	} else {
+		/*
+		 * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
+		 */
+		if (ip->ip_off & IP_DF) {
+			IPSTAT_INC(ips_cantfrag);
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
+				0, mtu);
+			goto consumed;
+		} else {
+			/*
+			 * We have to fragment the packet
+			 */
+			m->m_pkthdr.csum_flags |= CSUM_IP;
+			/*
+			 * ip_fragment expects ip_len and ip_off in host byte
+			 * order but returns all packets in network byte order
+			 */
+			if (ip_fragment(ip, &m, mtu, ifp->if_hwassist,
+			    (~ifp->if_hwassist & CSUM_DELAY_IP))) {
+				goto drop;
+			}
+			KASSERT(m != NULL, ("null mbuf and no error"));
+			/*
+			 * Send off the fragments via outgoing interface
+			 */
+			error = 0;
+			do {
+				m0 = m->m_nextpkt;
+				m->m_nextpkt = NULL;
+
+				error = (*ifp->if_output)(ifp, m,
+				    (struct sockaddr *)dst, &ro);
+				if (error)
+					break;
+			} while ((m = m0) != NULL);
+			if (error) {
+				/* Reclaim remaining fragments */
+				for (m = m0; m; m = m0) {
+					m0 = m->m_nextpkt;
+					m_freem(m);
+				}
+			} else
+				IPSTAT_INC(ips_fragmented);
+		}
+	}
+
+	if (error != 0)
+		IPSTAT_INC(ips_odropped);
+	else {
+		ro.ro_rt->rt_rmx.rmx_pksent++;
+		IPSTAT_INC(ips_forward);
+		IPSTAT_INC(ips_fastforward);
+	}
+consumed:
+	RTFREE(ro.ro_rt);
+	return NULL;
+drop:
+	if (m)
+		m_freem(m);
+	if (ro.ro_rt)
+		RTFREE(ro.ro_rt);
+	return NULL;
+}
diff --git a/freebsd/sys/netinet/ip_fw.h b/freebsd/sys/netinet/ip_fw.h
new file mode 100644
index 00000000..cf5d8d03
--- /dev/null
+++ b/freebsd/sys/netinet/ip_fw.h
@@ -0,0 +1,579 @@
+/*-
+ * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IPFW2_H
+#define _IPFW2_H
+
+/*
+ * The default rule number.  By the design of ip_fw, the default rule
+ * is the last one, so its number can also serve as the highest number
+ * allowed for a rule.  The ip_fw code relies on both meanings of this
+ * constant.
+ */
+#define	IPFW_DEFAULT_RULE	65535
+
+/*
+ * The number of ipfw tables.  The maximum allowed table number is
+ * (IPFW_TABLES_MAX - 1).
+ */
+#define	IPFW_TABLES_MAX		128
+
+/*
+ * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
+ * argument between 1 and 65534.  The value 0 is unused, the value
+ * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
+ * result of the most recent table() lookup.
+ * Note that 16bit is only a historical limit, resulting from
+ * the use of a 16-bit field for that value.  In reality, we can have
+ * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
+ */
+#define	IPFW_ARG_MIN		1
+#define	IPFW_ARG_MAX		65534
+#define	IP_FW_TABLEARG		65535	/* XXX should use 0 */
+
+/*
+ * The kernel representation of ipfw rules is made of a list of
+ * 'instructions' (for all practical purposes equivalent to BPF
+ * instructions), which specify which fields of the packet
+ * (or its metadata) should be analysed.
+ *
+ * Each instruction is stored in a structure which begins with
+ * "ipfw_insn", and can contain extra fields depending on the
+ * instruction type (listed below).
+ * Note that the code is written so that individual instructions
+ * have a size which is a multiple of 32 bits.  This means that, if
+ * such structures contain pointers or other 64-bit entities,
+ * (there is just one instance now) they may end up unaligned on
+ * 64-bit architectures, so they must be handled with care.
+ *
+ * "enum ipfw_opcodes" are the opcodes supported.  We can have up
+ * to 256 different opcodes.
+ * When adding new opcodes, they should be appended to the end of the
+ * opcode list before O_LAST_OPCODE; this prevents the ABI from being
+ * broken.  Otherwise users will have to recompile ipfw(8) when they
+ * update the kernel.
+ */
+
+enum ipfw_opcodes {		/* arguments (4 byte each)	*/
+	O_NOP,
+
+	O_IP_SRC,		/* u32 = IP			*/
+	O_IP_SRC_MASK,		/* ip = IP/mask			*/
+	O_IP_SRC_ME,		/* none				*/
+	O_IP_SRC_SET,		/* u32=base, arg1=len, bitmap	*/
+
+	O_IP_DST,		/* u32 = IP			*/
+	O_IP_DST_MASK,		/* ip = IP/mask			*/
+	O_IP_DST_ME,		/* none				*/
+	O_IP_DST_SET,		/* u32=base, arg1=len, bitmap	*/
+
+	O_IP_SRCPORT,		/* (n)port list:mask 4 byte ea	*/
+	O_IP_DSTPORT,		/* (n)port list:mask 4 byte ea	*/
+	O_PROTO,		/* arg1=protocol		*/
+
+	O_MACADDR2,		/* 2 mac addr:mask		*/
+	O_MAC_TYPE,		/* same as srcport		*/
+
+	O_LAYER2,		/* none				*/
+	O_IN,			/* none				*/
+	O_FRAG,			/* none				*/
+
+	O_RECV,			/* none				*/
+	O_XMIT,			/* none				*/
+	O_VIA,			/* none				*/
+
+	O_IPOPT,		/* arg1 = 2*u8 bitmap		*/
+	O_IPLEN,		/* arg1 = len			*/
+	O_IPID,			/* arg1 = id			*/
+
+	O_IPTOS,		/* arg1 = id			*/
+	O_IPPRECEDENCE,		/* arg1 = precedence << 5	*/
+	O_IPTTL,		/* arg1 = TTL			*/
+
+	O_IPVER,		/* arg1 = version		*/
+	O_UID,			/* u32 = id			*/
+	O_GID,			/* u32 = id			*/
+	O_ESTAB,		/* none (tcp established)	*/
+	O_TCPFLAGS,		/* arg1 = 2*u8 bitmap		*/
+	O_TCPWIN,		/* arg1 = desired win		*/
+	O_TCPSEQ,		/* u32 = desired seq.		*/
+	O_TCPACK,		/* u32 = desired seq.		*/
+	O_ICMPTYPE,		/* u32 = icmp bitmap		*/
+	O_TCPOPTS,		/* arg1 = 2*u8 bitmap		*/
+
+	O_VERREVPATH,		/* none				*/
+	O_VERSRCREACH,		/* none				*/
+
+	O_PROBE_STATE,		/* none				*/
+	O_KEEP_STATE,		/* none				*/
+	O_LIMIT,		/* ipfw_insn_limit		*/
+	O_LIMIT_PARENT,		/* dyn_type, not an opcode.	*/
+
+	/*
+	 * These are really 'actions'.
+	 */
+
+	O_LOG,			/* ipfw_insn_log		*/
+	O_PROB,			/* u32 = match probability	*/
+
+	O_CHECK_STATE,		/* none				*/
+	O_ACCEPT,		/* none				*/
+	O_DENY,			/* none				*/
+	O_REJECT,		/* arg1=icmp arg (same as deny)	*/
+	O_COUNT,		/* none				*/
+	O_SKIPTO,		/* arg1=next rule number	*/
+	O_PIPE,			/* arg1=pipe number		*/
+	O_QUEUE,		/* arg1=queue number		*/
+	O_DIVERT,		/* arg1=port number		*/
+	O_TEE,			/* arg1=port number		*/
+	O_FORWARD_IP,		/* fwd sockaddr			*/
+	O_FORWARD_MAC,		/* fwd mac			*/
+	O_NAT,			/* nope				*/
+	O_REASS,		/* none				*/
+
+	/*
+	 * More opcodes.
+	 */
+	O_IPSEC,		/* has ipsec history		*/
+	O_IP_SRC_LOOKUP,	/* arg1=table number, u32=value	*/
+	O_IP_DST_LOOKUP,	/* arg1=table number, u32=value	*/
+	O_ANTISPOOF,		/* none				*/
+	O_JAIL,			/* u32 = id			*/
+	O_ALTQ,			/* u32 = altq classif. qid	*/
+	O_DIVERTED,		/* arg1=bitmap (1:loop, 2:out)	*/
+	O_TCPDATALEN,		/* arg1 = tcp data len		*/
+	O_IP6_SRC,		/* address without mask		*/
+	O_IP6_SRC_ME,		/* my addresses			*/
+	O_IP6_SRC_MASK,		/* address with the mask	*/
+	O_IP6_DST,
+	O_IP6_DST_ME,
+	O_IP6_DST_MASK,
+	O_FLOW6ID,		/* for flow id tag in the ipv6 pkt */
+	O_ICMP6TYPE,		/* icmp6 packet type filtering	*/
+	O_EXT_HDR,		/* filtering for ipv6 extension header */
+	O_IP6,
+
+	/*
+	 * actions for ng_ipfw
+	 */
+	O_NETGRAPH,		/* send to ng_ipfw		*/
+	O_NGTEE,		/* copy to ng_ipfw		*/
+
+	O_IP4,
+
+	O_UNREACH6,		/* arg1=icmpv6 code arg (deny)	*/
+
+	O_TAG,			/* arg1=tag number		*/
+	O_TAGGED,		/* arg1=tag number		*/
+
+	O_SETFIB,		/* arg1=FIB number		*/
+	O_FIB,			/* arg1=FIB desired fib number	*/
+
+	O_LAST_OPCODE		/* not an opcode!		*/
+};
+
+/*
+ * Extension headers are filtered only for presence, using a bit
+ * vector with a flag for each header.
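+ *
+ * e.g. an O_EXT_HDR instruction with arg1 = (EXT_AH | EXT_ESP) is
+ * meant to match packets carrying an AH or an ESP extension header
+ * (an illustrative reading of the flags; it is not part of the
+ * original comment).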
+ */ +#define EXT_FRAGMENT 0x1 +#define EXT_HOPOPTS 0x2 +#define EXT_ROUTING 0x4 +#define EXT_AH 0x8 +#define EXT_ESP 0x10 +#define EXT_DSTOPTS 0x20 +#define EXT_RTHDR0 0x40 +#define EXT_RTHDR2 0x80 + +/* + * Template for instructions. + * + * ipfw_insn is used for all instructions which require no operands, + * a single 16-bit value (arg1), or a couple of 8-bit values. + * + * For other instructions which require different/larger arguments + * we have derived structures, ipfw_insn_*. + * + * The size of the instruction (in 32-bit words) is in the low + * 6 bits of "len". The 2 remaining bits are used to implement + * NOT and OR on individual instructions. Given a type, you can + * compute the length to be put in "len" using F_INSN_SIZE(t) + * + * F_NOT negates the match result of the instruction. + * + * F_OR is used to build or blocks. By default, instructions + * are evaluated as part of a logical AND. An "or" block + * { X or Y or Z } contains F_OR set in all but the last + * instruction of the block. A match will cause the code + * to skip past the last instruction of the block. + * + * NOTA BENE: in a couple of places we assume that + * sizeof(ipfw_insn) == sizeof(u_int32_t) + * this needs to be fixed. + * + */ +typedef struct _ipfw_insn { /* template for instructions */ + u_int8_t opcode; + u_int8_t len; /* number of 32-bit words */ +#define F_NOT 0x80 +#define F_OR 0x40 +#define F_LEN_MASK 0x3f +#define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) + + u_int16_t arg1; +} ipfw_insn; + +/* + * The F_INSN_SIZE(type) computes the size, in 4-byte words, of + * a given type. + */ +#define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) + +/* + * This is used to store an array of 16-bit entries (ports etc.) + */ +typedef struct _ipfw_insn_u16 { + ipfw_insn o; + u_int16_t ports[2]; /* there may be more */ +} ipfw_insn_u16; + +/* + * This is used to store an array of 32-bit entries + * (uid, single IPv4 addresses etc.) + */ +typedef struct _ipfw_insn_u32 { + ipfw_insn o; + u_int32_t d[1]; /* one or more */ +} ipfw_insn_u32; + +/* + * This is used to store IP addr-mask pairs. + */ +typedef struct _ipfw_insn_ip { + ipfw_insn o; + struct in_addr addr; + struct in_addr mask; +} ipfw_insn_ip; + +/* + * This is used to forward to a given address (ip). + */ +typedef struct _ipfw_insn_sa { + ipfw_insn o; + struct sockaddr_in sa; +} ipfw_insn_sa; + +/* + * This is used for MAC addr-mask pairs. + */ +typedef struct _ipfw_insn_mac { + ipfw_insn o; + u_char addr[12]; /* dst[6] + src[6] */ + u_char mask[12]; /* dst[6] + src[6] */ +} ipfw_insn_mac; + +/* + * This is used for interface match rules (recv xx, xmit xx). + */ +typedef struct _ipfw_insn_if { + ipfw_insn o; + union { + struct in_addr ip; + int glob; + } p; + char name[IFNAMSIZ]; +} ipfw_insn_if; + +/* + * This is used for storing an altq queue id number. + */ +typedef struct _ipfw_insn_altq { + ipfw_insn o; + u_int32_t qid; +} ipfw_insn_altq; + +/* + * This is used for limit rules. + */ +typedef struct _ipfw_insn_limit { + ipfw_insn o; + u_int8_t _pad; + u_int8_t limit_mask; /* combination of DYN_* below */ +#define DYN_SRC_ADDR 0x1 +#define DYN_SRC_PORT 0x2 +#define DYN_DST_ADDR 0x4 +#define DYN_DST_PORT 0x8 + + u_int16_t conn_limit; +} ipfw_insn_limit; + +/* + * This is used for log instructions. 
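+ *
+ * e.g. a rule installed with "log logamount 100" would carry
+ * max_log = 100 and count log_left down as matches are logged (an
+ * illustrative mapping to the ipfw(8) syntax, not part of the
+ * original comment).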
+ */
+typedef struct	_ipfw_insn_log {
+	ipfw_insn o;
+	u_int32_t max_log;	/* how many do we log -- 0 = all */
+	u_int32_t log_left;	/* how many left to log		*/
+} ipfw_insn_log;
+
+/*
+ * Data structures required by both ipfw(8) and ipfw(4) but not part of the
+ * management API are protected by IPFW_INTERNAL.
+ */
+#ifdef IPFW_INTERNAL
+/* Server pool support (LSNAT). */
+struct cfg_spool {
+	LIST_ENTRY(cfg_spool)	_next;		/* chain of spool instances */
+	struct in_addr		addr;
+	u_short			port;
+};
+#endif
+
+/* Redirect modes id. */
+#define REDIR_ADDR	0x01
+#define REDIR_PORT	0x02
+#define REDIR_PROTO	0x04
+
+#ifdef IPFW_INTERNAL
+/* Nat redirect configuration. */
+struct cfg_redir {
+	LIST_ENTRY(cfg_redir)	_next;		/* chain of redir instances */
+	u_int16_t		mode;		/* type of redirect mode */
+	struct in_addr		laddr;		/* local ip address */
+	struct in_addr		paddr;		/* public ip address */
+	struct in_addr		raddr;		/* remote ip address */
+	u_short			lport;		/* local port */
+	u_short			pport;		/* public port */
+	u_short			rport;		/* remote port */
+	u_short			pport_cnt;	/* number of public ports */
+	u_short			rport_cnt;	/* number of remote ports */
+	int			proto;		/* protocol: tcp/udp */
+	struct alias_link	**alink;
+	/* num of entry in spool chain */
+	u_int16_t		spool_cnt;
+	/* chain of spool instances */
+	LIST_HEAD(spool_chain, cfg_spool) spool_chain;
+};
+#endif
+
+#define NAT_BUF_LEN	1024
+
+#ifdef IPFW_INTERNAL
+/* Nat configuration data struct. */
+struct cfg_nat {
+	/* chain of nat instances */
+	LIST_ENTRY(cfg_nat)	_next;
+	int			id;			/* nat id */
+	struct in_addr		ip;			/* nat ip address */
+	char			if_name[IF_NAMESIZE];	/* interface name */
+	int			mode;			/* aliasing mode */
+	struct libalias		*lib;			/* libalias instance */
+	/* number of entry in spool chain */
+	int			redir_cnt;
+	/* chain of redir instances */
+	LIST_HEAD(redir_chain, cfg_redir) redir_chain;
+};
+#endif
+
+#define SOF_NAT		sizeof(struct cfg_nat)
+#define SOF_REDIR	sizeof(struct cfg_redir)
+#define SOF_SPOOL	sizeof(struct cfg_spool)
+
+/* Nat command. */
+typedef struct	_ipfw_insn_nat {
+	ipfw_insn	o;
+	struct cfg_nat *nat;
+} ipfw_insn_nat;
+
+/* Apply ipv6 mask on ipv6 addr */
+#define APPLY_MASK(addr,mask)                          \
+    (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \
+    (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \
+    (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \
+    (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3];
+
+/* Structure for ipv6 */
+typedef struct _ipfw_insn_ip6 {
+	ipfw_insn o;
+	struct in6_addr addr6;
+	struct in6_addr mask6;
+} ipfw_insn_ip6;
+
+/* Used to support icmp6 types */
+typedef struct _ipfw_insn_icmp6 {
+	ipfw_insn o;
+	uint32_t d[7];	/* XXX This number is related to the netinet/icmp6.h
+			 * define ICMP6_MAXTYPE, as follows:
+			 * n = ICMP6_MAXTYPE/32 + 1 (ICMP6_MAXTYPE is
+			 * currently 203).
+			 */
+} ipfw_insn_icmp6;
+
+/*
+ * Here we have the structure representing an ipfw rule.
+ *
+ * It starts with a general area (with link fields and counters)
+ * followed by an array of one or more instructions, which the code
+ * accesses as an array of 32-bit values.
+ *
+ * Given a rule pointer r:
+ *
+ *  r->cmd		is the start of the first instruction.
+ *  ACTION_PTR(r)	is the start of the first action (things to do
+ *			once a rule matched).
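+ *
+ * an illustrative layout (not part of the original comment) for a
+ * rule that matches TCP and counts matches:
+ *
+ *	r->cmd[0]: O_PROTO, len = 1, arg1 = IPPROTO_TCP
+ *	r->cmd[1]: O_COUNT, len = 1	(ACTION_PTR(r), act_ofs = 1)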
+ * + * When assembling instruction, remember the following: + * + * + if a rule has a "keep-state" (or "limit") option, then the + * first instruction (at r->cmd) MUST BE an O_PROBE_STATE + * + if a rule has a "log" option, then the first action + * (at ACTION_PTR(r)) MUST be O_LOG + * + if a rule has an "altq" option, it comes after "log" + * + if a rule has an O_TAG option, it comes after "log" and "altq" + * + * NOTE: we use a simple linked list of rules because we never need + * to delete a rule without scanning the list. We do not use + * queue(3) macros for portability and readability. + */ + +struct ip_fw { + struct ip_fw *x_next; /* linked list of rules */ + struct ip_fw *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ +#define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + uint32_t id; /* rule id */ + + /* These fields are present in all rules. */ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + +#define ACTION_PTR(rule) \ + (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) + +#define RULESIZE(rule) (sizeof(struct ip_fw) + \ + ((struct ip_fw *)(rule))->cmd_len * 4 - 4) + +#if 1 // should be moved to in.h +/* + * This structure is used as a flow mask and a flow id for various + * parts of the code. + * addr_type is used in userland and kernel to mark the address type. + * fib is used in the kernel to record the fib in use. + * _flags is used in the kernel to store tcp flags for dynamic rules. + */ +struct ipfw_flow_id { + uint32_t dst_ip; + uint32_t src_ip; + uint16_t dst_port; + uint16_t src_port; + uint8_t fib; + uint8_t proto; + uint8_t _flags; /* protocol-specific flags */ + uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? */ + struct in6_addr dst_ip6; + struct in6_addr src_ip6; + uint32_t flow_id6; + uint32_t extra; /* queue/pipe or frag_id */ +}; +#endif + +#define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) + +/* + * Dynamic ipfw rule. + */ +typedef struct _ipfw_dyn_rule ipfw_dyn_rule; + +struct _ipfw_dyn_rule { + ipfw_dyn_rule *next; /* linked list of rules. */ + struct ip_fw *rule; /* pointer to rule */ + /* 'rule' is used to pass up the rule number (from the parent) */ + + ipfw_dyn_rule *parent; /* pointer to parent rule */ + u_int64_t pcnt; /* packet match counter */ + u_int64_t bcnt; /* byte match counter */ + struct ipfw_flow_id id; /* (masked) flow id */ + u_int32_t expire; /* expire time */ + u_int32_t bucket; /* which bucket in hash table */ + u_int32_t state; /* state of this rule (typically a + * combination of TCP flags) + */ + u_int32_t ack_fwd; /* most recent ACKs in forward */ + u_int32_t ack_rev; /* and reverse directions (used */ + /* to generate keepalives) */ + u_int16_t dyn_type; /* rule type */ + u_int16_t count; /* refcount */ +}; + +/* + * Definitions for IP option names. + */ +#define IP_FW_IPOPT_LSRR 0x01 +#define IP_FW_IPOPT_SSRR 0x02 +#define IP_FW_IPOPT_RR 0x04 +#define IP_FW_IPOPT_TS 0x08 + +/* + * Definitions for TCP option names. 
+ */ +#define IP_FW_TCPOPT_MSS 0x01 +#define IP_FW_TCPOPT_WINDOW 0x02 +#define IP_FW_TCPOPT_SACK 0x04 +#define IP_FW_TCPOPT_TS 0x08 +#define IP_FW_TCPOPT_CC 0x10 + +#define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ +#define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ + +/* + * These are used for lookup tables. + */ +typedef struct _ipfw_table_entry { + in_addr_t addr; /* network address */ + u_int32_t value; /* value */ + u_int16_t tbl; /* table number */ + u_int8_t masklen; /* mask length */ +} ipfw_table_entry; + +typedef struct _ipfw_table { + u_int32_t size; /* size of entries in bytes */ + u_int32_t cnt; /* # of entries */ + u_int16_t tbl; /* table number */ + ipfw_table_entry ent[0]; /* entries */ +} ipfw_table; + +#endif /* _IPFW2_H */ diff --git a/freebsd/sys/netinet/ip_gre.c b/freebsd/sys/netinet/ip_gre.c new file mode 100644 index 00000000..253376de --- /dev/null +++ b/freebsd/sys/netinet/ip_gre.c @@ -0,0 +1,336 @@ +#include + +/* $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $ */ + +/*- + * Copyright (c) 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Heiko W.Rupp + * + * IPv6-over-GRE contributed by Gert Doering + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * deencapsulate tunneled packets and send them on + * output half is in net/if_gre.[ch] + * This currently handles IPPROTO_GRE, IPPROTO_MOBILE + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef INET +#include +#include +#include +#include +#include +#include +#include +#else +#error ip_gre input without IP? 
+#endif
+
+#ifdef NETATALK
+#include
+#include
+#include
+#endif
+
+/* Needs IP headers. */
+#include
+
+#include
+
+#if 1
+void gre_inet_ntoa(struct in_addr in);	/* XXX */
+#endif
+
+static struct gre_softc *gre_lookup(struct mbuf *, u_int8_t);
+
+static struct mbuf *gre_input2(struct mbuf *, int, u_char);
+
+/*
+ * De-encapsulate a packet and feed it back through ip input (this
+ * routine is called whenever IP gets a packet with proto type
+ * IPPROTO_GRE and a local destination address).
+ * This really is simple
+ */
+void
+gre_input(struct mbuf *m, int off)
+{
+	int proto;
+
+	proto = (mtod(m, struct ip *))->ip_p;
+
+	m = gre_input2(m, off, proto);
+
+	/*
+	 * If no matching tunnel that is up is found, we inject
+	 * the mbuf to raw ip socket to see if anyone picks it up.
+	 */
+	if (m != NULL)
+		rip_input(m, off);
+}
+
+/*
+ * Decapsulate.  Does the real work and is called from gre_input()
+ * (above).  Returns an mbuf back if packet is not yet processed,
+ * and NULL if it needs no further processing.  proto is the protocol
+ * number of the "calling" foo_input() routine.
+ */
+static struct mbuf *
+gre_input2(struct mbuf *m, int hlen, u_char proto)
+{
+	struct greip *gip;
+	int isr;
+	struct gre_softc *sc;
+	u_int16_t flags;
+	u_int32_t af;
+
+	if ((sc = gre_lookup(m, proto)) == NULL) {
+		/* No matching tunnel or tunnel is down. */
+		return (m);
+	}
+
+	if (m->m_len < sizeof(*gip)) {
+		m = m_pullup(m, sizeof(*gip));
+		if (m == NULL)
+			return (NULL);
+	}
+	gip = mtod(m, struct greip *);
+
+	GRE2IFP(sc)->if_ipackets++;
+	GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
+
+	switch (proto) {
+	case IPPROTO_GRE:
+		hlen += sizeof(struct gre_h);
+
+		/* process GRE flags as packet can be of variable len */
+		flags = ntohs(gip->gi_flags);
+
+		/* Checksum & Offset are present */
+		if ((flags & GRE_CP) | (flags & GRE_RP))
+			hlen += 4;
+		/* We don't support routing fields (variable length) */
+		if (flags & GRE_RP)
+			return (m);
+		if (flags & GRE_KP)
+			hlen += 4;
+		if (flags & GRE_SP)
+			hlen += 4;
+
+		switch (ntohs(gip->gi_ptype)) { /* ethertypes */
+		case WCCP_PROTOCOL_TYPE:
+			if (sc->wccp_ver == WCCP_V2)
+				hlen += 4;
+			/* FALLTHROUGH */
+		case ETHERTYPE_IP:	/* shouldn't need a schednetisr(), */
+			isr = NETISR_IP;/* as we are in ip_input */
+			af = AF_INET;
+			break;
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+			isr = NETISR_IPV6;
+			af = AF_INET6;
+			break;
+#endif
+#ifdef NETATALK
+		case ETHERTYPE_ATALK:
+			isr = NETISR_ATALK1;
+			af = AF_APPLETALK;
+			break;
+#endif
+		default:
+			/* Others not yet supported. */
+			return (m);
+		}
+		break;
+	default:
+		/* Others not yet supported. */
+		return (m);
+	}
+
+	if (hlen > m->m_pkthdr.len) {
+		m_freem(m);
+		return (NULL);
+	}
+	/* Unlike NetBSD, in FreeBSD m_adj() adjusts m->m_pkthdr.len as well */
+	m_adj(m, hlen);
+
+	if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
+		bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
+	}
+
+	m->m_pkthdr.rcvif = GRE2IFP(sc);
+
+	netisr_queue(isr, m);
+
+	/* Packet is done, no further processing needed. */
+	return (NULL);
+}
+
+/*
+ * input routine for IPPROTO_MOBILE
+ * This is a little bit different from the other modes, as the
+ * encapsulating header was not prepended, but instead inserted
+ * between IP header and payload
+ */
+
+void
+gre_mobile_input(struct mbuf *m, int hlen)
+{
+	struct ip *ip;
+	struct mobip_h *mip;
+	struct gre_softc *sc;
+	int msiz;
+
+	if ((sc = gre_lookup(m, IPPROTO_MOBILE)) == NULL) {
+		/* No matching tunnel or tunnel is down. */
+		m_freem(m);
+		return;
+	}
+
+	if (m->m_len < sizeof(*mip)) {
+		m = m_pullup(m, sizeof(*mip));
+		if (m == NULL)
+			return;
+	}
+	ip = mtod(m, struct ip *);
+	mip = mtod(m, struct mobip_h *);
+
+	GRE2IFP(sc)->if_ipackets++;
+	GRE2IFP(sc)->if_ibytes += m->m_pkthdr.len;
+
+	if (ntohs(mip->mh.proto) & MOB_HH_SBIT) {
+		msiz = MOB_HH_SIZ_L;
+		mip->mi.ip_src.s_addr = mip->mh.osrc;
+	} else
+		msiz = MOB_HH_SIZ_S;
+
+	if (m->m_len < (ip->ip_hl << 2) + msiz) {
+		m = m_pullup(m, (ip->ip_hl << 2) + msiz);
+		if (m == NULL)
+			return;
+		ip = mtod(m, struct ip *);
+		mip = mtod(m, struct mobip_h *);
+	}
+
+	mip->mi.ip_dst.s_addr = mip->mh.odst;
+	mip->mi.ip_p = (ntohs(mip->mh.proto) >> 8);
+
+	if (gre_in_cksum((u_int16_t *)&mip->mh, msiz) != 0) {
+		m_freem(m);
+		return;
+	}
+
+	bcopy((caddr_t)(ip) + (ip->ip_hl << 2) + msiz, (caddr_t)(ip) +
+	    (ip->ip_hl << 2), m->m_len - msiz - (ip->ip_hl << 2));
+	m->m_len -= msiz;
+	m->m_pkthdr.len -= msiz;
+
+	/*
+	 * On FreeBSD, rip_input() supplies us with ip->ip_len
+	 * already converted into host byteorder and also decreases
+	 * it by the length of the IP header; however, ip_input() expects
+	 * that this field is in the original format (network byteorder
+	 * and full size of IP packet), so adjust it accordingly.
+	 */
+	ip->ip_len = htons(ip->ip_len + sizeof(struct ip) - msiz);
+
+	ip->ip_sum = 0;
+	ip->ip_sum = in_cksum(m, (ip->ip_hl << 2));
+
+	if (bpf_peers_present(GRE2IFP(sc)->if_bpf)) {
+		u_int32_t af = AF_INET;
+		bpf_mtap2(GRE2IFP(sc)->if_bpf, &af, sizeof(af), m);
+	}
+
+	m->m_pkthdr.rcvif = GRE2IFP(sc);
+
+	netisr_queue(NETISR_IP, m);
+}
+
+/*
+ * Find the gre interface associated with our src/dst/proto set.
+ *
+ * XXXRW: Need some sort of drain/refcount mechanism so that the softc
+ * reference remains valid after it's returned from gre_lookup().  Right
+ * now, I'm thinking it should be reference-counted with a gre_dropref()
+ * when the caller is done with the softc.  This is complicated by how
+ * to handle destroying the gre softc; probably using a gre_drain() in
+ * in_gre.c during destroy.
+ */
+static struct gre_softc *
+gre_lookup(struct mbuf *m, u_int8_t proto)
+{
+	struct ip *ip = mtod(m, struct ip *);
+	struct gre_softc *sc;
+
+	mtx_lock(&gre_mtx);
+	for (sc = LIST_FIRST(&gre_softc_list); sc != NULL;
+	     sc = LIST_NEXT(sc, sc_list)) {
+		if ((sc->g_dst.s_addr == ip->ip_src.s_addr) &&
+		    (sc->g_src.s_addr == ip->ip_dst.s_addr) &&
+		    (sc->g_proto == proto) &&
+		    ((GRE2IFP(sc)->if_flags & IFF_UP) != 0)) {
+			mtx_unlock(&gre_mtx);
+			return (sc);
+		}
+	}
+	mtx_unlock(&gre_mtx);
+
+	return (NULL);
+}
diff --git a/freebsd/sys/netinet/ip_gre.h b/freebsd/sys/netinet/ip_gre.h
new file mode 100644
index 00000000..1fb67d93
--- /dev/null
+++ b/freebsd/sys/netinet/ip_gre.h
@@ -0,0 +1,43 @@
+/* $NetBSD: ip_gre.h,v 1.5 2002/06/09 16:33:40 itojun Exp $ */
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (c) 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Heiko W.Rupp
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3.
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef _KERNEL +void gre_input(struct mbuf *, int); +void gre_mobile_input(struct mbuf *, int); +#endif /* _KERNEL */ diff --git a/freebsd/sys/netinet/ip_icmp.c b/freebsd/sys/netinet/ip_icmp.c new file mode 100644 index 00000000..b7a83128 --- /dev/null +++ b/freebsd/sys/netinet/ip_icmp.c @@ -0,0 +1,986 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_icmp.c 8.2 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef IPSEC +#include +#include +#endif + +#include + +#include + +/* + * ICMP routines: error generation, receive packet processing, and + * routines to turnaround packets back to the originator, and + * host table maintenance routines. + */ +VNET_DEFINE(struct icmpstat, icmpstat); +SYSCTL_VNET_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RW, + &VNET_NAME(icmpstat), icmpstat, ""); + +static VNET_DEFINE(int, icmpmaskrepl) = 0; +#define V_icmpmaskrepl VNET(icmpmaskrepl) +SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, + &VNET_NAME(icmpmaskrepl), 0, + "Reply to ICMP Address Mask Request packets."); + +static VNET_DEFINE(u_int, icmpmaskfake) = 0; +#define V_icmpmaskfake VNET(icmpmaskfake) +SYSCTL_VNET_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_RW, + &VNET_NAME(icmpmaskfake), 0, + "Fake reply to ICMP Address Mask Request packets."); + +static VNET_DEFINE(int, drop_redirect) = 0; +#define V_drop_redirect VNET(drop_redirect) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, + &VNET_NAME(drop_redirect), 0, + "Ignore ICMP redirects"); + +static VNET_DEFINE(int, log_redirect) = 0; +#define V_log_redirect VNET(log_redirect) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, + &VNET_NAME(log_redirect), 0, + "Log ICMP redirects to the console"); + +static VNET_DEFINE(int, icmplim) = 200; +#define V_icmplim VNET(icmplim) +SYSCTL_VNET_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, + &VNET_NAME(icmplim), 0, + "Maximum number of ICMP responses per second"); + +static VNET_DEFINE(int, icmplim_output) = 1; +#define V_icmplim_output VNET(icmplim_output) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_RW, + &VNET_NAME(icmplim_output), 0, + "Enable rate limiting of ICMP responses"); + +static VNET_DEFINE(char, reply_src[IFNAMSIZ]); +#define V_reply_src VNET(reply_src) +SYSCTL_VNET_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_RW, + &VNET_NAME(reply_src), IFNAMSIZ, + "icmp reply source for non-local packets."); + +static VNET_DEFINE(int, icmp_rfi) = 0; +#define V_icmp_rfi VNET(icmp_rfi) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_RW, + &VNET_NAME(icmp_rfi), 0, + "ICMP reply from incoming interface for non-local packets"); + +static VNET_DEFINE(int, icmp_quotelen) = 8; +#define V_icmp_quotelen VNET(icmp_quotelen) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_RW, + &VNET_NAME(icmp_quotelen), 0, + "Number of bytes from original packet to quote in ICMP reply"); + +/* + * ICMP broadcast echo sysctl + */ +static VNET_DEFINE(int, icmpbmcastecho) = 0; +#define V_icmpbmcastecho VNET(icmpbmcastecho) +SYSCTL_VNET_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, + &VNET_NAME(icmpbmcastecho), 0, + ""); + + +#ifdef ICMPPRINTFS +int icmpprintfs = 0; +#endif + +static void icmp_reflect(struct mbuf *); +static void icmp_send(struct mbuf *, struct mbuf *); + +extern struct protosw inetsw[]; + +/* + * Kernel module interface for updating icmpstat. The argument is an index + * into icmpstat treated as an array of u_long. 
While this encodes the
+ * general layout of icmpstat into the caller, it doesn't encode its
+ * location, so that future changes to add, for example, per-CPU stats
+ * support won't cause binary compatibility problems for kernel modules.
+ */
+void
+kmod_icmpstat_inc(int statnum)
+{
+
+        (*((u_long *)&V_icmpstat + statnum))++;
+}
+
+/*
+ * Generate an error packet of type error
+ * in response to bad packet ip.
+ */
+void
+icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
+{
+        register struct ip *oip = mtod(n, struct ip *), *nip;
+        register unsigned oiphlen = oip->ip_hl << 2;
+        register struct icmp *icp;
+        register struct mbuf *m;
+        unsigned icmplen, icmpelen, nlen;
+
+        KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type", __func__));
+#ifdef ICMPPRINTFS
+        if (icmpprintfs)
+                printf("icmp_error(%p, %x, %d)\n", oip, type, code);
+#endif
+        if (type != ICMP_REDIRECT)
+                ICMPSTAT_INC(icps_error);
+        /*
+         * Don't send error:
+         *  if the original packet was encrypted.
+         *  if not the first fragment of message.
+         *  in response to a multicast or broadcast packet.
+         *  if the old packet protocol was an ICMP error message.
+         */
+        if (n->m_flags & M_DECRYPTED)
+                goto freeit;
+        if (oip->ip_off & ~(IP_MF|IP_DF))
+                goto freeit;
+        if (n->m_flags & (M_BCAST|M_MCAST))
+                goto freeit;
+        if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
+            n->m_len >= oiphlen + ICMP_MINLEN &&
+            !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip + oiphlen))->icmp_type)) {
+                ICMPSTAT_INC(icps_oldicmp);
+                goto freeit;
+        }
+        /* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */
+        if (oiphlen + 8 > n->m_len)
+                goto freeit;
+        /*
+         * Calculate length to quote from original packet and
+         * prevent the ICMP mbuf from overflowing.
+         * Unfortunately this is non-trivial since ip_forward()
+         * sends us truncated packets.
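+         * (Note: this is why the TCP branch below re-checks, before each
+         * access, that the bytes it wants to quote are actually resident
+         * in the mbuf chain, falling back to the standard 8-byte quote
+         * when they are not.)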
+ */ + nlen = m_length(n, NULL); + if (oip->ip_p == IPPROTO_TCP) { + struct tcphdr *th; + int tcphlen; + + if (oiphlen + sizeof(struct tcphdr) > n->m_len && + n->m_next == NULL) + goto stdreply; + if (n->m_len < oiphlen + sizeof(struct tcphdr) && + ((n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)) + goto freeit; + th = (struct tcphdr *)((caddr_t)oip + oiphlen); + tcphlen = th->th_off << 2; + if (tcphlen < sizeof(struct tcphdr)) + goto freeit; + if (oip->ip_len < oiphlen + tcphlen) + goto freeit; + if (oiphlen + tcphlen > n->m_len && n->m_next == NULL) + goto stdreply; + if (n->m_len < oiphlen + tcphlen && + ((n = m_pullup(n, oiphlen + tcphlen)) == NULL)) + goto freeit; + icmpelen = max(tcphlen, min(V_icmp_quotelen, oip->ip_len - oiphlen)); + } else +stdreply: icmpelen = max(8, min(V_icmp_quotelen, oip->ip_len - oiphlen)); + + icmplen = min(oiphlen + icmpelen, nlen); + if (icmplen < sizeof(struct ip)) + goto freeit; + + if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen) + m = m_gethdr(M_DONTWAIT, MT_DATA); + else + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + goto freeit; +#ifdef MAC + mac_netinet_icmp_reply(n, m); +#endif + icmplen = min(icmplen, M_TRAILINGSPACE(m) - sizeof(struct ip) - ICMP_MINLEN); + m_align(m, ICMP_MINLEN + icmplen); + m->m_len = ICMP_MINLEN + icmplen; + + /* XXX MRT make the outgoing packet use the same FIB + * that was associated with the incoming packet + */ + M_SETFIB(m, M_GETFIB(n)); + icp = mtod(m, struct icmp *); + ICMPSTAT_INC(icps_outhist[type]); + icp->icmp_type = type; + if (type == ICMP_REDIRECT) + icp->icmp_gwaddr.s_addr = dest; + else { + icp->icmp_void = 0; + /* + * The following assignments assume an overlay with the + * just zeroed icmp_void field. + */ + if (type == ICMP_PARAMPROB) { + icp->icmp_pptr = code; + code = 0; + } else if (type == ICMP_UNREACH && + code == ICMP_UNREACH_NEEDFRAG && mtu) { + icp->icmp_nextmtu = htons(mtu); + } + } + icp->icmp_code = code; + + /* + * Copy the quotation into ICMP message and + * convert quoted IP header back to network representation. + */ + m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip); + nip = &icp->icmp_ip; + nip->ip_len = htons(nip->ip_len); + nip->ip_off = htons(nip->ip_off); + + /* + * Set up ICMP message mbuf and copy old IP header (without options + * in front of ICMP message. + * If the original mbuf was meant to bypass the firewall, the error + * reply should bypass as well. + */ + m->m_flags |= n->m_flags & M_SKIP_FIREWALL; + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + m->m_pkthdr.len = m->m_len; + m->m_pkthdr.rcvif = n->m_pkthdr.rcvif; + nip = mtod(m, struct ip *); + bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip)); + nip->ip_len = m->m_len; + nip->ip_v = IPVERSION; + nip->ip_hl = 5; + nip->ip_p = IPPROTO_ICMP; + nip->ip_tos = 0; + icmp_reflect(m); + +freeit: + m_freem(n); +} + +/* + * Process a received ICMP message. + */ +void +icmp_input(struct mbuf *m, int off) +{ + struct icmp *icp; + struct in_ifaddr *ia; + struct ip *ip = mtod(m, struct ip *); + struct sockaddr_in icmpsrc, icmpdst, icmpgw; + int hlen = off; + int icmplen = ip->ip_len; + int i, code; + void (*ctlfunc)(int, struct sockaddr *, void *); + int fibnum; + + /* + * Locate icmp structure in mbuf, and check + * that not corrupted and of at least minimum length. 
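+         * (Note: m_data/m_len are temporarily advanced past the IP header
+         * below so that in_cksum() runs over the ICMP message only, and
+         * are restored immediately afterwards.)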
+ */ +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_src)); + printf("icmp_input from %s to %s, len %d\n", + buf, inet_ntoa(ip->ip_dst), icmplen); + } +#endif + if (icmplen < ICMP_MINLEN) { + ICMPSTAT_INC(icps_tooshort); + goto freeit; + } + i = hlen + min(icmplen, ICMP_ADVLENMIN); + if (m->m_len < i && (m = m_pullup(m, i)) == NULL) { + ICMPSTAT_INC(icps_tooshort); + return; + } + ip = mtod(m, struct ip *); + m->m_len -= hlen; + m->m_data += hlen; + icp = mtod(m, struct icmp *); + if (in_cksum(m, icmplen)) { + ICMPSTAT_INC(icps_checksum); + goto freeit; + } + m->m_len += hlen; + m->m_data -= hlen; + + if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type == IFT_FAITH) { + /* + * Deliver very specific ICMP type only. + */ + switch (icp->icmp_type) { + case ICMP_UNREACH: + case ICMP_TIMXCEED: + break; + default: + goto freeit; + } + } + +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_input, type %d code %d\n", icp->icmp_type, + icp->icmp_code); +#endif + + /* + * Message type specific processing. + */ + if (icp->icmp_type > ICMP_MAXTYPE) + goto raw; + + /* Initialize */ + bzero(&icmpsrc, sizeof(icmpsrc)); + icmpsrc.sin_len = sizeof(struct sockaddr_in); + icmpsrc.sin_family = AF_INET; + bzero(&icmpdst, sizeof(icmpdst)); + icmpdst.sin_len = sizeof(struct sockaddr_in); + icmpdst.sin_family = AF_INET; + bzero(&icmpgw, sizeof(icmpgw)); + icmpgw.sin_len = sizeof(struct sockaddr_in); + icmpgw.sin_family = AF_INET; + + ICMPSTAT_INC(icps_inhist[icp->icmp_type]); + code = icp->icmp_code; + switch (icp->icmp_type) { + + case ICMP_UNREACH: + switch (code) { + case ICMP_UNREACH_NET: + case ICMP_UNREACH_HOST: + case ICMP_UNREACH_SRCFAIL: + case ICMP_UNREACH_NET_UNKNOWN: + case ICMP_UNREACH_HOST_UNKNOWN: + case ICMP_UNREACH_ISOLATED: + case ICMP_UNREACH_TOSNET: + case ICMP_UNREACH_TOSHOST: + case ICMP_UNREACH_HOST_PRECEDENCE: + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + code = PRC_UNREACH_NET; + break; + + case ICMP_UNREACH_NEEDFRAG: + code = PRC_MSGSIZE; + break; + + /* + * RFC 1122, Sections 3.2.2.1 and 4.2.3.9. + * Treat subcodes 2,3 as immediate RST + */ + case ICMP_UNREACH_PROTOCOL: + case ICMP_UNREACH_PORT: + code = PRC_UNREACH_PORT; + break; + + case ICMP_UNREACH_NET_PROHIB: + case ICMP_UNREACH_HOST_PROHIB: + case ICMP_UNREACH_FILTER_PROHIB: + code = PRC_UNREACH_ADMIN_PROHIB; + break; + + default: + goto badcode; + } + goto deliver; + + case ICMP_TIMXCEED: + if (code > 1) + goto badcode; + code += PRC_TIMXCEED_INTRANS; + goto deliver; + + case ICMP_PARAMPROB: + if (code > 1) + goto badcode; + code = PRC_PARAMPROB; + goto deliver; + + case ICMP_SOURCEQUENCH: + if (code) + goto badcode; + code = PRC_QUENCH; + deliver: + /* + * Problem with datagram; advise higher level routines. + */ + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { + ICMPSTAT_INC(icps_badlen); + goto freeit; + } + icp->icmp_ip.ip_len = ntohs(icp->icmp_ip.ip_len); + /* Discard ICMP's in response to multicast packets */ + if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr))) + goto badcode; +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + /* + * XXX if the packet contains [IPv4 AH TCP], we can't make a + * notification to TCP layer. 
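+         * (Note: the quoted inner header's ip_p selects the upper-layer
+         * pr_ctlinput handler via ip_protox[] below; with AH in between,
+         * ip_p names AH rather than TCP.)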
+ */ + ctlfunc = inetsw[ip_protox[icp->icmp_ip.ip_p]].pr_ctlinput; + if (ctlfunc) + (*ctlfunc)(code, (struct sockaddr *)&icmpsrc, + (void *)&icp->icmp_ip); + break; + + badcode: + ICMPSTAT_INC(icps_badcode); + break; + + case ICMP_ECHO: + if (!V_icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + ICMPSTAT_INC(icps_bmcastecho); + break; + } + icp->icmp_type = ICMP_ECHOREPLY; + if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0) + goto freeit; + else + goto reflect; + + case ICMP_TSTAMP: + if (!V_icmpbmcastecho + && (m->m_flags & (M_MCAST | M_BCAST)) != 0) { + ICMPSTAT_INC(icps_bmcasttstamp); + break; + } + if (icmplen < ICMP_TSLEN) { + ICMPSTAT_INC(icps_badlen); + break; + } + icp->icmp_type = ICMP_TSTAMPREPLY; + icp->icmp_rtime = iptime(); + icp->icmp_ttime = icp->icmp_rtime; /* bogus, do later! */ + if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0) + goto freeit; + else + goto reflect; + + case ICMP_MASKREQ: + if (V_icmpmaskrepl == 0) + break; + /* + * We are not able to respond with all ones broadcast + * unless we receive it over a point-to-point interface. + */ + if (icmplen < ICMP_MASKLEN) + break; + switch (ip->ip_dst.s_addr) { + + case INADDR_BROADCAST: + case INADDR_ANY: + icmpdst.sin_addr = ip->ip_src; + break; + + default: + icmpdst.sin_addr = ip->ip_dst; + } + ia = (struct in_ifaddr *)ifaof_ifpforaddr( + (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); + if (ia == NULL) + break; + if (ia->ia_ifp == NULL) { + ifa_free(&ia->ia_ifa); + break; + } + icp->icmp_type = ICMP_MASKREPLY; + if (V_icmpmaskfake == 0) + icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr; + else + icp->icmp_mask = V_icmpmaskfake; + if (ip->ip_src.s_addr == 0) { + if (ia->ia_ifp->if_flags & IFF_BROADCAST) + ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr; + else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) + ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; + } + ifa_free(&ia->ia_ifa); +reflect: + ip->ip_len += hlen; /* since ip_input deducts this */ + ICMPSTAT_INC(icps_reflect); + ICMPSTAT_INC(icps_outhist[icp->icmp_type]); + icmp_reflect(m); + return; + + case ICMP_REDIRECT: + if (V_log_redirect) { + u_long src, dst, gw; + + src = ntohl(ip->ip_src.s_addr); + dst = ntohl(icp->icmp_ip.ip_dst.s_addr); + gw = ntohl(icp->icmp_gwaddr.s_addr); + printf("icmp redirect from %d.%d.%d.%d: " + "%d.%d.%d.%d => %d.%d.%d.%d\n", + (int)(src >> 24), (int)((src >> 16) & 0xff), + (int)((src >> 8) & 0xff), (int)(src & 0xff), + (int)(dst >> 24), (int)((dst >> 16) & 0xff), + (int)((dst >> 8) & 0xff), (int)(dst & 0xff), + (int)(gw >> 24), (int)((gw >> 16) & 0xff), + (int)((gw >> 8) & 0xff), (int)(gw & 0xff)); + } + /* + * RFC1812 says we must ignore ICMP redirects if we + * are acting as router. + */ + if (V_drop_redirect || V_ipforwarding) + break; + if (code > 3) + goto badcode; + if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) || + icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) { + ICMPSTAT_INC(icps_badlen); + break; + } + /* + * Short circuit routing redirects to force + * immediate change in the kernel's routing + * tables. The message is also handed to anyone + * listening on a raw socket (e.g. the routing + * daemon for use in updating its tables). 
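+         * (Note: the redirect is applied to every FIB in the loop below,
+         * since the ICMP message does not identify which forwarding table
+         * the affected route lives in.)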
+ */ + icmpgw.sin_addr = ip->ip_src; + icmpdst.sin_addr = icp->icmp_gwaddr; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(icp->icmp_ip.ip_dst)); + + printf("redirect dst %s to %s\n", + buf, inet_ntoa(icp->icmp_gwaddr)); + } +#endif + icmpsrc.sin_addr = icp->icmp_ip.ip_dst; + for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) { + in_rtredirect((struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, + (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, fibnum); + } + pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); +#ifdef IPSEC + key_sa_routechange((struct sockaddr *)&icmpsrc); +#endif + break; + + /* + * No kernel processing for the following; + * just fall through to send to raw listener. + */ + case ICMP_ECHOREPLY: + case ICMP_ROUTERADVERT: + case ICMP_ROUTERSOLICIT: + case ICMP_TSTAMPREPLY: + case ICMP_IREQREPLY: + case ICMP_MASKREPLY: + default: + break; + } + +raw: + rip_input(m, off); + return; + +freeit: + m_freem(m); +} + +/* + * Reflect the ip packet back to the source + */ +static void +icmp_reflect(struct mbuf *m) +{ + struct ip *ip = mtod(m, struct ip *); + struct ifaddr *ifa; + struct ifnet *ifp; + struct in_ifaddr *ia; + struct in_addr t; + struct mbuf *opts = 0; + int optlen = (ip->ip_hl << 2) - sizeof(struct ip); + + if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) || + IN_ZERONET(ntohl(ip->ip_src.s_addr)) ) { + m_freem(m); /* Bad return address */ + ICMPSTAT_INC(icps_badaddr); + goto done; /* Ip_output() will check for broadcast */ + } + + t = ip->ip_dst; + ip->ip_dst = ip->ip_src; + + /* + * Source selection for ICMP replies: + * + * If the incoming packet was addressed directly to one of our + * own addresses, use dst as the src for the reply. + */ + IN_IFADDR_RLOCK(); + LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { + t = IA_SIN(ia)->sin_addr; + IN_IFADDR_RUNLOCK(); + goto match; + } + } + IN_IFADDR_RUNLOCK(); + + /* + * If the incoming packet was addressed to one of our broadcast + * addresses, use the first non-broadcast address which corresponds + * to the incoming interface. + */ + ifp = m->m_pkthdr.rcvif; + if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + t.s_addr) { + t = IA_SIN(ia)->sin_addr; + IF_ADDR_UNLOCK(ifp); + goto match; + } + } + IF_ADDR_UNLOCK(ifp); + } + /* + * If the packet was transiting through us, use the address of + * the interface the packet came through in. If that interface + * doesn't have a suitable IP address, the normal selection + * criteria apply. + */ + if (V_icmp_rfi && ifp != NULL) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + t = IA_SIN(ia)->sin_addr; + IF_ADDR_UNLOCK(ifp); + goto match; + } + IF_ADDR_UNLOCK(ifp); + } + /* + * If the incoming packet was not addressed directly to us, use + * designated interface for icmp replies specified by sysctl + * net.inet.icmp.reply_src (default not set). Otherwise continue + * with normal source selection. 
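+         * (Note: reply_src holds an interface name; ifunit() resolves it
+         * below and the first AF_INET address on that interface is used.
+         * A name such as "em0" is only an illustrative value.)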
+ */ + if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + t = IA_SIN(ia)->sin_addr; + IF_ADDR_UNLOCK(ifp); + goto match; + } + IF_ADDR_UNLOCK(ifp); + } + /* + * If the packet was transiting through us, use the address of + * the interface that is the closest to the packet source. + * When we don't have a route back to the packet source, stop here + * and drop the packet. + */ + ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); + if (ia == NULL) { + m_freem(m); + ICMPSTAT_INC(icps_noroute); + goto done; + } + t = IA_SIN(ia)->sin_addr; + ifa_free(&ia->ia_ifa); +match: +#ifdef MAC + mac_netinet_icmp_replyinplace(m); +#endif + ip->ip_src = t; + ip->ip_ttl = V_ip_defttl; + + if (optlen > 0) { + register u_char *cp; + int opt, cnt; + u_int len; + + /* + * Retrieve any source routing from the incoming packet; + * add on any record-route or timestamp options. + */ + cp = (u_char *) (ip + 1); + if ((opts = ip_srcroute(m)) == 0 && + (opts = m_gethdr(M_DONTWAIT, MT_DATA))) { + opts->m_len = sizeof(struct in_addr); + mtod(opts, struct in_addr *)->s_addr = 0; + } + if (opts) { +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("icmp_reflect optlen %d rt %d => ", + optlen, opts->m_len); +#endif + for (cnt = optlen; cnt > 0; cnt -= len, cp += len) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + len = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + break; + len = cp[IPOPT_OLEN]; + if (len < IPOPT_OLEN + sizeof(*cp) || + len > cnt) + break; + } + /* + * Should check for overflow, but it "can't happen" + */ + if (opt == IPOPT_RR || opt == IPOPT_TS || + opt == IPOPT_SECURITY) { + bcopy((caddr_t)cp, + mtod(opts, caddr_t) + opts->m_len, len); + opts->m_len += len; + } + } + /* Terminate & pad, if necessary */ + cnt = opts->m_len % 4; + if (cnt) { + for (; cnt < 4; cnt++) { + *(mtod(opts, caddr_t) + opts->m_len) = + IPOPT_EOL; + opts->m_len++; + } + } +#ifdef ICMPPRINTFS + if (icmpprintfs) + printf("%d\n", opts->m_len); +#endif + } + /* + * Now strip out original options by copying rest of first + * mbuf's data back, and adjust the IP length. + */ + ip->ip_len -= optlen; + ip->ip_v = IPVERSION; + ip->ip_hl = 5; + m->m_len -= optlen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= optlen; + optlen += sizeof(struct ip); + bcopy((caddr_t)ip + optlen, (caddr_t)(ip + 1), + (unsigned)(m->m_len - sizeof(struct ip))); + } + m_tag_delete_nonpersistent(m); + m->m_flags &= ~(M_BCAST|M_MCAST); + icmp_send(m, opts); +done: + if (opts) + (void)m_free(opts); +} + +/* + * Send an icmp packet back to the ip level, + * after supplying a checksum. + */ +static void +icmp_send(struct mbuf *m, struct mbuf *opts) +{ + register struct ip *ip = mtod(m, struct ip *); + register int hlen; + register struct icmp *icp; + + hlen = ip->ip_hl << 2; + m->m_data += hlen; + m->m_len -= hlen; + icp = mtod(m, struct icmp *); + icp->icmp_cksum = 0; + icp->icmp_cksum = in_cksum(m, ip->ip_len - hlen); + m->m_data -= hlen; + m->m_len += hlen; + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef ICMPPRINTFS + if (icmpprintfs) { + char buf[4 * sizeof "123"]; + strcpy(buf, inet_ntoa(ip->ip_dst)); + printf("icmp_send dst %s src %s\n", + buf, inet_ntoa(ip->ip_src)); + } +#endif + (void) ip_output(m, opts, NULL, 0, NULL, NULL); +} + +/* + * Return milliseconds since 00:00 GMT in network format. 
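+ *
+ * (Added worked example: at 01:02:03.004 GMT, atv.tv_sec % (24*60*60)
+ * is 3723 and atv.tv_usec / 1000 is 4, so the value returned below is
+ * htonl(3723 * 1000 + 4), i.e. htonl(3723004).)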
+ */ +uint32_t +iptime(void) +{ + struct timeval atv; + u_long t; + + getmicrotime(&atv); + t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000; + return (htonl(t)); +} + +/* + * Return the next larger or smaller MTU plateau (table from RFC 1191) + * given current value MTU. If DIR is less than zero, a larger plateau + * is returned; otherwise, a smaller value is returned. + */ +int +ip_next_mtu(int mtu, int dir) +{ + static int mtutab[] = { + 65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508, + 296, 68, 0 + }; + int i, size; + + size = (sizeof mtutab) / (sizeof mtutab[0]); + if (dir >= 0) { + for (i = 0; i < size; i++) + if (mtu > mtutab[i]) + return mtutab[i]; + } else { + for (i = size - 1; i >= 0; i--) + if (mtu < mtutab[i]) + return mtutab[i]; + if (mtu == mtutab[0]) + return mtutab[0]; + } + return 0; +} + + +/* + * badport_bandlim() - check for ICMP bandwidth limit + * + * Return 0 if it is ok to send an ICMP error response, -1 if we have + * hit our bandwidth limit and it is not ok. + * + * If icmplim is <= 0, the feature is disabled and 0 is returned. + * + * For now we separate the TCP and UDP subsystems w/ different 'which' + * values. We may eventually remove this separation (and simplify the + * code further). + * + * Note that the printing of the error message is delayed so we can + * properly print the icmp error rate that the system was trying to do + * (i.e. 22000/100 pps, etc...). This can cause long delays in printing + * the 'final' error, but it doesn't make sense to solve the printing + * delay with more complex code. + */ + +int +badport_bandlim(int which) +{ + +#define N(a) (sizeof (a) / sizeof (a[0])) + static struct rate { + const char *type; + struct timeval lasttime; + int curpps; + } rates[BANDLIM_MAX+1] = { + { "icmp unreach response" }, + { "icmp ping response" }, + { "icmp tstamp response" }, + { "closed port RST response" }, + { "open port RST response" }, + { "icmp6 unreach response" } + }; + + /* + * Return ok status if feature disabled or argument out of range. + */ + if (V_icmplim > 0 && (u_int) which < N(rates)) { + struct rate *r = &rates[which]; + int opps = r->curpps; + + if (!ppsratecheck(&r->lasttime, &r->curpps, V_icmplim)) + return -1; /* discard packet */ + /* + * If we've dropped below the threshold after having + * rate-limited traffic print the message. This preserves + * the previous behaviour at the expense of added complexity. + */ + if (V_icmplim_output && opps > V_icmplim) + log(LOG_NOTICE, "Limiting %s from %d to %d packets/sec\n", + r->type, opps, V_icmplim); + } + return 0; /* okay to send packet */ +#undef N +} diff --git a/freebsd/sys/netinet/ip_icmp.h b/freebsd/sys/netinet/ip_icmp.h new file mode 100644 index 00000000..903f033d --- /dev/null +++ b/freebsd/sys/netinet/ip_icmp.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/ip_id.c b/freebsd/sys/netinet/ip_id.c new file mode 100644 index 00000000..ba99cdbb --- /dev/null +++ b/freebsd/sys/netinet/ip_id.c @@ -0,0 +1,211 @@ +#include + + +/*- + * Copyright (c) 2008 Michael J. Silbersack. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * IP ID generation is a fascinating topic. + * + * In order to avoid ID collisions during packet reassembly, common sense + * dictates that the period between reuse of IDs be as large as possible. + * This leads to the classic implementation of a system-wide counter, thereby + * ensuring that IDs repeat only once every 2^16 packets. + * + * Subsequent security researchers have pointed out that using a global + * counter makes ID values predictable. This predictability allows traffic + * analysis, idle scanning, and even packet injection in specific cases. + * These results suggest that IP IDs should be as random as possible. + * + * The "searchable queues" algorithm used in this IP ID implementation was + * proposed by Amit Klein. It is a compromise between the above two + * viewpoints that has provable behavior that can be tuned to the user's + * requirements. + * + * The basic concept is that we supplement a standard random number generator + * with a queue of the last L IDs that we have handed out to ensure that all + * IDs have a period of at least L. + * + * To efficiently implement this idea, we keep two data structures: a + * circular array of IDs of size L and a bitstring of 65536 bits. + * + * To start, we ask the RNG for a new ID. A quick index into the bitstring + * is used to determine if this is a recently used value. The process is + * repeated until a value is returned that is not in the bitstring. + * + * Having found a usable ID, we remove the ID stored at the current position + * in the queue from the bitstring and replace it with our new ID. Our new + * ID is then added to the bitstring and the queue pointer is incremented. + * + * The lower limit of 512 was chosen because there doesn't seem to be much + * point to having a smaller value. The upper limit of 32768 was chosen for + * two reasons. First, every step above 32768 decreases the entropy. Taken + * to an extreme, 65533 would offer 1 bit of entropy. Second, the number of + * attempts it takes the algorithm to find an unused ID drastically + * increases, killing performance. The default value of 8192 was chosen + * because it provides a good tradeoff between randomness and non-repetition. + * + * With L=8192, the queue will use 16K of memory. The bitstring always + * uses 8K of memory. No memory is allocated until the use of random ids is + * enabled. 
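+ *
+ * (Editorial illustration, not part of the imported sources: the
+ * queue/bitstring discipline described above can be sketched in plain C
+ * with <stdint.h> types roughly as below. The names example_ip_randomid()
+ * and example_rng16() are hypothetical, L is fixed at 8192, and locking
+ * and the zero-id exclusion used later in this file are omitted.)
+ */
+#if 0	/* editorial sketch only */
+static uint16_t	example_queue[8192];		/* last L ids handed out */
+static uint8_t	example_bits[65536 / 8];	/* 2^16-bit membership set */
+static int	example_ptr;			/* circular queue position */
+
+static uint16_t
+example_ip_randomid(void)
+{
+        uint16_t id, old;
+
+        /* Draw candidates until one outside the last-L window appears. */
+        do {
+                id = example_rng16();	/* assumed 16-bit random source */
+        } while (example_bits[id >> 3] & (1 << (id & 7)));
+
+        /* Evict the id that falls out of the window... */
+        old = example_queue[example_ptr];
+        example_bits[old >> 3] &= ~(1 << (old & 7));
+        /* ...and record the new one, guaranteeing a period of >= L. */
+        example_bits[id >> 3] |= 1 << (id & 7);
+        example_queue[example_ptr] = id;
+        example_ptr = (example_ptr + 1) % 8192;
+        return (id);
+}
+#endif
+/*
+ * (End of editorial sketch; the original file resumes.)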
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static MALLOC_DEFINE(M_IPID, "ipid", "randomized ip id state");
+
+static u_int16_t *id_array = NULL;
+static bitstr_t *id_bits = NULL;
+static int array_ptr = 0;
+static int array_size = 8192;
+static int random_id_collisions = 0;
+static int random_id_total = 0;
+static struct mtx ip_id_mtx;
+
+static void ip_initid(void);
+static int sysctl_ip_id_change(SYSCTL_HANDLER_ARGS);
+
+MTX_SYSINIT(ip_id_mtx, &ip_id_mtx, "ip_id_mtx", MTX_DEF);
+
+SYSCTL_DECL(_net_inet_ip);
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, random_id_period, CTLTYPE_INT|CTLFLAG_RW,
+    &array_size, 0, sysctl_ip_id_change, "IU", "IP ID Array size");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_collisions, CTLFLAG_RD,
+    &random_id_collisions, 0, "Count of IP ID collisions");
+SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id_total, CTLFLAG_RD,
+    &random_id_total, 0, "Count of IP IDs created");
+
+static int
+sysctl_ip_id_change(SYSCTL_HANDLER_ARGS)
+{
+        int error, new;
+
+        new = array_size;
+        error = sysctl_handle_int(oidp, &new, 0, req);
+        if (error == 0 && req->newptr) {
+                if (new >= 512 && new <= 32768) {
+                        mtx_lock(&ip_id_mtx);
+                        array_size = new;
+                        ip_initid();
+                        mtx_unlock(&ip_id_mtx);
+                } else
+                        error = EINVAL;
+        }
+        return (error);
+}
+
+/*
+ * ip_initid() runs with a mutex held and may execute in a network context.
+ * As a result, it uses M_NOWAIT. Ideally, we would always do this
+ * allocation from the sysctl context and have it be an invariant that if
+ * this random ID allocation mode is selected, the buffers are present. This
+ * would also avoid potential network context failures of IP ID generation.
+ */
+static void
+ip_initid(void)
+{
+
+        mtx_assert(&ip_id_mtx, MA_OWNED);
+
+        if (id_array != NULL) {
+                free(id_array, M_IPID);
+                free(id_bits, M_IPID);
+        }
+        random_id_collisions = 0;
+        random_id_total = 0;
+        array_ptr = 0;
+        id_array = (u_int16_t *) malloc(array_size * sizeof(u_int16_t),
+            M_IPID, M_NOWAIT | M_ZERO);
+        id_bits = (bitstr_t *) malloc(bitstr_size(65536), M_IPID,
+            M_NOWAIT | M_ZERO);
+        if (id_array == NULL || id_bits == NULL) {
+                /* Neither or both. */
+                if (id_array != NULL) {
+                        free(id_array, M_IPID);
+                        id_array = NULL;
+                }
+                if (id_bits != NULL) {
+                        free(id_bits, M_IPID);
+                        id_bits = NULL;
+                }
+        }
+}
+
+u_int16_t
+ip_randomid(void)
+{
+        u_int16_t new_id;
+
+        mtx_lock(&ip_id_mtx);
+        if (id_array == NULL)
+                ip_initid();
+
+        /*
+         * Fail gracefully; return a fixed id if memory allocation failed;
+         * ideally we wouldn't do allocation in this context in order to
+         * avoid the possibility of this failure mode.
+         */
+        if (id_array == NULL) {
+                mtx_unlock(&ip_id_mtx);
+                return (1);
+        }
+
+        /*
+         * To avoid a conflict with the zeros that the array is initially
+         * filled with, we never hand out an id of zero.
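+         * (Note: each extra pass through the do/while loop below is
+         * counted in the random_id_collisions sysctl; new_id starts at
+         * zero precisely so that the first draw is not counted.)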
+ */ + new_id = 0; + do { + if (new_id != 0) + random_id_collisions++; + arc4rand(&new_id, sizeof(new_id), 0); + } while (bit_test(id_bits, new_id) || new_id == 0); + bit_clear(id_bits, id_array[array_ptr]); + bit_set(id_bits, new_id); + id_array[array_ptr] = new_id; + array_ptr++; + if (array_ptr == array_size) + array_ptr = 0; + random_id_total++; + mtx_unlock(&ip_id_mtx); + return (new_id); +} diff --git a/freebsd/sys/netinet/ip_input.c b/freebsd/sys/netinet/ip_input.c new file mode 100644 index 00000000..3964e886 --- /dev/null +++ b/freebsd/sys/netinet/ip_input.c @@ -0,0 +1,1794 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef IPSEC +#include +#endif /* IPSEC */ + +#include + +#include + +#ifdef CTASSERT +CTASSERT(sizeof(struct ip) == 20); +#endif + +struct rwlock in_ifaddr_lock; +RW_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); + +VNET_DEFINE(int, rsvp_on); + +VNET_DEFINE(int, ipforwarding); +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_RW, + &VNET_NAME(ipforwarding), 0, + "Enable IP forwarding between interfaces"); + +static VNET_DEFINE(int, ipsendredirects) = 1; /* XXX */ +#define V_ipsendredirects VNET(ipsendredirects) +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, + &VNET_NAME(ipsendredirects), 0, + "Enable sending IP redirects"); + +VNET_DEFINE(int, ip_defttl) = IPDEFTTL; +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, + &VNET_NAME(ip_defttl), 0, + "Maximum TTL on IP packets"); + +static VNET_DEFINE(int, ip_keepfaith); +#define V_ip_keepfaith VNET(ip_keepfaith) +SYSCTL_VNET_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, + &VNET_NAME(ip_keepfaith), 0, + "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); + +static VNET_DEFINE(int, ip_sendsourcequench); +#define V_ip_sendsourcequench VNET(ip_sendsourcequench) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, sendsourcequench, CTLFLAG_RW, + &VNET_NAME(ip_sendsourcequench), 0, + "Enable the transmission of source quench packets"); + +VNET_DEFINE(int, ip_do_randomid); +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, + &VNET_NAME(ip_do_randomid), 0, + "Assign random ip_id values"); + +/* + * XXX - Setting ip_checkinterface mostly implements the receive side of + * the Strong ES model described in RFC 1122, but since the routing table + * and transmit implementation do not implement the Strong ES model, + * setting this to 1 results in an odd hybrid. + * + * XXX - ip_checkinterface currently must be disabled if you use ipnat + * to translate the destination address to another local interface. + * + * XXX - ip_checkinterface must be disabled if you add IP aliases + * to the loopback interface instead of the interface where the + * packets for those addresses are received. 
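+ * (Note: the corresponding test is the 'checkif' predicate computed in
+ * ip_input() below; it is skipped when forwarding is enabled, on loopback
+ * interfaces, for carp vhosts, and when a firewall has rewritten the
+ * destination.)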
+ */ +static VNET_DEFINE(int, ip_checkinterface); +#define V_ip_checkinterface VNET(ip_checkinterface) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, + &VNET_NAME(ip_checkinterface), 0, + "Verify packet arrives on correct interface"); + +VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ + +static struct netisr_handler ip_nh = { + .nh_name = "ip", + .nh_handler = ip_input, + .nh_proto = NETISR_IP, + .nh_policy = NETISR_POLICY_FLOW, +}; + +extern struct domain inetdomain; +extern struct protosw inetsw[]; +u_char ip_protox[IPPROTO_MAX]; +VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ +VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ +VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ + +VNET_DEFINE(struct ipstat, ipstat); +SYSCTL_VNET_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RW, + &VNET_NAME(ipstat), ipstat, + "IP statistics (struct ipstat, netinet/ip_var.h)"); + +static VNET_DEFINE(uma_zone_t, ipq_zone); +static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]); +static struct mtx ipqlock; + +#define V_ipq_zone VNET(ipq_zone) +#define V_ipq VNET(ipq) + +#define IPQ_LOCK() mtx_lock(&ipqlock) +#define IPQ_UNLOCK() mtx_unlock(&ipqlock) +#define IPQ_LOCK_INIT() mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF) +#define IPQ_LOCK_ASSERT() mtx_assert(&ipqlock, MA_OWNED) + +static void maxnipq_update(void); +static void ipq_zone_change(void *); +static void ip_drain_locked(void); + +static VNET_DEFINE(int, maxnipq); /* Administrative limit on # reass queues. */ +static VNET_DEFINE(int, nipq); /* Total # of reass queues */ +#define V_maxnipq VNET(maxnipq) +#define V_nipq VNET(nipq) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD, + &VNET_NAME(nipq), 0, + "Current number of IPv4 fragment reassembly queue entries"); + +static VNET_DEFINE(int, maxfragsperpacket); +#define V_maxfragsperpacket VNET(maxfragsperpacket) +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, + &VNET_NAME(maxfragsperpacket), 0, + "Maximum number of IPv4 fragments allowed per packet"); + +struct callout ipport_tick_callout; + +#ifdef IPCTL_DEFMTU +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, + &ip_mtu, 0, "Default MTU"); +#endif + +#ifdef IPSTEALTH +VNET_DEFINE(int, ipstealth); +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, + &VNET_NAME(ipstealth), 0, + "IP stealth mode, no TTL decrementation on forwarding"); +#endif + +#ifdef FLOWTABLE +static VNET_DEFINE(int, ip_output_flowtable_size) = 2048; +VNET_DEFINE(struct flowtable *, ip_ft); +#define V_ip_output_flowtable_size VNET(ip_output_flowtable_size) + +SYSCTL_VNET_INT(_net_inet_ip, OID_AUTO, output_flowtable_size, CTLFLAG_RDTUN, + &VNET_NAME(ip_output_flowtable_size), 2048, + "number of entries in the per-cpu output flow caches"); +#endif + +VNET_DEFINE(int, fw_one_pass) = 1; + +static void ip_freef(struct ipqhead *, struct ipq *); + +/* + * Kernel module interface for updating ipstat. The argument is an index + * into ipstat treated as an array of u_long. While this encodes the general + * layout of ipstat into the caller, it doesn't encode its location, so that + * future changes to add, for example, per-CPU stats support won't cause + * binary compatibility problems for kernel modules. 
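+ * (Note: callers are expected to derive the index from a field offset,
+ * e.g. kmod_ipstat_inc(offsetof(struct ipstat, ips_total) /
+ * sizeof(u_long)); in the FreeBSD tree this is wrapped by a
+ * KMOD_IPSTAT_INC()-style macro. The ips_total example is illustrative.)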
+ */ +void +kmod_ipstat_inc(int statnum) +{ + + (*((u_long *)&V_ipstat + statnum))++; +} + +void +kmod_ipstat_dec(int statnum) +{ + + (*((u_long *)&V_ipstat + statnum))--; +} + +static int +sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) +{ + int error, qlimit; + + netisr_getqlimit(&ip_nh, &qlimit); + error = sysctl_handle_int(oidp, &qlimit, 0, req); + if (error || !req->newptr) + return (error); + if (qlimit < 1) + return (EINVAL); + return (netisr_setqlimit(&ip_nh, qlimit)); +} +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", + "Maximum size of the IP input queue"); + +static int +sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) +{ + u_int64_t qdrops_long; + int error, qdrops; + + netisr_getqdrops(&ip_nh, &qdrops_long); + qdrops = qdrops_long; + error = sysctl_handle_int(oidp, &qdrops, 0, req); + if (error || !req->newptr) + return (error); + if (qdrops != 0) + return (EINVAL); + netisr_clearqdrops(&ip_nh); + return (0); +} + +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, + CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", + "Number of packets dropped from the IP input queue"); + +/* + * IP initialization: fill in IP protocol switch table. + * All protocols not implemented in kernel go to raw IP protocol handler. + */ +void +ip_init(void) +{ + struct protosw *pr; + int i; + + V_ip_id = time_second & 0xffff; + + TAILQ_INIT(&V_in_ifaddrhead); + V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); + + /* Initialize IP reassembly queue. */ + for (i = 0; i < IPREASS_NHASH; i++) + TAILQ_INIT(&V_ipq[i]); + V_maxnipq = nmbclusters / 32; + V_maxfragsperpacket = 16; + V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, + NULL, UMA_ALIGN_PTR, 0); + maxnipq_update(); + + /* Initialize packet filter hooks. */ + V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; + V_inet_pfil_hook.ph_af = AF_INET; + if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) + printf("%s: WARNING: unable to register pfil hook, " + "error %d\n", __func__, i); + +#ifdef FLOWTABLE + if (TUNABLE_INT_FETCH("net.inet.ip.output_flowtable_size", + &V_ip_output_flowtable_size)) { + if (V_ip_output_flowtable_size < 256) + V_ip_output_flowtable_size = 256; + if (!powerof2(V_ip_output_flowtable_size)) { + printf("flowtable must be power of 2 size\n"); + V_ip_output_flowtable_size = 2048; + } + } else { + /* + * round up to the next power of 2 + */ + V_ip_output_flowtable_size = 1 << fls((1024 + maxusers * 64)-1); + } + V_ip_ft = flowtable_alloc("ipv4", V_ip_output_flowtable_size, FL_PCPU); +#endif + + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == NULL) + panic("ip_init: PF_INET not found"); + + /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ + for (i = 0; i < IPPROTO_MAX; i++) + ip_protox[i] = pr - inetsw; + /* + * Cycle through IP protocols and put them into the appropriate place + * in ip_protox[]. + */ + for (pr = inetdomain.dom_protosw; + pr < inetdomain.dom_protoswNPROTOSW; pr++) + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { + /* Be careful to only index valid IP protocols. */ + if (pr->pr_protocol < IPPROTO_MAX) + ip_protox[pr->pr_protocol] = pr - inetsw; + } + + /* Start ipport_tick. 
*/ + callout_init(&ipport_tick_callout, CALLOUT_MPSAFE); + callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); + EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, + SHUTDOWN_PRI_DEFAULT); + EVENTHANDLER_REGISTER(nmbclusters_change, ipq_zone_change, + NULL, EVENTHANDLER_PRI_ANY); + + /* Initialize various other remaining things. */ + IPQ_LOCK_INIT(); + netisr_register(&ip_nh); +} + +#ifdef VIMAGE +void +ip_destroy(void) +{ + + /* Cleanup in_ifaddr hash table; should be empty. */ + hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); + + IPQ_LOCK(); + ip_drain_locked(); + IPQ_UNLOCK(); + + uma_zdestroy(V_ipq_zone); +} +#endif + +void +ip_fini(void *xtp) +{ + + callout_stop(&ipport_tick_callout); +} + +/* + * Ip input routine. Checksum and byte swap header. If fragmented + * try to reassemble. Process options. Pass to next level. + */ +void +ip_input(struct mbuf *m) +{ + struct ip *ip = NULL; + struct in_ifaddr *ia = NULL; + struct ifaddr *ifa; + struct ifnet *ifp; + int checkif, hlen = 0; + u_short sum; + int dchg = 0; /* dest changed after fw */ + struct in_addr odst; /* original dst address */ + + M_ASSERTPKTHDR(m); + + if (m->m_flags & M_FASTFWD_OURS) { + /* + * Firewall or NAT changed destination to local. + * We expect ip_len and ip_off to be in host byte order. + */ + m->m_flags &= ~M_FASTFWD_OURS; + /* Set up some basics that will be used later. */ + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + goto ours; + } + + IPSTAT_INC(ips_total); + + if (m->m_pkthdr.len < sizeof(struct ip)) + goto tooshort; + + if (m->m_len < sizeof (struct ip) && + (m = m_pullup(m, sizeof (struct ip))) == NULL) { + IPSTAT_INC(ips_toosmall); + return; + } + ip = mtod(m, struct ip *); + + if (ip->ip_v != IPVERSION) { + IPSTAT_INC(ips_badvers); + goto bad; + } + + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + IPSTAT_INC(ips_badhlen); + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == NULL) { + IPSTAT_INC(ips_badhlen); + return; + } + ip = mtod(m, struct ip *); + } + + /* 127/8 must not appear on wire - RFC1122 */ + ifp = m->m_pkthdr.rcvif; + if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + if ((ifp->if_flags & IFF_LOOPBACK) == 0) { + IPSTAT_INC(ips_badaddr); + goto bad; + } + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + IPSTAT_INC(ips_badsum); + goto bad; + } + +#ifdef ALTQ + if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) + /* packet is dropped by traffic conditioner */ + return; +#endif + + /* + * Convert fields to host representation. + */ + ip->ip_len = ntohs(ip->ip_len); + if (ip->ip_len < hlen) { + IPSTAT_INC(ips_badlen); + goto bad; + } + ip->ip_off = ntohs(ip->ip_off); + + /* + * Check that the amount of data in the buffers + * is as at least much as the IP header would have us expect. + * Trim mbufs if longer than we expect. + * Drop packet if shorter than we expect. 
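+         * (Note: in the trim case below m_adj() is called with a negative
+         * length, which trims from the tail of the chain; the single-mbuf
+         * case is handled by direct length assignment.)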
+ */ + if (m->m_pkthdr.len < ip->ip_len) { +tooshort: + IPSTAT_INC(ips_tooshort); + goto bad; + } + if (m->m_pkthdr.len > ip->ip_len) { + if (m->m_len == m->m_pkthdr.len) { + m->m_len = ip->ip_len; + m->m_pkthdr.len = ip->ip_len; + } else + m_adj(m, ip->ip_len - m->m_pkthdr.len); + } +#ifdef IPSEC + /* + * Bypass packet filtering for packets from a tunnel (gif). + */ + if (ip_ipsec_filtertunnel(m)) + goto passin; +#endif /* IPSEC */ + + /* + * Run through list of hooks for input packets. + * + * NB: Beware of the destination address changing (e.g. + * by NAT rewriting). When this happens, tell + * ip_forward to do the right thing. + */ + + /* Jump over all PFIL processing if hooks are not active. */ + if (!PFIL_HOOKED(&V_inet_pfil_hook)) + goto passin; + + odst = ip->ip_dst; + if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, NULL) != 0) + return; + if (m == NULL) /* consumed by filter */ + return; + + ip = mtod(m, struct ip *); + dchg = (odst.s_addr != ip->ip_dst.s_addr); + ifp = m->m_pkthdr.rcvif; + +#ifdef IPFIREWALL_FORWARD + if (m->m_flags & M_FASTFWD_OURS) { + m->m_flags &= ~M_FASTFWD_OURS; + goto ours; + } + if ((dchg = (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL)) != 0) { + /* + * Directly ship the packet on. This allows forwarding + * packets originally destined to us to some other directly + * connected host. + */ + ip_forward(m, dchg); + return; + } +#endif /* IPFIREWALL_FORWARD */ + +passin: + /* + * Process options and, if not destined for us, + * ship it on. ip_dooptions returns 1 when an + * error was detected (causing an icmp message + * to be sent and the original packet to be freed). + */ + if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) + return; + + /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no + * matter if it is destined to another node, or whether it is + * a multicast one, RSVP wants it! and prevents it from being forwarded + * anywhere else. Also checks if the rsvp daemon is running before + * grabbing the packet. + */ + if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) + goto ours; + + /* + * Check our list of addresses, to see if the packet is for us. + * If we don't have any addresses, assume any unicast packet + * we receive might be for us (and let the upper layers deal + * with it). + */ + if (TAILQ_EMPTY(&V_in_ifaddrhead) && + (m->m_flags & (M_MCAST|M_BCAST)) == 0) + goto ours; + + /* + * Enable a consistency check between the destination address + * and the arrival interface for a unicast packet (the RFC 1122 + * strong ES model) if IP forwarding is disabled and the packet + * is not locally generated and the packet is not subject to + * 'ipfw fwd'. + * + * XXX - Checking also should be disabled if the destination + * address is ipnat'ed to a different interface. + * + * XXX - Checking is incompatible with IP aliases added + * to the loopback interface instead of the interface where + * the packets are received. + * + * XXX - This is the case for carp vhost IPs as well so we + * insert a workaround. If the packet got here, we already + * checked with carp_iamatch() and carp_forus(). + */ + checkif = V_ip_checkinterface && (V_ipforwarding == 0) && + ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && + ifp->if_carp == NULL && (dchg == 0); + + /* + * Check for exact addresses in the hash bucket. + */ + /* IN_IFADDR_RLOCK(); */ + LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { + /* + * If the address matches, verify that the packet + * arrived via the correct interface if checking is + * enabled. 
+ */ + if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && + (!checkif || ia->ia_ifp == ifp)) { + ifa_ref(&ia->ia_ifa); + /* IN_IFADDR_RUNLOCK(); */ + goto ours; + } + } + /* IN_IFADDR_RUNLOCK(); */ + + /* + * Check for broadcast addresses. + * + * Only accept broadcast packets that arrive via the matching + * interface. Reception of forwarded directed broadcasts would + * be handled via ip_forward() and ether_output() with the loopback + * into the stack for SIMPLEX interfaces handled by ether_output(). + */ + if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { + IF_ADDR_LOCK(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ia = ifatoia(ifa); + if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == + ip->ip_dst.s_addr) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + goto ours; + } + if (ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + goto ours; + } +#ifdef BOOTP_COMPAT + if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { + ifa_ref(ifa); + IF_ADDR_UNLOCK(ifp); + goto ours; + } +#endif + } + IF_ADDR_UNLOCK(ifp); + ia = NULL; + } + /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ + if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + return; + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + if (V_ip_mrouter) { + /* + * If we are acting as a multicast router, all + * incoming multicast packets are passed to the + * kernel-level multicast forwarding function. + * The packet is returned (relatively) intact; if + * ip_mforward() returns a non-zero value, the packet + * must be discarded, else it may be accepted below. + */ + if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + return; + } + + /* + * The process-level routing daemon needs to receive + * all multicast IGMP packets, whether or not this + * host belongs to their destination groups. + */ + if (ip->ip_p == IPPROTO_IGMP) + goto ours; + IPSTAT_INC(ips_forward); + } + /* + * Assume the packet is for us, to avoid prematurely taking + * a lock on the in_multi hash. Protocols must perform + * their own filtering and update statistics accordingly. + */ + goto ours; + } + if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) + goto ours; + if (ip->ip_dst.s_addr == INADDR_ANY) + goto ours; + + /* + * FAITH(Firewall Aided Internet Translator) + */ + if (ifp && ifp->if_type == IFT_FAITH) { + if (V_ip_keepfaith) { + if (ip->ip_p == IPPROTO_TCP || ip->ip_p == IPPROTO_ICMP) + goto ours; + } + m_freem(m); + return; + } + + /* + * Not for us; forward if possible and desirable. + */ + if (V_ipforwarding == 0) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + } else { +#ifdef IPSEC + if (ip_ipsec_fwd(m)) + goto bad; +#endif /* IPSEC */ + ip_forward(m, dchg); + } + return; + +ours: +#ifdef IPSTEALTH + /* + * IPSTEALTH: Process non-routing options only + * if the packet is destined for us. + */ + if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) { + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return; + } +#endif /* IPSTEALTH */ + + /* Count the packet in the ip address stats */ + if (ia != NULL) { + ia->ia_ifa.if_ipackets++; + ia->ia_ifa.if_ibytes += m->m_pkthdr.len; + ifa_free(&ia->ia_ifa); + } + + /* + * Attempt reassembly; if it succeeds, proceed. + * ip_reass() will return a different mbuf. 
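+         * (Note: ip and hlen are reloaded below because the reassembled
+         * datagram begins in a different first mbuf.)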
+ */ + if (ip->ip_off & (IP_MF | IP_OFFMASK)) { + m = ip_reass(m); + if (m == NULL) + return; + ip = mtod(m, struct ip *); + /* Get the header length of the reassembled packet */ + hlen = ip->ip_hl << 2; + } + + /* + * Further protocols expect the packet length to be w/o the + * IP header. + */ + ip->ip_len -= hlen; + +#ifdef IPSEC + /* + * enforce IPsec policy checking if we are seeing last header. + * note that we do not visit this with protocols with pcb layer + * code - like udp/tcp/raw ip. + */ + if (ip_ipsec_input(m)) + goto bad; +#endif /* IPSEC */ + + /* + * Switch out to protocol's input routine. + */ + IPSTAT_INC(ips_delivered); + + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + return; +bad: + m_freem(m); +} + +/* + * After maxnipq has been updated, propagate the change to UMA. The UMA zone + * max has slightly different semantics than the sysctl, for historical + * reasons. + */ +static void +maxnipq_update(void) +{ + + /* + * -1 for unlimited allocation. + */ + if (V_maxnipq < 0) + uma_zone_set_max(V_ipq_zone, 0); + /* + * Positive number for specific bound. + */ + if (V_maxnipq > 0) + uma_zone_set_max(V_ipq_zone, V_maxnipq); + /* + * Zero specifies no further fragment queue allocation -- set the + * bound very low, but rely on implementation elsewhere to actually + * prevent allocation and reclaim current queues. + */ + if (V_maxnipq == 0) + uma_zone_set_max(V_ipq_zone, 1); +} + +static void +ipq_zone_change(void *tag) +{ + + if (V_maxnipq > 0 && V_maxnipq < (nmbclusters / 32)) { + V_maxnipq = nmbclusters / 32; + maxnipq_update(); + } +} + +static int +sysctl_maxnipq(SYSCTL_HANDLER_ARGS) +{ + int error, i; + + i = V_maxnipq; + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + return (error); + + /* + * XXXRW: Might be a good idea to sanity check the argument and place + * an extreme upper bound. + */ + if (i < -1) + return (EINVAL); + V_maxnipq = i; + maxnipq_update(); + return (0); +} + +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLTYPE_INT|CTLFLAG_RW, + NULL, 0, sysctl_maxnipq, "I", + "Maximum number of IPv4 fragment reassembly queue entries"); + +/* + * Take incoming datagram fragment and try to reassemble it into + * whole datagram. If the argument is the first fragment or one + * in between the function will return NULL and store the mbuf + * in the fragment chain. If the argument is the last fragment + * the packet will be reassembled and the pointer to the new + * mbuf returned for further processing. Only m_tags attached + * to the first packet/fragment are preserved. + * The IP header is *NOT* adjusted out of iplen. + */ +struct mbuf * +ip_reass(struct mbuf *m) +{ + struct ip *ip; + struct mbuf *p, *q, *nq, *t; + struct ipq *fp = NULL; + struct ipqhead *head; + int i, hlen, next; + u_int8_t ecn, ecn0; + u_short hash; + + /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ + if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { + IPSTAT_INC(ips_fragments); + IPSTAT_INC(ips_fragdropped); + m_freem(m); + return (NULL); + } + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + + hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); + head = &V_ipq[hash]; + IPQ_LOCK(); + + /* + * Look for queue of fragments + * of this datagram. 
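+         * (Note: the bucket was chosen by hashing the source address and
+         * ip_id; a queue matches only if id, source, destination and
+         * protocol all agree.)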
+ */ + TAILQ_FOREACH(fp, head, ipq_list) + if (ip->ip_id == fp->ipq_id && + ip->ip_src.s_addr == fp->ipq_src.s_addr && + ip->ip_dst.s_addr == fp->ipq_dst.s_addr && +#ifdef MAC + mac_ipq_match(m, fp) && +#endif + ip->ip_p == fp->ipq_p) + goto found; + + fp = NULL; + + /* + * Attempt to trim the number of allocated fragment queues if it + * exceeds the administrative limit. + */ + if ((V_nipq > V_maxnipq) && (V_maxnipq > 0)) { + /* + * Drop something from the tail of the current queue + * before proceeding further. + */ + struct ipq *q = TAILQ_LAST(head, ipqhead); + if (q == NULL) { /* gak */ + for (i = 0; i < IPREASS_NHASH; i++) { + struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead); + if (r) { + IPSTAT_ADD(ips_fragtimeout, + r->ipq_nfrags); + ip_freef(&V_ipq[i], r); + break; + } + } + } else { + IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags); + ip_freef(head, q); + } + } + +found: + /* + * Adjust ip_len to not reflect the header, + * convert the offset of this fragment to bytes. + */ + ip->ip_len -= hlen; + if (ip->ip_off & IP_MF) { + /* + * Make sure that fragments have a data length + * that's a non-zero multiple of 8 bytes. + */ + if (ip->ip_len == 0 || (ip->ip_len & 0x7) != 0) { + IPSTAT_INC(ips_toosmall); /* XXX */ + goto dropfrag; + } + m->m_flags |= M_FRAG; + } else + m->m_flags &= ~M_FRAG; + ip->ip_off <<= 3; + + /* + * Attempt reassembly; if it succeeds, proceed. + * ip_reass() will return a different mbuf. + */ + IPSTAT_INC(ips_fragments); + m->m_pkthdr.header = ip; + + /* Previous ip_reass() started here. */ + /* + * Presence of header sizes in mbufs + * would confuse code below. + */ + m->m_data += hlen; + m->m_len -= hlen; + + /* + * If first fragment to arrive, create a reassembly queue. + */ + if (fp == NULL) { + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (fp == NULL) + goto dropfrag; +#ifdef MAC + if (mac_ipq_init(fp, M_NOWAIT) != 0) { + uma_zfree(V_ipq_zone, fp); + fp = NULL; + goto dropfrag; + } + mac_ipq_create(m, fp); +#endif + TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_nipq++; + fp->ipq_nfrags = 1; + fp->ipq_ttl = IPFRAGTTL; + fp->ipq_p = ip->ip_p; + fp->ipq_id = ip->ip_id; + fp->ipq_src = ip->ip_src; + fp->ipq_dst = ip->ip_dst; + fp->ipq_frags = m; + m->m_nextpkt = NULL; + goto done; + } else { + fp->ipq_nfrags++; +#ifdef MAC + mac_ipq_update(m, fp); +#endif + } + +#define GETIP(m) ((struct ip*)((m)->m_pkthdr.header)) + + /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * Drop if CE and not-ECT are mixed for the same packet. + */ + ecn = ip->ip_tos & IPTOS_ECN_MASK; + ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) + goto dropfrag; + if (ecn0 != IPTOS_ECN_CE) + GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) + goto dropfrag; + + /* + * Find a segment which begins after this one does. + */ + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) + if (GETIP(q)->ip_off > ip->ip_off) + break; + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us; otherwise + * stick the new segment in the proper place. + * + * If some of the data is dropped from the preceding + * segment, then its checksum is invalidated.
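Aside (illustrative, not part of the patch): the ECN block above implements the reassembly rule of RFC 3168, section 5.3 -- a CE mark seen on any fragment must survive reassembly, and fragments that disagree about ECN capability (not-ECT vs. ECT/CE) force a drop. A compilable restatement using FreeBSD's IPTOS_ECN_* constants:

#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <assert.h>

/* Combine the ECN bits of fragment 0 (tos0) and a later fragment
 * (tos); returns -1 for "drop", else the TOS fragment 0 keeps. */
static int
ecn_combine(int tos0, int tos)
{
        int ecn0 = tos0 & IPTOS_ECN_MASK;
        int ecn = tos & IPTOS_ECN_MASK;

        if (ecn == IPTOS_ECN_CE) {
                if (ecn0 == IPTOS_ECN_NOTECT)
                        return (-1);            /* CE vs. not-ECT: drop */
                return (tos0 | IPTOS_ECN_CE);   /* CE must survive */
        }
        if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
                return (-1);                    /* not-ECT vs. ECT: drop */
        return (tos0);
}

int
main(void)
{
        assert(ecn_combine(IPTOS_ECN_ECT0, IPTOS_ECN_CE) == IPTOS_ECN_CE);
        assert(ecn_combine(IPTOS_ECN_NOTECT, IPTOS_ECN_CE) == -1);
        return (0);
}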
+ */ + if (p) { + i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off; + if (i > 0) { + if (i >= ip->ip_len) + goto dropfrag; + m_adj(m, i); + m->m_pkthdr.csum_flags = 0; + ip->ip_off += i; + ip->ip_len -= i; + } + m->m_nextpkt = p->m_nextpkt; + p->m_nextpkt = m; + } else { + m->m_nextpkt = fp->ipq_frags; + fp->ipq_frags = m; + } + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off; + q = nq) { + i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off; + if (i < GETIP(q)->ip_len) { + GETIP(q)->ip_len -= i; + GETIP(q)->ip_off += i; + m_adj(q, i); + q->m_pkthdr.csum_flags = 0; + break; + } + nq = q->m_nextpkt; + m->m_nextpkt = nq; + IPSTAT_INC(ips_fragdropped); + fp->ipq_nfrags--; + m_freem(q); + } + + /* + * Check for complete reassembly and perform frag per packet + * limiting. + * + * Frag limiting is performed here so that the nth frag has + * a chance to complete the packet before we drop the packet. + * As a result, n+1 frags are actually allowed per packet, but + * only n will ever be stored. (n = maxfragsperpacket.) + * + */ + next = 0; + for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { + if (GETIP(q)->ip_off != next) { + if (fp->ipq_nfrags > V_maxfragsperpacket) { + IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); + ip_freef(head, fp); + } + goto done; + } + next += GETIP(q)->ip_len; + } + /* Make sure the last packet didn't have the IP_MF flag */ + if (p->m_flags & M_FRAG) { + if (fp->ipq_nfrags > V_maxfragsperpacket) { + IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); + ip_freef(head, fp); + } + goto done; + } + + /* + * Reassembly is complete. Make sure the packet is a sane size. + */ + q = fp->ipq_frags; + ip = GETIP(q); + if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { + IPSTAT_INC(ips_toolong); + IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); + ip_freef(head, fp); + goto done; + } + + /* + * Concatenate fragments. + */ + m = q; + t = m->m_next; + m->m_next = NULL; + m_cat(m, t); + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + for (q = nq; q != NULL; q = nq) { + nq = q->m_nextpkt; + q->m_nextpkt = NULL; + m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags; + m->m_pkthdr.csum_data += q->m_pkthdr.csum_data; + m_cat(m, q); + } + /* + * In order to do checksumming faster we do 'end-around carry' here + * (and not in for{} loop), though it implies we are not going to + * reassemble more than 64k fragments. + */ + m->m_pkthdr.csum_data = + (m->m_pkthdr.csum_data & 0xffff) + (m->m_pkthdr.csum_data >> 16); +#ifdef MAC + mac_ipq_reassemble(fp, m); + mac_ipq_destroy(fp); +#endif + + /* + * Create header for new ip packet by modifying header of first + * packet; dequeue and discard fragment reassembly header. + * Make header visible. + */ + ip->ip_len = (ip->ip_hl << 2) + next; + ip->ip_src = fp->ipq_src; + ip->ip_dst = fp->ipq_dst; + TAILQ_REMOVE(head, fp, ipq_list); + V_nipq--; + uma_zfree(V_ipq_zone, fp); + m->m_len += (ip->ip_hl << 2); + m->m_data -= (ip->ip_hl << 2); + /* some debugging cruft by sklower, below, will go away soon */ + if (m->m_flags & M_PKTHDR) /* XXX this should be done elsewhere */ + m_fixhdr(m); + IPSTAT_INC(ips_reassembled); + IPQ_UNLOCK(); + return (m); + +dropfrag: + IPSTAT_INC(ips_fragdropped); + if (fp != NULL) + fp->ipq_nfrags--; + m_freem(m); +done: + IPQ_UNLOCK(); + return (NULL); + +#undef GETIP +} + +/* + * Free a fragment reassembly header and all + * associated datagrams. 
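Aside (illustrative, not part of the patch): the "end-around carry" above exploits the fact that one's-complement addition commutes with folding, so the per-fragment csum_data words can be accumulated as ordinary 32-bit integers and folded back into 16 bits once at the end. A self-contained demonstration (the partial sums are made-up values):

#include <assert.h>
#include <stdint.h>

static uint16_t
csum_fold(uint32_t sum)
{
        sum = (sum & 0xffff) + (sum >> 16);     /* fold carries... */
        sum = (sum & 0xffff) + (sum >> 16);     /* ...twice suffices */
        return ((uint16_t)sum);
}

int
main(void)
{
        /* Made-up per-fragment partial sums (csum_data values). */
        uint32_t partial[] = { 0x1fffe, 0x2abcd, 0x00042 };
        uint32_t acc = 0;

        for (int i = 0; i < 3; i++)
                acc += partial[i];      /* deferred fold, as above */

        /* Same result as folding after every step. */
        assert(csum_fold(acc) == csum_fold((uint32_t)csum_fold(partial[0]) +
            csum_fold(partial[1]) + csum_fold(partial[2])));
        return (0);
}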
+ */ +static void +ip_freef(struct ipqhead *fhp, struct ipq *fp) +{ + struct mbuf *q; + + IPQ_LOCK_ASSERT(); + + while (fp->ipq_frags) { + q = fp->ipq_frags; + fp->ipq_frags = q->m_nextpkt; + m_freem(q); + } + TAILQ_REMOVE(fhp, fp, ipq_list); + uma_zfree(V_ipq_zone, fp); + V_nipq--; +} + +/* + * IP timer processing; + * if a timer expires on a reassembly + * queue, discard it. + */ +void +ip_slowtimo(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct ipq *fp; + int i; + + VNET_LIST_RLOCK_NOSLEEP(); + IPQ_LOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + for (i = 0; i < IPREASS_NHASH; i++) { + for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) { + struct ipq *fpp; + + fpp = fp; + fp = TAILQ_NEXT(fp, ipq_list); + if(--fpp->ipq_ttl == 0) { + IPSTAT_ADD(ips_fragtimeout, + fpp->ipq_nfrags); + ip_freef(&V_ipq[i], fpp); + } + } + } + /* + * If we are over the maximum number of fragments + * (due to the limit being lowered), drain off + * enough to get down to the new limit. + */ + if (V_maxnipq >= 0 && V_nipq > V_maxnipq) { + for (i = 0; i < IPREASS_NHASH; i++) { + while (V_nipq > V_maxnipq && + !TAILQ_EMPTY(&V_ipq[i])) { + IPSTAT_ADD(ips_fragdropped, + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); + ip_freef(&V_ipq[i], + TAILQ_FIRST(&V_ipq[i])); + } + } + } + CURVNET_RESTORE(); + } + IPQ_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Drain off all datagram fragments. + */ +static void +ip_drain_locked(void) +{ + int i; + + IPQ_LOCK_ASSERT(); + + for (i = 0; i < IPREASS_NHASH; i++) { + while(!TAILQ_EMPTY(&V_ipq[i])) { + IPSTAT_ADD(ips_fragdropped, + TAILQ_FIRST(&V_ipq[i])->ipq_nfrags); + ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i])); + } + } +} + +void +ip_drain(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK_NOSLEEP(); + IPQ_LOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + ip_drain_locked(); + CURVNET_RESTORE(); + } + IPQ_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); + in_rtqdrain(); +} + +/* + * The protocol to be inserted into ip_protox[] must be already registered + * in inetsw[], either statically or through pf_proto_register(). + */ +int +ipproto_register(short ipproto) +{ + struct protosw *pr; + + /* Sanity checks. */ + if (ipproto <= 0 || ipproto >= IPPROTO_MAX) + return (EPROTONOSUPPORT); + + /* + * The protocol slot must not be occupied by another protocol + * already. An index pointing to IPPROTO_RAW is unused. + */ + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == NULL) + return (EPFNOSUPPORT); + if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ + return (EEXIST); + + /* Find the protocol position in inetsw[] and set the index. */ + for (pr = inetdomain.dom_protosw; + pr < inetdomain.dom_protoswNPROTOSW; pr++) { + if (pr->pr_domain->dom_family == PF_INET && + pr->pr_protocol && pr->pr_protocol == ipproto) { + ip_protox[pr->pr_protocol] = pr - inetsw; + return (0); + } + } + return (EPROTONOSUPPORT); +} + +int +ipproto_unregister(short ipproto) +{ + struct protosw *pr; + + /* Sanity checks. */ + if (ipproto <= 0 || ipproto >= IPPROTO_MAX) + return (EPROTONOSUPPORT); + + /* Check if the protocol was indeed registered. */ + pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); + if (pr == NULL) + return (EPFNOSUPPORT); + if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ + return (ENOENT); + + /* Reset the protocol slot to IPPROTO_RAW. */ + ip_protox[ipproto] = pr - inetsw; + return (0); +} + +/* + * Given address of next destination (final or next hop), return (referenced) + * internet address info of interface to be used to get there. 
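Aside (illustrative, not part of the patch): ipproto_register()/ipproto_unregister() above maintain ip_protox[], a 256-entry indirection table in which every unclaimed protocol number points at the raw-IP slot, so every protocol always has an input routine. A toy model of that dispatch scheme (the slot numbers are invented for the sketch; the kernel derives them by searching inetsw[]):

#include <assert.h>
#include <stdio.h>

#define IPPROTO_MAX     256
#define SLOT_RAW        0       /* invented slot numbers */
#define SLOT_TCP        1

static void raw_input(int p) { printf("raw input, proto %d\n", p); }
static void tcp_input(int p) { printf("tcp input, proto %d\n", p); }

static void (*sw[])(int) = { raw_input, tcp_input };
static unsigned char protox[IPPROTO_MAX];       /* all 0 == raw slot */

static int
proto_register(int ipproto, unsigned char slot)
{
        if (ipproto <= 0 || ipproto >= IPPROTO_MAX)
                return (-1);            /* EPROTONOSUPPORT */
        if (protox[ipproto] != SLOT_RAW)
                return (-1);            /* EEXIST */
        protox[ipproto] = slot;
        return (0);
}

int
main(void)
{
        assert(proto_register(6, SLOT_TCP) == 0);
        sw[protox[6]](6);       /* dispatches to tcp_input */
        sw[protox[89]](89);     /* unclaimed: falls back to raw */
        return (0);
}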
+ */ +struct in_ifaddr * +ip_rtaddr(struct in_addr dst, u_int fibnum) +{ + struct route sro; + struct sockaddr_in *sin; + struct in_ifaddr *ia; + + bzero(&sro, sizeof(sro)); + sin = (struct sockaddr_in *)&sro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = dst; + in_rtalloc_ign(&sro, 0, fibnum); + + if (sro.ro_rt == NULL) + return (NULL); + + ia = ifatoia(sro.ro_rt->rt_ifa); + ifa_ref(&ia->ia_ifa); + RTFREE(sro.ro_rt); + return (ia); +} + +u_char inetctlerrmap[PRC_NCMDS] = { + 0, 0, 0, 0, + 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, + EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, + EMSGSIZE, EHOSTUNREACH, 0, 0, + 0, 0, EHOSTUNREACH, 0, + ENOPROTOOPT, ECONNREFUSED +}; + +/* + * Forward a packet. If some error occurs, return the sender + * an icmp packet. Note we can't always generate a meaningful + * icmp message because icmp doesn't have a large enough repertoire + * of codes and types. + * + * If not forwarding, just drop the packet. This could be confusing + * if ipforwarding was zero but some routing protocol was advancing + * us as a gateway to somewhere. However, we must let the routing + * protocol deal with that. + * + * The srcrt parameter indicates whether the packet is being forwarded + * via a source route. + */ +void +ip_forward(struct mbuf *m, int srcrt) +{ + struct ip *ip = mtod(m, struct ip *); + struct in_ifaddr *ia; + struct mbuf *mcopy; + struct in_addr dest; + struct route ro; + int error, type = 0, code = 0, mtu = 0; + + if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { + IPSTAT_INC(ips_cantforward); + m_freem(m); + return; + } +#ifdef IPSTEALTH + if (!V_ipstealth) { +#endif + if (ip->ip_ttl <= IPTTLDEC) { + icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, + 0, 0); + return; + } +#ifdef IPSTEALTH + } +#endif + + ia = ip_rtaddr(ip->ip_dst, M_GETFIB(m)); +#ifndef IPSEC + /* + * 'ia' may be NULL if there is no route for this destination. + * In the case of IPsec, don't discard it just yet, but pass it to + * ip_output in case of outgoing IPsec policy. + */ + if (!srcrt && ia == NULL) { + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0); + return; + } +#endif + + /* + * Save the IP header and at most 8 bytes of the payload, + * in case we need to generate an ICMP message to the src. + * + * XXX this can be optimized a lot by saving the data in a local + * buffer on the stack (72 bytes at most), and only allocating the + * mbuf if really necessary. The vast majority of the packets + * are forwarded without having to send an ICMP back (either + * because unnecessary, or because rate limited), so we are + * really wasting a lot of work here. + * + * We don't use m_copy() because it might return a reference + * to a shared cluster. Both this function and ip_output() + * assume exclusive access to the IP header in `m', so any + * data in a cluster may change before we reach icmp_error(). + */ + MGETHDR(mcopy, M_DONTWAIT, m->m_type); + if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_DONTWAIT)) { + /* + * It's probably ok if the pkthdr dup fails (because + * the deep copy of the tag chain failed), but for now + * be conservative and just discard the copy since + * code below may some day want the tags.
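Aside (illustrative, not part of the patch): just below, ip_forward() decrements the TTL, and ip_output() later recomputes the header checksum in full. That decrement is also the textbook case for the incremental update of RFC 1624, HC' = ~(~HC + ~m + m'); a verifiable sketch with made-up header words (byte order is ignored here, since the arithmetic is the same either way):

#include <assert.h>
#include <stdint.h>

static uint32_t
fold(uint32_t s)
{
        s = (s & 0xffff) + (s >> 16);
        s = (s & 0xffff) + (s >> 16);
        return (s);
}

/* Full one's-complement checksum over n 16-bit words. */
static uint16_t
cksum(const uint16_t *w, int n)
{
        uint32_t s = 0;

        while (n-- > 0)
                s += *w++;
        return ((uint16_t)~fold(s));
}

int
main(void)
{
        /* Ten made-up header words; h[4] = (TTL << 8) | protocol,
         * h[5] is the checksum field. */
        uint16_t h[10] = { 0x4500, 0x0054, 0x1c46, 0x4000,
            0x4006, 0x0000, 0xc0a8, 0x0001, 0xc0a8, 0x00c7 };
        uint16_t m, mp, hc, hcp;

        h[5] = cksum(h, 10);
        hc = h[5];

        m = h[4];
        mp = m - 0x0100;                /* TTL -= IPTTLDEC (1) */
        hcp = (uint16_t)~fold((uint32_t)(uint16_t)~hc +
            (uint16_t)~m + mp);         /* RFC 1624, eqn. 3 */

        h[4] = mp;
        h[5] = 0;
        assert(hcp == cksum(h, 10));    /* matches full recompute */
        return (0);
}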
+ */ + m_free(mcopy); + mcopy = NULL; + } + if (mcopy != NULL) { + mcopy->m_len = min(ip->ip_len, M_TRAILINGSPACE(mcopy)); + mcopy->m_pkthdr.len = mcopy->m_len; + m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); + } + +#ifdef IPSTEALTH + if (!V_ipstealth) { +#endif + ip->ip_ttl -= IPTTLDEC; +#ifdef IPSTEALTH + } +#endif + + /* + * If forwarding packet using same interface that it came in on, + * perhaps should send a redirect to sender to shortcut a hop. + * Only send redirect if source is sending directly to us, + * and if packet was not source routed (or has any options). + * Also, don't send redirect if forwarding using a default route + * or a route modified by a redirect. + */ + dest.s_addr = 0; + if (!srcrt && V_ipsendredirects && + ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { + struct sockaddr_in *sin; + struct rtentry *rt; + + bzero(&ro, sizeof(ro)); + sin = (struct sockaddr_in *)&ro.ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + in_rtalloc_ign(&ro, 0, M_GETFIB(m)); + + rt = ro.ro_rt; + + if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && + satosin(rt_key(rt))->sin_addr.s_addr != 0) { +#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + u_long src = ntohl(ip->ip_src.s_addr); + + if (RTA(rt) && + (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest.s_addr = ip->ip_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; + } + } + if (rt) + RTFREE(rt); + } + + /* + * Try to cache the route MTU from ip_output so we can consider it for + * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191. + */ + bzero(&ro, sizeof(ro)); + + error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); + + if (error == EMSGSIZE && ro.ro_rt) + mtu = ro.ro_rt->rt_rmx.rmx_mtu; + if (ro.ro_rt) + RTFREE(ro.ro_rt); + + if (error) + IPSTAT_INC(ips_cantforward); + else { + IPSTAT_INC(ips_forward); + if (type) + IPSTAT_INC(ips_redirectsent); + else { + if (mcopy) + m_freem(mcopy); + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return; + } + } + if (mcopy == NULL) { + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return; + } + + switch (error) { + + case 0: /* forwarded, but need redirect */ + /* type, code set above */ + break; + + case ENETUNREACH: + case EHOSTUNREACH: + case ENETDOWN: + case EHOSTDOWN: + default: + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + break; + + case EMSGSIZE: + type = ICMP_UNREACH; + code = ICMP_UNREACH_NEEDFRAG; + +#ifdef IPSEC + /* + * If IPsec is configured for this path, + * override any possibly mtu value set by ip_output. + */ + mtu = ip_ipsec_mtu(mcopy, mtu); +#endif /* IPSEC */ + /* + * If the MTU was set before make sure we are below the + * interface MTU. + * If the MTU wasn't set before use the interface mtu or + * fall back to the next smaller mtu step compared to the + * current packet size. + */ + if (mtu != 0) { + if (ia != NULL) + mtu = min(mtu, ia->ia_ifp->if_mtu); + } else { + if (ia != NULL) + mtu = ia->ia_ifp->if_mtu; + else + mtu = ip_next_mtu(ip->ip_len, 0); + } + IPSTAT_INC(ips_cantfrag); + break; + + case ENOBUFS: + /* + * A router should not generate ICMP_SOURCEQUENCH as + * required in RFC1812 Requirements for IP Version 4 Routers. + * Source quench could be a big problem under DoS attacks, + * or if the underlying interface is rate-limited. 
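Aside (illustrative, not part of the patch): in the EMSGSIZE case above, when neither the route nor the interface supplies an MTU, ip_next_mtu() (implemented in ip_icmp.c) steps down to the next smaller plateau in the spirit of RFC 1191. Conceptually something like this (the table lists the common RFC 1191 plateaus, not a copy of the kernel's):

#include <assert.h>

static int
next_lower_mtu(int cur)
{
        static const int plateau[] = { 65535, 32000, 17914, 8166,
            4352, 2002, 1492, 1006, 508, 296, 68 };

        for (int i = 0; i < (int)(sizeof(plateau) / sizeof(plateau[0])); i++)
                if (plateau[i] < cur)
                        return (plateau[i]);
        return (68);            /* IPv4 minimum */
}

int
main(void)
{
        assert(next_lower_mtu(1500) == 1492);
        assert(next_lower_mtu(1492) == 1006);
        return (0);
}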
+ * Those who need source quench packets may re-enable them + * via the net.inet.ip.sendsourcequench sysctl. + */ + if (V_ip_sendsourcequench == 0) { + m_freem(mcopy); + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return; + } else { + type = ICMP_SOURCEQUENCH; + code = 0; + } + break; + + case EACCES: /* ipfw denied packet */ + m_freem(mcopy); + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return; + } + if (ia != NULL) + ifa_free(&ia->ia_ifa); + icmp_error(mcopy, type, code, dest.s_addr, mtu); +} + +void +ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, + struct mbuf *m) +{ + + if (inp->inp_socket->so_options & (SO_BINTIME | SO_TIMESTAMP)) { + struct bintime bt; + + bintime(&bt); + if (inp->inp_socket->so_options & SO_BINTIME) { + *mp = sbcreatecontrol((caddr_t) &bt, sizeof(bt), + SCM_BINTIME, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_socket->so_options & SO_TIMESTAMP) { + struct timeval tv; + + bintime2timeval(&bt, &tv); + *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET); + if (*mp) + mp = &(*mp)->m_next; + } + } + if (inp->inp_flags & INP_RECVDSTADDR) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + if (inp->inp_flags & INP_RECVTTL) { + *mp = sbcreatecontrol((caddr_t) &ip->ip_ttl, + sizeof(u_char), IP_RECVTTL, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#ifdef notyet + /* XXX + * Moving these out of udp_input() made them even more broken + * than they already were. + */ + /* options were tossed already */ + if (inp->inp_flags & INP_RECVOPTS) { + *mp = sbcreatecontrol((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + /* ip_srcroute doesn't do what we want here, need to fix */ + if (inp->inp_flags & INP_RECVRETOPTS) { + *mp = sbcreatecontrol((caddr_t) ip_srcroute(m), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +#endif + if (inp->inp_flags & INP_RECVIF) { + struct ifnet *ifp; + struct sdlbuf { + struct sockaddr_dl sdl; + u_char pad[32]; + } sdlbuf; + struct sockaddr_dl *sdp; + struct sockaddr_dl *sdl2 = &sdlbuf.sdl; + + if (((ifp = m->m_pkthdr.rcvif)) + && ( ifp->if_index && (ifp->if_index <= V_if_index))) { + sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; + /* + * Change our mind and don't try copy. + */ + if ((sdp->sdl_family != AF_LINK) + || (sdp->sdl_len > sizeof(sdlbuf))) { + goto makedummy; + } + bcopy(sdp, sdl2, sdp->sdl_len); + } else { +makedummy: + sdl2->sdl_len + = offsetof(struct sockaddr_dl, sdl_data[0]); + sdl2->sdl_family = AF_LINK; + sdl2->sdl_index = 0; + sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; + } + *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } +} + +/* + * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the + * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on + * locking. This code remains in ip_input.c as ip_mroute.c is optionally + * compiled. 
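Aside (illustrative, not part of the patch): the control mbufs that ip_savecontrol() builds above arrive in userland as ancillary data. Assuming a bound UDP socket, a receiver opts in with IP_RECVDSTADDR/IP_RECVTTL and walks the cmsg chain (error handling trimmed, port number arbitrary):

#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        char data[1500], cbuf[256];
        struct sockaddr_in sin;
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg;
        struct cmsghdr *cm;
        int s, on = 1;

        s = socket(AF_INET, SOCK_DGRAM, 0);
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(9999);             /* arbitrary port */
        bind(s, (struct sockaddr *)&sin, sizeof(sin));

        setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR, &on, sizeof(on));
        setsockopt(s, IPPROTO_IP, IP_RECVTTL, &on, sizeof(on));

        memset(&msg, 0, sizeof(msg));
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);
        if (recvmsg(s, &msg, 0) < 0)
                return (1);

        for (cm = CMSG_FIRSTHDR(&msg); cm != NULL;
            cm = CMSG_NXTHDR(&msg, cm)) {
                if (cm->cmsg_level != IPPROTO_IP)
                        continue;
                if (cm->cmsg_type == IP_RECVDSTADDR) {
                        struct in_addr dst;

                        memcpy(&dst, CMSG_DATA(cm), sizeof(dst));
                        printf("dst %s\n", inet_ntoa(dst));
                } else if (cm->cmsg_type == IP_RECVTTL) {
                        unsigned char ttl;

                        memcpy(&ttl, CMSG_DATA(cm), sizeof(ttl));
                        printf("ttl %u\n", ttl);
                }
        }
        close(s);
        return (0);
}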
+ */ +static VNET_DEFINE(int, ip_rsvp_on); +VNET_DEFINE(struct socket *, ip_rsvpd); + +#define V_ip_rsvp_on VNET(ip_rsvp_on) + +int +ip_rsvp_init(struct socket *so) +{ + + if (so->so_type != SOCK_RAW || + so->so_proto->pr_protocol != IPPROTO_RSVP) + return EOPNOTSUPP; + + if (V_ip_rsvpd != NULL) + return EADDRINUSE; + + V_ip_rsvpd = so; + /* + * This may seem silly, but we need to be sure we don't over-increment + * the RSVP counter, in case something slips up. + */ + if (!V_ip_rsvp_on) { + V_ip_rsvp_on = 1; + V_rsvp_on++; + } + + return 0; +} + +int +ip_rsvp_done(void) +{ + + V_ip_rsvpd = NULL; + /* + * This may seem silly, but we need to be sure we don't over-decrement + * the RSVP counter, in case something slips up. + */ + if (V_ip_rsvp_on) { + V_ip_rsvp_on = 0; + V_rsvp_on--; + } + return 0; +} + +void +rsvp_input(struct mbuf *m, int off) /* XXX must fixup manually */ +{ + + if (rsvp_input_p) { /* call the real one if loaded */ + rsvp_input_p(m, off); + return; + } + + /* Can still get packets with rsvp_on = 0 if there is a local member + * of the group to which the RSVP packet is addressed. But in this + * case we want to throw the packet away. + */ + + if (!V_rsvp_on) { + m_freem(m); + return; + } + + if (V_ip_rsvpd != NULL) { + rip_input(m, off); + return; + } + /* Drop the packet */ + m_freem(m); +} diff --git a/freebsd/sys/netinet/ip_ipsec.c b/freebsd/sys/netinet/ip_ipsec.c new file mode 100644 index 00000000..f19d5e0e --- /dev/null +++ b/freebsd/sys/netinet/ip_ipsec.c @@ -0,0 +1,424 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SCTP +#include +#endif + +#include + +#ifdef IPSEC +#include +#include +#include +#endif /*IPSEC*/ + +extern struct protosw inetsw[]; + +#ifdef IPSEC +#ifdef IPSEC_FILTERTUNNEL +static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 1; +#else +static VNET_DEFINE(int, ip4_ipsec_filtertunnel) = 0; +#endif +#define V_ip4_ipsec_filtertunnel VNET(ip4_ipsec_filtertunnel) + +SYSCTL_DECL(_net_inet_ipsec); +SYSCTL_VNET_INT(_net_inet_ipsec, OID_AUTO, filtertunnel, + CTLFLAG_RW, &VNET_NAME(ip4_ipsec_filtertunnel), 0, + "If set filter packets from an IPsec tunnel."); +#endif /* IPSEC */ + +/* + * Check if we have to jump over firewall processing for this packet. + * Called from ip_input(). + * 1 = jump over firewall, 0 = packet goes through firewall. + */ +int +ip_ipsec_filtertunnel(struct mbuf *m) +{ +#if defined(IPSEC) + + /* + * Bypass packet filtering for packets from a tunnel. + */ + if (!V_ip4_ipsec_filtertunnel && + m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL) + return 1; +#endif + return 0; +} + +/* + * Check if this packet has an active SA and needs to be dropped instead + * of forwarded. + * Called from ip_input(). + * 1 = drop packet, 0 = forward packet. + */ +int +ip_ipsec_fwd(struct mbuf *m) +{ +#ifdef IPSEC + struct m_tag *mtag; + struct tdb_ident *tdbi; + struct secpolicy *sp; + int s, error; + + mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); + s = splnet(); + if (mtag != NULL) { + tdbi = (struct tdb_ident *)(mtag + 1); + sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); + } else { + sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, + IP_FORWARDING, &error); + } + if (sp == NULL) { /* NB: can happen if error */ + splx(s); + /*XXX error stat???*/ + DPRINTF(("ip_input: no SP for forwarding\n")); /*XXX*/ + return 1; + } + + /* + * Check security policy against packet attributes. + */ + error = ipsec_in_reject(sp, m); + KEY_FREESP(&sp); + splx(s); + if (error) { + IPSTAT_INC(ips_cantforward); + return 1; + } +#endif /* IPSEC */ + return 0; +} + +/* + * Check if protocol type doesn't have a further header and do IPSEC + * decryption or reject right now. Protocols with further headers get + * their IPSEC treatment within the protocol specific processing. + * Called from ip_input(). + * 1 = drop packet, 0 = continue processing packet. + */ +int +ip_ipsec_input(struct mbuf *m) +{ +#ifdef IPSEC + struct ip *ip = mtod(m, struct ip *); + struct m_tag *mtag; + struct tdb_ident *tdbi; + struct secpolicy *sp; + int s, error; + /* + * enforce IPsec policy checking if we are seeing last header. + * note that we do not visit this with protocols with pcb layer + * code - like udp/tcp/raw ip. + */ + if ((inetsw[ip_protox[ip->ip_p]].pr_flags & PR_LASTHDR) != 0) { + /* + * Check if the packet has already had IPsec processing + * done. If so, then just pass it along. This tag gets + * set during AH, ESP, etc. input handling, before the + * packet is returned to the ip input queue for delivery. 
+ */ + mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL); + s = splnet(); + if (mtag != NULL) { + tdbi = (struct tdb_ident *)(mtag + 1); + sp = ipsec_getpolicy(tdbi, IPSEC_DIR_INBOUND); + } else { + sp = ipsec_getpolicybyaddr(m, IPSEC_DIR_INBOUND, + IP_FORWARDING, &error); + } + if (sp != NULL) { + /* + * Check security policy against packet attributes. + */ + error = ipsec_in_reject(sp, m); + KEY_FREESP(&sp); + } else { + /* XXX error stat??? */ + error = EINVAL; + DPRINTF(("ip_input: no SP, packet discarded\n"));/*XXX*/ + return 1; + } + splx(s); + if (error) + return 1; + } +#endif /* IPSEC */ + return 0; +} + +/* + * Compute the MTU for a forwarded packet that gets IPSEC encapsulated. + * Called from ip_forward(). + * Returns MTU suggestion for ICMP needfrag reply. + */ +int +ip_ipsec_mtu(struct mbuf *m, int mtu) +{ + /* + * If the packet is routed over IPsec tunnel, tell the + * originator the tunnel MTU. + * tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz + * XXX quickhack!!! + */ + struct secpolicy *sp = NULL; + int ipsecerror; + int ipsechdr; + struct route *ro; + sp = ipsec_getpolicybyaddr(m, + IPSEC_DIR_OUTBOUND, + IP_FORWARDING, + &ipsecerror); + if (sp != NULL) { + /* count IPsec header size */ + ipsechdr = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, NULL); + + /* + * find the correct route for outer IPv4 + * header, compute tunnel MTU. + */ + if (sp->req != NULL && + sp->req->sav != NULL && + sp->req->sav->sah != NULL) { + ro = &sp->req->sav->sah->route_cache.sa_route; + if (ro->ro_rt && ro->ro_rt->rt_ifp) { + mtu = + ro->ro_rt->rt_rmx.rmx_mtu ? + ro->ro_rt->rt_rmx.rmx_mtu : + ro->ro_rt->rt_ifp->if_mtu; + mtu -= ipsechdr; + } + } + KEY_FREESP(&sp); + } + return mtu; +} + +/* + * + * Called from ip_output(). + * 1 = drop packet, 0 = continue processing packet, + * -1 = packet was reinjected and stop processing packet + */ +int +ip_ipsec_output(struct mbuf **m, struct inpcb *inp, int *flags, int *error, + struct ifnet **ifp) +{ +#ifdef IPSEC + struct secpolicy *sp = NULL; + struct ip *ip = mtod(*m, struct ip *); + struct tdb_ident *tdbi; + struct m_tag *mtag; + int s; + /* + * Check the security policy (SP) for the packet and, if + * required, do IPsec-related processing. There are two + * cases here; the first time a packet is sent through + * it will be untagged and handled by ipsec4_checkpolicy. + * If the packet is resubmitted to ip_output (e.g. after + * AH, ESP, etc. processing), there will be a tag to bypass + * the lookup and related policy checking. 
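Aside (illustrative, not part of the patch): the MTU that ip_ipsec_mtu() above feeds into the ICMP needfrag reply is simply the outer path's MTU minus the ESP/AH encapsulation size, so that the sender's re-fragmented packets still fit once encapsulated. The arithmetic, with an invented overhead figure:

#include <assert.h>

/* Outer MTU preference: route MTU if set, else interface MTU;
 * then subtract the encapsulation overhead. */
static int
tunnel_mtu(int route_mtu, int if_mtu, int ipsec_hdrsiz)
{
        int outer = (route_mtu != 0) ? route_mtu : if_mtu;

        return (outer - ipsec_hdrsiz);
}

int
main(void)
{
        /* 1500-byte Ethernet, 57 bytes of assumed ESP overhead. */
        assert(tunnel_mtu(0, 1500, 57) == 1443);
        return (0);
}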
+ */ + mtag = m_tag_find(*m, PACKET_TAG_IPSEC_PENDING_TDB, NULL); + s = splnet(); + if (mtag != NULL) { + tdbi = (struct tdb_ident *)(mtag + 1); + sp = ipsec_getpolicy(tdbi, IPSEC_DIR_OUTBOUND); + if (sp == NULL) + *error = -EINVAL; /* force silent drop */ + m_tag_delete(*m, mtag); + } else { + sp = ipsec4_checkpolicy(*m, IPSEC_DIR_OUTBOUND, *flags, + error, inp); + } + /* + * There are four return cases: + * sp != NULL apply IPsec policy + * sp == NULL, error == 0 no IPsec handling needed + * sp == NULL, error == -EINVAL discard packet w/o error + * sp == NULL, error != 0 discard packet, report error + */ + if (sp != NULL) { + /* Loop detection, check if ipsec processing already done */ + KASSERT(sp->req != NULL, ("ip_output: no ipsec request")); + for (mtag = m_tag_first(*m); mtag != NULL; + mtag = m_tag_next(*m, mtag)) { + if (mtag->m_tag_cookie != MTAG_ABI_COMPAT) + continue; + if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE && + mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED) + continue; + /* + * Check if policy has an SA associated with it. + * This can happen when an SP has yet to acquire + * an SA; e.g. on first reference. If it occurs, + * then we let ipsec4_process_packet do its thing. + */ + if (sp->req->sav == NULL) + break; + tdbi = (struct tdb_ident *)(mtag + 1); + if (tdbi->spi == sp->req->sav->spi && + tdbi->proto == sp->req->sav->sah->saidx.proto && + bcmp(&tdbi->dst, &sp->req->sav->sah->saidx.dst, + sizeof (union sockaddr_union)) == 0) { + /* + * No IPsec processing is needed, free + * reference to SP. + * + * NB: null pointer to avoid free at + * done: below. + */ + KEY_FREESP(&sp), sp = NULL; + splx(s); + goto done; + } + } + + /* + * Do delayed checksums now because we send before + * this is done in the normal processing path. + */ + if ((*m)->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(*m); + (*m)->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +#ifdef SCTP + if ((*m)->m_pkthdr.csum_flags & CSUM_SCTP) { + sctp_delayed_cksum(*m, (uint32_t)(ip->ip_hl << 2)); + (*m)->m_pkthdr.csum_flags &= ~CSUM_SCTP; + } +#endif + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + + /* NB: callee frees mbuf */ + *error = ipsec4_process_packet(*m, sp->req, *flags, 0); + if (*error == EJUSTRETURN) { + /* + * We had a SP with a level of 'use' and no SA. We + * will just continue to process the packet without + * IPsec processing and return without error. + */ + *error = 0; + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + goto done; + } + /* + * Preserve KAME behaviour: ENOENT can be returned + * when an SA acquire is in progress. Don't propagate + * this to user-level; it confuses applications. + * + * XXX this will go away when the SADB is redone. + */ + if (*error == ENOENT) + *error = 0; + splx(s); + goto reinjected; + } else { /* sp == NULL */ + splx(s); + + if (*error != 0) { + /* + * Hack: -EINVAL is used to signal that a packet + * should be silently discarded. This is typically + * because we asked key management for an SA and + * it was delayed (e.g. kicked up to IKE). + */ + if (*error == -EINVAL) + *error = 0; + goto bad; + } else { + /* No IPsec processing for this packet. */ + } +#ifdef notyet + /* + * If deferred crypto processing is needed, check that + * the interface supports it. 
+ */ + mtag = m_tag_find(*m, PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED, NULL); + if (mtag != NULL && ifp != NULL && + ((*ifp)->if_capenable & IFCAP_IPSEC) == 0) { + /* notify IPsec to do its own crypto */ + ipsp_skipcrypto_unmark((struct tdb_ident *)(mtag + 1)); + *error = EHOSTUNREACH; + goto bad; + } +#endif + } +done: + if (sp != NULL) + KEY_FREESP(&sp); + return 0; +reinjected: + if (sp != NULL) + KEY_FREESP(&sp); + return -1; +bad: + if (sp != NULL) + KEY_FREESP(&sp); + return 1; +#endif /* IPSEC */ + return 0; +} diff --git a/freebsd/sys/netinet/ip_ipsec.h b/freebsd/sys/netinet/ip_ipsec.h new file mode 100644 index 00000000..c4de1652 --- /dev/null +++ b/freebsd/sys/netinet/ip_ipsec.h @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_IPSEC_HH_ +#define _NETINET_IP_IPSEC_HH_ + +int ip_ipsec_filtertunnel(struct mbuf *); +int ip_ipsec_fwd(struct mbuf *); +int ip_ipsec_input(struct mbuf *); +int ip_ipsec_mtu(struct mbuf *, int); +int ip_ipsec_output(struct mbuf **, struct inpcb *, int *, int *, + struct ifnet **); +#endif diff --git a/freebsd/sys/netinet/ip_mroute.c b/freebsd/sys/netinet/ip_mroute.c new file mode 100644 index 00000000..2f7676ad --- /dev/null +++ b/freebsd/sys/netinet/ip_mroute.c @@ -0,0 +1,2952 @@ +#include + +/*- + * Copyright (c) 1989 Stephen Deering + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 + */ + +/* + * IP multicast forwarding procedures + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Mark J. Steiglitz, Stanford, May, 1991 + * Modified by Van Jacobson, LBL, January 1993 + * Modified by Ajit Thyagarajan, PARC, August 1993 + * Modified by Bill Fenner, PARC, April 1995 + * Modified by Ahmed Helmy, SGI, June 1996 + * Modified by George Edmond Eddy (Rusty), ISI, February 1998 + * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 + * Modified by Hitoshi Asaeda, WIDE, August 2000 + * Modified by Pavlin Radoslavov, ICSI, October 2002 + * + * MROUTING Revision: 3.5 + * and PIM-SMv2 and PIM-DM support, advanced API support, + * bandwidth metering and signaling + */ + +/* + * TODO: Prefix functions with ipmf_. + * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol + * domain attachment (if_afdata) so we can track consumers of that service. + * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, + * move it to socket options. + * TODO: Cleanup LSRR removal further. + * TODO: Push RSVP stubs into raw_ip.c. + * TODO: Use bitstring.h for vif set. + * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. + * TODO: Sync ip6_mroute.c with this file. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#define _PIM_VT 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifndef KTR_IPMF +#define KTR_IPMF KTR_INET +#endif + +#define VIFI_INVALID ((vifi_t) -1) +#define M_HASCL(m) ((m)->m_flags & M_EXT) + +static VNET_DEFINE(uint32_t, last_tv_sec); /* last time we processed this */ +#define V_last_tv_sec VNET(last_tv_sec) + +static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); + +/* + * Locking. We use two locks: one for the virtual interface table and + * one for the forwarding table. 
These locks may be nested in which case + * the VIF lock must always be taken first. Note that each lock is used + * to cover not only the specific data structure but also related data + * structures. + */ + +static struct mtx mrouter_mtx; +#define MROUTER_LOCK() mtx_lock(&mrouter_mtx) +#define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) +#define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) +#define MROUTER_LOCK_INIT() \ + mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) +#define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) + +static int ip_mrouter_cnt; /* # of vnets with active mrouters */ +static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ + +static VNET_DEFINE(struct mrtstat, mrtstat); +#define V_mrtstat VNET(mrtstat) +SYSCTL_VNET_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, + &VNET_NAME(mrtstat), mrtstat, + "IPv4 Multicast Forwarding Statistics (struct mrtstat, " + "netinet/ip_mroute.h)"); + +static VNET_DEFINE(u_long, mfchash); +#define V_mfchash VNET(mfchash) +#define MFCHASH(a, g) \ + ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \ + ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash) +#define MFCHASHSIZE 256 + +static u_long mfchashsize; /* Hash size */ +static VNET_DEFINE(u_char *, nexpire); /* 0..mfchashsize-1 */ +#define V_nexpire VNET(nexpire) +static VNET_DEFINE(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); +#define V_mfchashtbl VNET(mfchashtbl) + +static struct mtx mfc_mtx; +#define MFC_LOCK() mtx_lock(&mfc_mtx) +#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) +#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) +#define MFC_LOCK_INIT() \ + mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) +#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) + +static VNET_DEFINE(vifi_t, numvifs); +#define V_numvifs VNET(numvifs) +static VNET_DEFINE(struct vif, viftable[MAXVIFS]); +#define V_viftable VNET(viftable) +SYSCTL_VNET_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, + &VNET_NAME(viftable), sizeof(V_viftable), "S,vif[MAXVIFS]", + "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); + +static struct mtx vif_mtx; +#define VIF_LOCK() mtx_lock(&vif_mtx) +#define VIF_UNLOCK() mtx_unlock(&vif_mtx) +#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) +#define VIF_LOCK_INIT() \ + mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) +#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) + +static eventhandler_tag if_detach_event_tag = NULL; + +static VNET_DEFINE(struct callout, expire_upcalls_ch); +#define V_expire_upcalls_ch VNET(expire_upcalls_ch) + +#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ +#define UPCALL_EXPIRE 6 /* number of timeouts */ + +/* + * Bandwidth meter variables and constants + */ +static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); +/* + * Pending timeouts are stored in a hash table, the key being the + * expiration time. Periodically, the entries are analysed and processed. 
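Aside (illustrative, not part of the patch): MFCHASH() above folds the origin and group addresses with shifted xors and masks with V_mfchash; since hashinit() stores size - 1 as the mask, MFCHASHSIZE must remain a power of two. The same computation, stand-alone:

#include <assert.h>
#include <stdint.h>

static uint32_t
mfc_hash(uint32_t origin, uint32_t group, uint32_t mask)
{
        return ((origin >> 20 ^ origin >> 10 ^ origin ^
            group >> 20 ^ group >> 10 ^ group) & mask);
}

int
main(void)
{
        /* 256 buckets, as with MFCHASHSIZE: the mask is 256 - 1. */
        assert(mfc_hash(0xc0a80101, 0xe0000101, 255) < 256);
        return (0);
}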
+ */ +#define BW_METER_BUCKETS 1024 +static VNET_DEFINE(struct bw_meter*, bw_meter_timers[BW_METER_BUCKETS]); +#define V_bw_meter_timers VNET(bw_meter_timers) +static VNET_DEFINE(struct callout, bw_meter_ch); +#define V_bw_meter_ch VNET(bw_meter_ch) +#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ + +/* + * Pending upcalls are stored in a vector which is flushed when + * full, or periodically + */ +static VNET_DEFINE(struct bw_upcall, bw_upcalls[BW_UPCALLS_MAX]); +#define V_bw_upcalls VNET(bw_upcalls) +static VNET_DEFINE(u_int, bw_upcalls_n); /* # of pending upcalls */ +#define V_bw_upcalls_n VNET(bw_upcalls_n) +static VNET_DEFINE(struct callout, bw_upcalls_ch); +#define V_bw_upcalls_ch VNET(bw_upcalls_ch) + +#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ + +static VNET_DEFINE(struct pimstat, pimstat); +#define V_pimstat VNET(pimstat) + +SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM"); +SYSCTL_VNET_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, + &VNET_NAME(pimstat), pimstat, + "PIM Statistics (struct pimstat, netinet/pim_var.h)"); + +static u_long pim_squelch_wholepkt = 0; +SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW, + &pim_squelch_wholepkt, 0, + "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); + +extern struct domain inetdomain; +static const struct protosw in_pim_protosw = { + .pr_type = SOCK_RAW, + .pr_domain = &inetdomain, + .pr_protocol = IPPROTO_PIM, + .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, + .pr_input = pim_input, + .pr_output = (pr_output_t*)rip_output, + .pr_ctloutput = rip_ctloutput, + .pr_usrreqs = &rip_usrreqs +}; +static const struct encaptab *pim_encap_cookie; + +static int pim_encapcheck(const struct mbuf *, int, int, void *); + +/* + * Note: the PIM Register encapsulation adds the following in front of a + * data packet: + * + * struct pim_encap_hdr { + * struct ip ip; + * struct pim_encap_pimhdr pim; + * } + * + */ + +struct pim_encap_pimhdr { + struct pim pim; + uint32_t flags; +}; +#define PIM_ENCAP_TTL 64 + +static struct ip pim_encap_iphdr = { +#if BYTE_ORDER == LITTLE_ENDIAN + sizeof(struct ip) >> 2, + IPVERSION, +#else + IPVERSION, + sizeof(struct ip) >> 2, +#endif + 0, /* tos */ + sizeof(struct ip), /* total length */ + 0, /* id */ + 0, /* frag offset */ + PIM_ENCAP_TTL, + IPPROTO_PIM, + 0, /* checksum */ +}; + +static struct pim_encap_pimhdr pim_encap_pimhdr = { + { + PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ + 0, /* reserved */ + 0, /* checksum */ + }, + 0 /* flags */ +}; + +static VNET_DEFINE(vifi_t, reg_vif_num) = VIFI_INVALID; +#define V_reg_vif_num VNET(reg_vif_num) +static VNET_DEFINE(struct ifnet, multicast_register_if); +#define V_multicast_register_if VNET(multicast_register_if) + +/* + * Private variables. 
+ */ + +static u_long X_ip_mcast_src(int); +static int X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +static int X_ip_mrouter_done(void); +static int X_ip_mrouter_get(struct socket *, struct sockopt *); +static int X_ip_mrouter_set(struct socket *, struct sockopt *); +static int X_legal_vif_num(int); +static int X_mrt_ioctl(u_long, caddr_t, int); + +static int add_bw_upcall(struct bw_upcall *); +static int add_mfc(struct mfcctl2 *); +static int add_vif(struct vifctl *); +static void bw_meter_prepare_upcall(struct bw_meter *, struct timeval *); +static void bw_meter_process(void); +static void bw_meter_receive_packet(struct bw_meter *, int, + struct timeval *); +static void bw_upcalls_send(void); +static int del_bw_upcall(struct bw_upcall *); +static int del_mfc(struct mfcctl2 *); +static int del_vif(vifi_t); +static int del_vif_locked(vifi_t); +static void expire_bw_meter_process(void *); +static void expire_bw_upcalls_send(void *); +static void expire_mfc(struct mfc *); +static void expire_upcalls(void *); +static void free_bw_list(struct bw_meter *); +static int get_sg_cnt(struct sioc_sg_req *); +static int get_vif_cnt(struct sioc_vif_req *); +static void if_detached_event(void *, struct ifnet *); +static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); +static int ip_mrouter_init(struct socket *, int); +static __inline struct mfc * + mfc_find(struct in_addr *, struct in_addr *); +static void phyint_send(struct ip *, struct vif *, struct mbuf *); +static struct mbuf * + pim_register_prepare(struct ip *, struct mbuf *); +static int pim_register_send(struct ip *, struct vif *, + struct mbuf *, struct mfc *); +static int pim_register_send_rp(struct ip *, struct vif *, + struct mbuf *, struct mfc *); +static int pim_register_send_upcall(struct ip *, struct vif *, + struct mbuf *, struct mfc *); +static void schedule_bw_meter(struct bw_meter *, struct timeval *); +static void send_packet(struct vif *, struct mbuf *); +static int set_api_config(uint32_t *); +static int set_assert(int); +static int socket_send(struct socket *, struct mbuf *, + struct sockaddr_in *); +static void unschedule_bw_meter(struct bw_meter *); + +/* + * Kernel multicast forwarding API capabilities and setup. + * If more API capabilities are added to the kernel, they should be + * recorded in `mrt_api_support'. + */ +#define MRT_API_VERSION 0x0305 + +static const int mrt_api_version = MRT_API_VERSION; +static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | + MRT_MFC_FLAGS_BORDER_VIF | + MRT_MFC_RP | + MRT_MFC_BW_UPCALL); +static VNET_DEFINE(uint32_t, mrt_api_config); +#define V_mrt_api_config VNET(mrt_api_config) +static VNET_DEFINE(int, pim_assert_enabled); +#define V_pim_assert_enabled VNET(pim_assert_enabled) +static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ + +/* + * Find a route for a given origin IP address and multicast group address. + * Statistics must be updated by the caller. + */ +static __inline struct mfc * +mfc_find(struct in_addr *o, struct in_addr *g) +{ + struct mfc *rt; + + MFC_LOCK_ASSERT(); + + LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { + if (in_hosteq(rt->mfc_origin, *o) && + in_hosteq(rt->mfc_mcastgrp, *g) && + TAILQ_EMPTY(&rt->mfc_stall)) + break; + } + + return (rt); +} + +/* + * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
+ */ +static int +X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) +{ + int error, optval; + vifi_t vifi; + struct vifctl vifc; + struct mfcctl2 mfc; + struct bw_upcall bw_upcall; + uint32_t i; + + if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) + return EPERM; + + error = 0; + switch (sopt->sopt_name) { + case MRT_INIT: + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + if (error) + break; + error = ip_mrouter_init(so, optval); + break; + + case MRT_DONE: + error = ip_mrouter_done(); + break; + + case MRT_ADD_VIF: + error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); + if (error) + break; + error = add_vif(&vifc); + break; + + case MRT_DEL_VIF: + error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); + if (error) + break; + error = del_vif(vifi); + break; + + case MRT_ADD_MFC: + case MRT_DEL_MFC: + /* + * select data size depending on API version. + */ + if (sopt->sopt_name == MRT_ADD_MFC && + V_mrt_api_config & MRT_API_FLAGS_ALL) { + error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), + sizeof(struct mfcctl2)); + } else { + error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), + sizeof(struct mfcctl)); + bzero((caddr_t)&mfc + sizeof(struct mfcctl), + sizeof(mfc) - sizeof(struct mfcctl)); + } + if (error) + break; + if (sopt->sopt_name == MRT_ADD_MFC) + error = add_mfc(&mfc); + else + error = del_mfc(&mfc); + break; + + case MRT_ASSERT: + error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + if (error) + break; + set_assert(optval); + break; + + case MRT_API_CONFIG: + error = sooptcopyin(sopt, &i, sizeof i, sizeof i); + if (!error) + error = set_api_config(&i); + if (!error) + error = sooptcopyout(sopt, &i, sizeof i); + break; + + case MRT_ADD_BW_UPCALL: + case MRT_DEL_BW_UPCALL: + error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, + sizeof bw_upcall); + if (error) + break; + if (sopt->sopt_name == MRT_ADD_BW_UPCALL) + error = add_bw_upcall(&bw_upcall); + else + error = del_bw_upcall(&bw_upcall); + break; + + default: + error = EOPNOTSUPP; + break; + } + return error; +} + +/* + * Handle MRT getsockopt commands + */ +static int +X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) +{ + int error; + + switch (sopt->sopt_name) { + case MRT_VERSION: + error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); + break; + + case MRT_ASSERT: + error = sooptcopyout(sopt, &V_pim_assert_enabled, + sizeof V_pim_assert_enabled); + break; + + case MRT_API_SUPPORT: + error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); + break; + + case MRT_API_CONFIG: + error = sooptcopyout(sopt, &V_mrt_api_config, sizeof V_mrt_api_config); + break; + + default: + error = EOPNOTSUPP; + break; + } + return error; +} + +/* + * Handle ioctl commands to obtain information from the cache + */ +static int +X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) +{ + int error = 0; + + /* + * Currently the only function calling this ioctl routine is rtioctl(). 
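Aside (illustrative, not part of the patch): the option switch above is driven from userland by a multicast routing daemon such as mrouted. The minimal life cycle -- become the router, add a virtual interface, later resign -- looks roughly like this (root required, error handling omitted, and the vif address is an invented example):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int s, v = 1;                   /* MRT API version 1 */
        struct vifctl vc;

        s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
        setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));

        memset(&vc, 0, sizeof(vc));
        vc.vifc_vifi = 0;               /* first vif slot */
        vc.vifc_threshold = 1;          /* min TTL to forward */
        vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
        setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

        /* ... read IGMP and upcalls, install routes with MRT_ADD_MFC,
         * answer MRT_ASSERT events, etc. ... */

        setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);
        close(s);
        return (0);
}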
+ * Typically, only root can create the raw socket in order to execute + * this ioctl method, however the request might be coming from a prison + */ + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error) + return (error); + switch (cmd) { + case (SIOCGETVIFCNT): + error = get_vif_cnt((struct sioc_vif_req *)data); + break; + + case (SIOCGETSGCNT): + error = get_sg_cnt((struct sioc_sg_req *)data); + break; + + default: + error = EINVAL; + break; + } + return error; +} + +/* + * returns the packet, byte, rpf-failure count for the source group provided + */ +static int +get_sg_cnt(struct sioc_sg_req *req) +{ + struct mfc *rt; + + MFC_LOCK(); + rt = mfc_find(&req->src, &req->grp); + if (rt == NULL) { + MFC_UNLOCK(); + req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; + return EADDRNOTAVAIL; + } + req->pktcnt = rt->mfc_pkt_cnt; + req->bytecnt = rt->mfc_byte_cnt; + req->wrong_if = rt->mfc_wrong_if; + MFC_UNLOCK(); + return 0; +} + +/* + * returns the input and output packet and byte counts on the vif provided + */ +static int +get_vif_cnt(struct sioc_vif_req *req) +{ + vifi_t vifi = req->vifi; + + VIF_LOCK(); + if (vifi >= V_numvifs) { + VIF_UNLOCK(); + return EINVAL; + } + + req->icount = V_viftable[vifi].v_pkt_in; + req->ocount = V_viftable[vifi].v_pkt_out; + req->ibytes = V_viftable[vifi].v_bytes_in; + req->obytes = V_viftable[vifi].v_bytes_out; + VIF_UNLOCK(); + + return 0; +} + +static void +if_detached_event(void *arg __unused, struct ifnet *ifp) +{ + vifi_t vifi; + int i; + + MROUTER_LOCK(); + + if (V_ip_mrouter == NULL) { + MROUTER_UNLOCK(); + return; + } + + VIF_LOCK(); + MFC_LOCK(); + + /* + * Tear down multicast forwarder state associated with this ifnet. + * 1. Walk the vif list, matching vifs against this ifnet. + * 2. Walk the multicast forwarding cache (mfc) looking for + * inner matches with this vif's index. + * 3. Expire any matching multicast forwarding cache entries. + * 4. Free vif state. This should disable ALLMULTI on the interface. + */ + for (vifi = 0; vifi < V_numvifs; vifi++) { + if (V_viftable[vifi].v_ifp != ifp) + continue; + for (i = 0; i < mfchashsize; i++) { + struct mfc *rt, *nrt; + for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { + nrt = LIST_NEXT(rt, mfc_hash); + if (rt->mfc_parent == vifi) { + expire_mfc(rt); + } + } + } + del_vif_locked(vifi); + } + + MFC_UNLOCK(); + VIF_UNLOCK(); + + MROUTER_UNLOCK(); +} + +/* + * Enable multicast forwarding. + */ +static int +ip_mrouter_init(struct socket *so, int version) +{ + + CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, + so->so_type, so->so_proto->pr_protocol); + + if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) + return EOPNOTSUPP; + + if (version != 1) + return ENOPROTOOPT; + + MROUTER_LOCK(); + + if (ip_mrouter_unloading) { + MROUTER_UNLOCK(); + return ENOPROTOOPT; + } + + if (V_ip_mrouter != NULL) { + MROUTER_UNLOCK(); + return EADDRINUSE; + } + + V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, + HASH_NOWAIT); + + callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, + curvnet); + callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, + curvnet); + callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, + curvnet); + + V_ip_mrouter = so; + ip_mrouter_cnt++; + + MROUTER_UNLOCK(); + + CTR1(KTR_IPMF, "%s: done", __func__); + + return 0; +} + +/* + * Disable multicast forwarding. 
+ */ +static int +X_ip_mrouter_done(void) +{ + vifi_t vifi; + int i; + struct ifnet *ifp; + struct ifreq ifr; + + MROUTER_LOCK(); + + if (V_ip_mrouter == NULL) { + MROUTER_UNLOCK(); + return EINVAL; + } + + /* + * Detach/disable hooks to the rest of the system. + */ + V_ip_mrouter = NULL; + ip_mrouter_cnt--; + V_mrt_api_config = 0; + + VIF_LOCK(); + + /* + * For each phyint in use, disable promiscuous reception of all IP + * multicasts. + */ + for (vifi = 0; vifi < V_numvifs; vifi++) { + if (!in_nullhost(V_viftable[vifi].v_lcl_addr) && + !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { + struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); + + so->sin_len = sizeof(struct sockaddr_in); + so->sin_family = AF_INET; + so->sin_addr.s_addr = INADDR_ANY; + ifp = V_viftable[vifi].v_ifp; + if_allmulti(ifp, 0); + } + } + bzero((caddr_t)V_viftable, sizeof(V_viftable)); + V_numvifs = 0; + V_pim_assert_enabled = 0; + + VIF_UNLOCK(); + + callout_stop(&V_expire_upcalls_ch); + callout_stop(&V_bw_upcalls_ch); + callout_stop(&V_bw_meter_ch); + + MFC_LOCK(); + + /* + * Free all multicast forwarding cache entries. + * Do not use hashdestroy(), as we must perform other cleanup. + */ + for (i = 0; i < mfchashsize; i++) { + struct mfc *rt, *nrt; + for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { + nrt = LIST_NEXT(rt, mfc_hash); + expire_mfc(rt); + } + } + free(V_mfchashtbl, M_MRTABLE); + V_mfchashtbl = NULL; + + bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); + + V_bw_upcalls_n = 0; + bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); + + MFC_UNLOCK(); + + V_reg_vif_num = VIFI_INVALID; + + MROUTER_UNLOCK(); + + CTR1(KTR_IPMF, "%s: done", __func__); + + return 0; +} + +/* + * Set PIM assert processing global + */ +static int +set_assert(int i) +{ + if ((i != 1) && (i != 0)) + return EINVAL; + + V_pim_assert_enabled = i; + + return 0; +} + +/* + * Configure API capabilities + */ +static int +set_api_config(uint32_t *apival) +{ + int i; + + /* + * We can set the API capabilities only if it is the first operation + * after MRT_INIT. I.e.: + * - there are no vifs installed + * - pim_assert is not enabled + * - the MFC table is empty + */ + if (V_numvifs > 0) { + *apival = 0; + return EPERM; + } + if (V_pim_assert_enabled) { + *apival = 0; + return EPERM; + } + + MFC_LOCK(); + + for (i = 0; i < mfchashsize; i++) { + if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { + MFC_UNLOCK(); + *apival = 0; + return EPERM; + } + } + + MFC_UNLOCK(); + + V_mrt_api_config = *apival & mrt_api_support; + *apival = V_mrt_api_config; + + return 0; +} + +/* + * Add a vif to the vif table + */ +static int +add_vif(struct vifctl *vifcp) +{ + struct vif *vifp = V_viftable + vifcp->vifc_vifi; + struct sockaddr_in sin = {sizeof sin, AF_INET}; + struct ifaddr *ifa; + struct ifnet *ifp; + int error; + + VIF_LOCK(); + if (vifcp->vifc_vifi >= MAXVIFS) { + VIF_UNLOCK(); + return EINVAL; + } + /* rate limiting is no longer supported by this code */ + if (vifcp->vifc_rate_limit != 0) { + log(LOG_ERR, "rate limiting is no longer supported\n"); + VIF_UNLOCK(); + return EINVAL; + } + if (!in_nullhost(vifp->v_lcl_addr)) { + VIF_UNLOCK(); + return EADDRINUSE; + } + if (in_nullhost(vifcp->vifc_lcl_addr)) { + VIF_UNLOCK(); + return EADDRNOTAVAIL; + } + + /* Find the interface with an address in AF_INET family */ + if (vifcp->vifc_flags & VIFF_REGISTER) { + /* + * XXX: Because VIFF_REGISTER does not really need a valid + * local interface (e.g. it could be 127.0.0.2), we don't + * check its address.
+ */ + ifp = NULL; + } else { + sin.sin_addr = vifcp->vifc_lcl_addr; + ifa = ifa_ifwithaddr((struct sockaddr *)&sin); + if (ifa == NULL) { + VIF_UNLOCK(); + return EADDRNOTAVAIL; + } + ifp = ifa->ifa_ifp; + ifa_free(ifa); + } + + if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { + CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); + VIF_UNLOCK(); + return EOPNOTSUPP; + } else if (vifcp->vifc_flags & VIFF_REGISTER) { + ifp = &V_multicast_register_if; + CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); + if (V_reg_vif_num == VIFI_INVALID) { + if_initname(&V_multicast_register_if, "register_vif", 0); + V_multicast_register_if.if_flags = IFF_LOOPBACK; + V_reg_vif_num = vifcp->vifc_vifi; + } + } else { /* Make sure the interface supports multicast */ + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + VIF_UNLOCK(); + return EOPNOTSUPP; + } + + /* Enable promiscuous reception of all IP multicasts from the if */ + error = if_allmulti(ifp, 1); + if (error) { + VIF_UNLOCK(); + return error; + } + } + + vifp->v_flags = vifcp->vifc_flags; + vifp->v_threshold = vifcp->vifc_threshold; + vifp->v_lcl_addr = vifcp->vifc_lcl_addr; + vifp->v_rmt_addr = vifcp->vifc_rmt_addr; + vifp->v_ifp = ifp; + /* initialize per vif pkt counters */ + vifp->v_pkt_in = 0; + vifp->v_pkt_out = 0; + vifp->v_bytes_in = 0; + vifp->v_bytes_out = 0; + bzero(&vifp->v_route, sizeof(vifp->v_route)); + + /* Adjust numvifs up if the vifi is higher than numvifs */ + if (V_numvifs <= vifcp->vifc_vifi) + V_numvifs = vifcp->vifc_vifi + 1; + + VIF_UNLOCK(); + + CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, + (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), + (int)vifcp->vifc_threshold); + + return 0; +} + +/* + * Delete a vif from the vif table + */ +static int +del_vif_locked(vifi_t vifi) +{ + struct vif *vifp; + + VIF_LOCK_ASSERT(); + + if (vifi >= V_numvifs) { + return EINVAL; + } + vifp = &V_viftable[vifi]; + if (in_nullhost(vifp->v_lcl_addr)) { + return EADDRNOTAVAIL; + } + + if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) + if_allmulti(vifp->v_ifp, 0); + + if (vifp->v_flags & VIFF_REGISTER) + V_reg_vif_num = VIFI_INVALID; + + bzero((caddr_t)vifp, sizeof (*vifp)); + + CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); + + /* Adjust numvifs down */ + for (vifi = V_numvifs; vifi > 0; vifi--) + if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr)) + break; + V_numvifs = vifi; + + return 0; +} + +static int +del_vif(vifi_t vifi) +{ + int cc; + + VIF_LOCK(); + cc = del_vif_locked(vifi); + VIF_UNLOCK(); + + return cc; +} + +/* + * update an mfc entry without resetting counters and S,G addresses. + */ +static void +update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) +{ + int i; + + rt->mfc_parent = mfccp->mfcc_parent; + for (i = 0; i < V_numvifs; i++) { + rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; + rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config & + MRT_MFC_FLAGS_ALL; + } + /* set the RP address */ + if (V_mrt_api_config & MRT_MFC_RP) + rt->mfc_rp = mfccp->mfcc_rp; + else + rt->mfc_rp.s_addr = INADDR_ANY; +} + +/* + * fully initialize an mfc entry from the parameter. 
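+ * + * (Added commentary, not part of the original source: add_mfc() below + * relies on this init/update split -- re-adding an existing (S,G) entry + * goes through update_mfc_params() alone, so the per-entry packet, byte + * and wrong_if counters survive a routing daemon refresh.)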
+ */ +static void +init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) +{ + rt->mfc_origin = mfccp->mfcc_origin; + rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; + + update_mfc_params(rt, mfccp); + + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + timevalclear(&rt->mfc_last_assert); +} + +static void +expire_mfc(struct mfc *rt) +{ + struct rtdetq *rte, *nrte; + + free_bw_list(rt->mfc_bw_meter); + + TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { + m_freem(rte->m); + TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); + free(rte, M_MRTABLE); + } + + LIST_REMOVE(rt, mfc_hash); + free(rt, M_MRTABLE); +} + +/* + * Add an mfc entry + */ +static int +add_mfc(struct mfcctl2 *mfccp) +{ + struct mfc *rt; + struct rtdetq *rte, *nrte; + u_long hash = 0; + u_short nstl; + + VIF_LOCK(); + MFC_LOCK(); + + rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); + + /* If an entry already exists, just update the fields */ + if (rt) { + CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", + __func__, inet_ntoa(mfccp->mfcc_origin), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent); + update_mfc_params(rt, mfccp); + MFC_UNLOCK(); + VIF_UNLOCK(); + return (0); + } + + /* + * Find the entry for which the upcall was made and update + */ + nstl = 0; + hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); + LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { + if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && + in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && + !TAILQ_EMPTY(&rt->mfc_stall)) { + CTR5(KTR_IPMF, + "%s: add mfc orig %s group %lx parent %x qh %p", + __func__, inet_ntoa(mfccp->mfcc_origin), + (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), + mfccp->mfcc_parent, + TAILQ_FIRST(&rt->mfc_stall)); + if (nstl++) + CTR1(KTR_IPMF, "%s: multiple matches", __func__); + + init_mfc_params(rt, mfccp); + rt->mfc_expire = 0; /* Don't clean this guy up */ + V_nexpire[hash]--; + + /* Free queued packets, but attempt to forward them first. 
*/ + TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { + if (rte->ifp != NULL) + ip_mdq(rte->m, rte->ifp, rt, -1); + m_freem(rte->m); + TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); + rt->mfc_nstall--; + free(rte, M_MRTABLE); + } + } + } + + /* + * It is possible that an entry is being inserted without an upcall + */ + if (nstl == 0) { + CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); + LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { + if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && + in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { + init_mfc_params(rt, mfccp); + if (rt->mfc_expire) + V_nexpire[hash]--; + rt->mfc_expire = 0; + break; /* XXX */ + } + } + + if (rt == NULL) { /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) { + MFC_UNLOCK(); + VIF_UNLOCK(); + return (ENOBUFS); + } + + init_mfc_params(rt, mfccp); + TAILQ_INIT(&rt->mfc_stall); + rt->mfc_nstall = 0; + + rt->mfc_expire = 0; + rt->mfc_bw_meter = NULL; + + /* insert new entry at head of hash chain */ + LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); + } + } + + MFC_UNLOCK(); + VIF_UNLOCK(); + + return (0); +} + +/* + * Delete an mfc entry + */ +static int +del_mfc(struct mfcctl2 *mfccp) +{ + struct in_addr origin; + struct in_addr mcastgrp; + struct mfc *rt; + + origin = mfccp->mfcc_origin; + mcastgrp = mfccp->mfcc_mcastgrp; + + CTR3(KTR_IPMF, "%s: delete mfc orig %s group %lx", __func__, + inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); + + MFC_LOCK(); + + rt = mfc_find(&origin, &mcastgrp); + if (rt == NULL) { + MFC_UNLOCK(); + return EADDRNOTAVAIL; + } + + /* + * free the bw_meter entries + */ + free_bw_list(rt->mfc_bw_meter); + rt->mfc_bw_meter = NULL; + + LIST_REMOVE(rt, mfc_hash); + free(rt, M_MRTABLE); + + MFC_UNLOCK(); + + return (0); +} + +/* + * Send a message to the routing daemon on the multicast routing socket. + */ +static int +socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) +{ + if (s) { + SOCKBUF_LOCK(&s->so_rcv); + if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, + NULL) != 0) { + sorwakeup_locked(s); + return 0; + } + SOCKBUF_UNLOCK(&s->so_rcv); + } + m_freem(mm); + return -1; +} + +/* + * IP multicast forwarding function. This function assumes that the packet + * pointed to by "ip" has arrived on (or is about to be sent to) the interface + * pointed to by "ifp", and the packet is to be relayed to other networks + * that have members of the packet's destination IP multicast group. + * + * The packet is returned unscathed to the caller, unless it is + * erroneous, in which case a non-zero return value tells the caller to + * discard it. + */ + +#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ + +static int +X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, + struct ip_moptions *imo) +{ + struct mfc *rt; + int error; + vifi_t vifi; + + CTR3(KTR_IPMF, "ip_mforward: src %s group %lx ifp %p", + inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); + + if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || + ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel or a register_vif. + */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported.
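+ * + * (Added commentary, not part of the original source: in the test above, + * (sizeof(struct ip) + TUNNEL_LEN) >> 2 is (20 + 12) / 4 = 8, so only + * packets whose IP header is at least 32 bytes long *and* whose option + * area carries IPOPT_LSRR at byte offset 1 are classified as legacy + * source-route tunnel packets and discarded via the non-zero return + * below.)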
+ */ + return (1); + } + + VIF_LOCK(); + MFC_LOCK(); + if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { + if (ip->ip_ttl < MAXTTL) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + error = ip_mdq(m, ifp, NULL, vifi); + MFC_UNLOCK(); + VIF_UNLOCK(); + return error; + } + + /* + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. + */ + if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { + MFC_UNLOCK(); + VIF_UNLOCK(); + return 0; + } + + /* + * Determine forwarding vifs from the forwarding cache table + */ + MRTSTAT_INC(mrts_mfc_lookups); + rt = mfc_find(&ip->ip_src, &ip->ip_dst); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + error = ip_mdq(m, ifp, rt, -1); + MFC_UNLOCK(); + VIF_UNLOCK(); + return error; + } else { + /* + * If we don't have a route for the packet's origin, + * make a copy of the packet and send a message to the routing daemon + */ + + struct mbuf *mb0; + struct rtdetq *rte; + u_long hash; + int hlen = ip->ip_hl << 2; + + MRTSTAT_INC(mrts_mfc_misses); + MRTSTAT_INC(mrts_no_route); + CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", + inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); + + /* + * Allocate mbufs early so that we don't do extra work if we are + * just going to fail anyway. Make sure to pullup the header so + * that other people can't step on it. + */ + rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, + M_NOWAIT|M_ZERO); + if (rte == NULL) { + MFC_UNLOCK(); + VIF_UNLOCK(); + return ENOBUFS; + } + + mb0 = m_copypacket(m, M_DONTWAIT); + if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) + mb0 = m_pullup(mb0, hlen); + if (mb0 == NULL) { + free(rte, M_MRTABLE); + MFC_UNLOCK(); + VIF_UNLOCK(); + return ENOBUFS; + } + + /* is there an upcall waiting for this flow? */ + hash = MFCHASH(ip->ip_src, ip->ip_dst); + LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { + if (in_hosteq(ip->ip_src, rt->mfc_origin) && + in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && + !TAILQ_EMPTY(&rt->mfc_stall)) + break; + } + + if (rt == NULL) { + int i; + struct igmpmsg *im; + struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + struct mbuf *mm; + + /* + * Locate the vifi for the incoming interface for this packet. + * If none found, drop packet.
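+ * + * Added commentary (not part of the original source): the IGMPMSG_NOCACHE + * upcall built below is what the daemon reads from its MRT_INIT socket; + * the expected reply is MRT_ADD_MFC, at which point add_mfc() drains the + * stall queue that is set up here. A daemon-side sketch, with error + * handling omitted and the read simplified -- the kernel formats the + * message so that it overlays the start of an IP packet: + * + *	struct igmpmsg im; + *	read(s, &im, sizeof(im)); + *	if (im.im_msgtype == IGMPMSG_NOCACHE) { + *		struct mfcctl mc = { .mfcc_origin = im.im_src, + *		    .mfcc_mcastgrp = im.im_dst, .mfcc_parent = im.im_vif }; + *		... fill mc.mfcc_ttls[] from the routing protocol ... + *		setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc)); + *	}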
+ */ + for (vifi = 0; vifi < V_numvifs && + V_viftable[vifi].v_ifp != ifp; vifi++) + ; + if (vifi >= V_numvifs) /* vif not found, drop packet */ + goto non_fatal; + + /* no upcall, so make a new entry */ + rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + if (rt == NULL) + goto fail; + + /* Make a copy of the header to send to the user level process */ + mm = m_copy(mb0, 0, hlen); + if (mm == NULL) + goto fail1; + + /* + * Send message to routing daemon to install + * a route into the kernel table + */ + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + im->im_vif = vifi; + + MRTSTAT_INC(mrts_upcalls); + + k_igmpsrc.sin_addr = ip->ip_src; + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { + CTR0(KTR_IPMF, "ip_mforward: socket queue full"); + MRTSTAT_INC(mrts_upq_sockfull); +fail1: + free(rt, M_MRTABLE); +fail: + free(rte, M_MRTABLE); + m_freem(mb0); + MFC_UNLOCK(); + VIF_UNLOCK(); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + V_nexpire[hash]++; + for (i = 0; i < V_numvifs; i++) { + rt->mfc_ttls[i] = 0; + rt->mfc_flags[i] = 0; + } + rt->mfc_parent = -1; + + /* clear the RP address */ + rt->mfc_rp.s_addr = INADDR_ANY; + rt->mfc_bw_meter = NULL; + + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + timevalclear(&rt->mfc_last_assert); + + TAILQ_INIT(&rt->mfc_stall); + rt->mfc_nstall = 0; + + /* link into table */ + LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); + TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); + rt->mfc_nstall++; + + } else { + /* determine if queue has overflowed */ + if (rt->mfc_nstall > MAX_UPQ) { + MRTSTAT_INC(mrts_upq_ovflw); +non_fatal: + free(rte, M_MRTABLE); + m_freem(mb0); + MFC_UNLOCK(); + VIF_UNLOCK(); + return (0); + } + TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); + rt->mfc_nstall++; + } + + rte->m = mb0; + rte->ifp = ifp; + + MFC_UNLOCK(); + VIF_UNLOCK(); + + return 0; + } +} + +/* + * Clean up the cache entry if upcall is not serviced + */ +static void +expire_upcalls(void *arg) +{ + int i; + + CURVNET_SET((struct vnet *) arg); + + MFC_LOCK(); + + for (i = 0; i < mfchashsize; i++) { + struct mfc *rt, *nrt; + + if (V_nexpire[i] == 0) + continue; + + for (rt = LIST_FIRST(&V_mfchashtbl[i]); rt; rt = nrt) { + nrt = LIST_NEXT(rt, mfc_hash); + + if (TAILQ_EMPTY(&rt->mfc_stall)) + continue; + + if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) + continue; + + /* + * free the bw_meter entries + */ + while (rt->mfc_bw_meter != NULL) { + struct bw_meter *x = rt->mfc_bw_meter; + + rt->mfc_bw_meter = x->bm_mfc_next; + free(x, M_BWMETER); + } + + MRTSTAT_INC(mrts_cache_cleanups); + CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, + (u_long)ntohl(rt->mfc_origin.s_addr), + (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); + + expire_mfc(rt); + } + } + + MFC_UNLOCK(); + + callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, + curvnet); + + CURVNET_RESTORE(); +} + +/* + * Packet forwarding routine once entry in the cache is made + */ +static int +ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) +{ + struct ip *ip = mtod(m, struct ip *); + vifi_t vifi; + int plen = ip->ip_len; + + VIF_LOCK_ASSERT(); + + /* + * If xmt_vif is not -1, send on only the requested vif. + * + * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) 
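+ * + * (Added note, not part of the original source: only the imo_multicast_vif + * path in X_ip_mforward() passes a real vif index here; the normal + * forwarding path and the stall-queue drain in add_mfc() both call + * ip_mdq(..., -1) and therefore use the per-vif forwarding loop below.)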
+ */ + if (xmt_vif < V_numvifs) { + if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER) + pim_register_send(ip, V_viftable + xmt_vif, m, rt); + else + phyint_send(ip, V_viftable + xmt_vif, m); + return 1; + } + + /* + * Don't forward if it didn't arrive from the parent vif for its origin. + */ + vifi = rt->mfc_parent; + if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) { + CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", + __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp); + MRTSTAT_INC(mrts_wrong_if); + ++rt->mfc_wrong_if; + /* + * If we are doing PIM assert processing, send a message + * to the routing daemon. + * + * XXX: A PIM-SM router needs the WRONGVIF detection so it + * can complete the SPT switch, regardless of the type + * of the iif (broadcast media, GRE tunnel, etc). + */ + if (V_pim_assert_enabled && (vifi < V_numvifs) && + V_viftable[vifi].v_ifp) { + + if (ifp == &V_multicast_register_if) + PIMSTAT_INC(pims_rcv_registers_wrongiif); + + /* Get vifi for the incoming packet */ + for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; + vifi++) + ; + if (vifi >= V_numvifs) + return 0; /* The iif is not found: ignore the packet. */ + + if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) + return 0; /* WRONGVIF disabled: ignore the packet */ + + if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { + struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + struct igmpmsg *im; + int hlen = ip->ip_hl << 2; + struct mbuf *mm = m_copy(m, 0, hlen); + + if (mm && (M_HASCL(mm) || mm->m_len < hlen)) + mm = m_pullup(mm, hlen); + if (mm == NULL) + return ENOBUFS; + + im = mtod(mm, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WRONGVIF; + im->im_mbz = 0; + im->im_vif = vifi; + + MRTSTAT_INC(mrts_upcalls); + + k_igmpsrc.sin_addr = im->im_src; + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { + CTR1(KTR_IPMF, "%s: socket queue full", __func__); + MRTSTAT_INC(mrts_upq_sockfull); + return ENOBUFS; + } + } + } + return 0; + } + + + /* If I sourced this packet, it counts as output, else it was input. */ + if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { + V_viftable[vifi].v_pkt_out++; + V_viftable[vifi].v_bytes_out += plen; + } else { + V_viftable[vifi].v_pkt_in++; + V_viftable[vifi].v_bytes_in += plen; + } + rt->mfc_pkt_cnt++; + rt->mfc_byte_cnt += plen; + + /* + * For each vif, decide if a copy of the packet should be forwarded. + * Forward if: + * - the ttl exceeds the vif's threshold + * - there are group members downstream on interface + */ + for (vifi = 0; vifi < V_numvifs; vifi++) + if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { + V_viftable[vifi].v_pkt_out++; + V_viftable[vifi].v_bytes_out += plen; + if (V_viftable[vifi].v_flags & VIFF_REGISTER) + pim_register_send(ip, V_viftable + vifi, m, rt); + else + phyint_send(ip, V_viftable + vifi, m); + } + + /* + * Perform upcall-related bw measuring. + */ + if (rt->mfc_bw_meter != NULL) { + struct bw_meter *x; + struct timeval now; + + microtime(&now); + MFC_LOCK_ASSERT(); + for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) + bw_meter_receive_packet(x, plen, &now); + } + + return 0; +} + +/* + * Check if a vif number is legal/ok. This is used by in_mcast.c. 
+ */ +static int +X_legal_vif_num(int vif) +{ + int ret; + + ret = 0; + if (vif < 0) + return (ret); + + VIF_LOCK(); + if (vif < V_numvifs) + ret = 1; + VIF_UNLOCK(); + + return (ret); +} + +/* + * Return the local address used by this vif + */ +static u_long +X_ip_mcast_src(int vifi) +{ + in_addr_t addr; + + addr = INADDR_ANY; + if (vifi < 0) + return (addr); + + VIF_LOCK(); + if (vifi < V_numvifs) + addr = V_viftable[vifi].v_lcl_addr.s_addr; + VIF_UNLOCK(); + + return (addr); +} + +static void +phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) +{ + struct mbuf *mb_copy; + int hlen = ip->ip_hl << 2; + + VIF_LOCK_ASSERT(); + + /* + * Make a new reference to the packet; make sure that + * the IP header is actually copied, not just referenced, + * so that ip_output() only scribbles on the copy. + */ + mb_copy = m_copypacket(m, M_DONTWAIT); + if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) + mb_copy = m_pullup(mb_copy, hlen); + if (mb_copy == NULL) + return; + + send_packet(vifp, mb_copy); +} + +static void +send_packet(struct vif *vifp, struct mbuf *m) +{ + struct ip_moptions imo; + struct in_multi *imm[2]; + int error; + + VIF_LOCK_ASSERT(); + + imo.imo_multicast_ifp = vifp->v_ifp; + imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo.imo_multicast_loop = 1; + imo.imo_multicast_vif = -1; + imo.imo_num_memberships = 0; + imo.imo_max_memberships = 2; + imo.imo_membership = &imm[0]; + + /* + * Re-entrancy should not be a problem here, because + * the packets that we send out and are looped back at us + * should get rejected because they appear to come from + * the loopback interface, thus preventing looping. + */ + error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL); + CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, + (ptrdiff_t)(vifp - V_viftable), error); +} + +/* + * Stubs for old RSVP socket shim implementation. 
+ */ + +static int +X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) +{ + + return (EOPNOTSUPP); +} + +static void +X_ip_rsvp_force_done(struct socket *so __unused) +{ + +} + +static void +X_rsvp_input(struct mbuf *m, int off __unused) +{ + + if (!V_rsvp_on) + m_freem(m); +} + +/* + * Code for bandwidth monitors + */ + +/* + * Define common interface for timeval-related methods + */ +#define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) +#define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) +#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) + +static uint32_t +compute_bw_meter_flags(struct bw_upcall *req) +{ + uint32_t flags = 0; + + if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) + flags |= BW_METER_UNIT_PACKETS; + if (req->bu_flags & BW_UPCALL_UNIT_BYTES) + flags |= BW_METER_UNIT_BYTES; + if (req->bu_flags & BW_UPCALL_GEQ) + flags |= BW_METER_GEQ; + if (req->bu_flags & BW_UPCALL_LEQ) + flags |= BW_METER_LEQ; + + return flags; +} + +/* + * Add a bw_meter entry + */ +static int +add_bw_upcall(struct bw_upcall *req) +{ + struct mfc *mfc; + struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, + BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; + struct timeval now; + struct bw_meter *x; + uint32_t flags; + + if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) + return EOPNOTSUPP; + + /* Test if the flags are valid */ + if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) + return EINVAL; + if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) + return EINVAL; + if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) + == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) + return EINVAL; + + /* Test if the threshold time interval is valid */ + if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) + return EINVAL; + + flags = compute_bw_meter_flags(req); + + /* + * Check whether the same bw_meter entry is already installed + */ + MFC_LOCK(); + mfc = mfc_find(&req->bu_src, &req->bu_dst); + if (mfc == NULL) { + MFC_UNLOCK(); + return EADDRNOTAVAIL; + } + for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { + if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, + &req->bu_threshold.b_time, ==)) && + (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && + (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && + (x->bm_flags & BW_METER_USER_FLAGS) == flags) { + MFC_UNLOCK(); + return 0; /* XXX Already installed */ + } + } + + /* Allocate the new bw_meter entry */ + x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); + if (x == NULL) { + MFC_UNLOCK(); + return ENOBUFS; + } + + /* Set the new bw_meter entry */ + x->bm_threshold.b_time = req->bu_threshold.b_time; + microtime(&now); + x->bm_start_time = now; + x->bm_threshold.b_packets = req->bu_threshold.b_packets; + x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; + x->bm_measured.b_packets = 0; + x->bm_measured.b_bytes = 0; + x->bm_flags = flags; + x->bm_time_next = NULL; + x->bm_time_hash = BW_METER_BUCKETS; + + /* Add the new bw_meter entry to the front of entries for this MFC */ + x->bm_mfc = mfc; + x->bm_mfc_next = mfc->mfc_bw_meter; + mfc->mfc_bw_meter = x; + schedule_bw_meter(x, &now); + MFC_UNLOCK(); + + return 0; +} + +static void +free_bw_list(struct bw_meter *list) +{ + while (list != NULL) { + struct bw_meter *x = list; + + list = list->bm_mfc_next; + unschedule_bw_meter(x); + free(x, M_BWMETER); + } +} + +/* + * Delete one or multiple bw_meter entries + */ +static int +del_bw_upcall(struct bw_upcall *req) +{ + struct mfc *mfc; + struct bw_meter *x; + + if (!(V_mrt_api_config &
MRT_MFC_BW_UPCALL)) + return EOPNOTSUPP; + + MFC_LOCK(); + + /* Find the corresponding MFC entry */ + mfc = mfc_find(&req->bu_src, &req->bu_dst); + if (mfc == NULL) { + MFC_UNLOCK(); + return EADDRNOTAVAIL; + } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { + /* + * Delete all bw_meter entries for this mfc + */ + struct bw_meter *list; + + list = mfc->mfc_bw_meter; + mfc->mfc_bw_meter = NULL; + free_bw_list(list); + MFC_UNLOCK(); + return 0; + } else { /* Delete a single bw_meter entry */ + struct bw_meter *prev; + uint32_t flags = 0; + + flags = compute_bw_meter_flags(req); + + /* Find the bw_meter entry to delete */ + for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; + prev = x, x = x->bm_mfc_next) { + if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, + &req->bu_threshold.b_time, ==)) && + (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && + (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && + (x->bm_flags & BW_METER_USER_FLAGS) == flags) + break; + } + if (x != NULL) { /* Delete entry from the list for this MFC */ + if (prev != NULL) + prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ + else + x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ + + unschedule_bw_meter(x); + MFC_UNLOCK(); + /* Free the bw_meter entry */ + free(x, M_BWMETER); + return 0; + } else { + MFC_UNLOCK(); + return EINVAL; + } + } + /* NOTREACHED */ +} + +/* + * Perform bandwidth measurement processing that may result in an upcall + */ +static void +bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) +{ + struct timeval delta; + + MFC_LOCK_ASSERT(); + + delta = *nowp; + BW_TIMEVALDECR(&delta, &x->bm_start_time); + + if (x->bm_flags & BW_METER_GEQ) { + /* + * Processing for ">=" type of bw_meter entry + */ + if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { + /* Reset the bw_meter entry */ + x->bm_start_time = *nowp; + x->bm_measured.b_packets = 0; + x->bm_measured.b_bytes = 0; + x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; + } + + /* Record that a packet is received */ + x->bm_measured.b_packets++; + x->bm_measured.b_bytes += plen; + + /* + * Test if we should deliver an upcall + */ + if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { + if (((x->bm_flags & BW_METER_UNIT_PACKETS) && + (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || + ((x->bm_flags & BW_METER_UNIT_BYTES) && + (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { + /* Prepare an upcall for delivery */ + bw_meter_prepare_upcall(x, nowp); + x->bm_flags |= BW_METER_UPCALL_DELIVERED; + } + } + } else if (x->bm_flags & BW_METER_LEQ) { + /* + * Processing for "<=" type of bw_meter entry + */ + if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { + /* + * We are behind time with the multicast forwarding table + * scanning for "<=" type of bw_meter entries, so test now + * if we should deliver an upcall. 
+ */ + if (((x->bm_flags & BW_METER_UNIT_PACKETS) && + (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || + ((x->bm_flags & BW_METER_UNIT_BYTES) && + (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { + /* Prepare an upcall for delivery */ + bw_meter_prepare_upcall(x, nowp); + } + /* Reschedule the bw_meter entry */ + unschedule_bw_meter(x); + schedule_bw_meter(x, nowp); + } + + /* Record that a packet is received */ + x->bm_measured.b_packets++; + x->bm_measured.b_bytes += plen; + + /* + * Test if we should restart the measuring interval + */ + if ((x->bm_flags & BW_METER_UNIT_PACKETS && + x->bm_measured.b_packets <= x->bm_threshold.b_packets) || + (x->bm_flags & BW_METER_UNIT_BYTES && + x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { + /* Don't restart the measuring interval */ + } else { + /* Do restart the measuring interval */ + /* + * XXX: note that we don't unschedule and schedule, because this + * might be too much overhead per packet. Instead, when we process + * all entries for a given timer hash bin, we check whether it is + * really a timeout. If not, we reschedule at that time. + */ + x->bm_start_time = *nowp; + x->bm_measured.b_packets = 0; + x->bm_measured.b_bytes = 0; + x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; + } + } +} + +/* + * Prepare a bandwidth-related upcall + */ +static void +bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) +{ + struct timeval delta; + struct bw_upcall *u; + + MFC_LOCK_ASSERT(); + + /* + * Compute the measured time interval + */ + delta = *nowp; + BW_TIMEVALDECR(&delta, &x->bm_start_time); + + /* + * If there are too many pending upcalls, deliver them now + */ + if (V_bw_upcalls_n >= BW_UPCALLS_MAX) + bw_upcalls_send(); + + /* + * Set the bw_upcall entry + */ + u = &V_bw_upcalls[V_bw_upcalls_n++]; + u->bu_src = x->bm_mfc->mfc_origin; + u->bu_dst = x->bm_mfc->mfc_mcastgrp; + u->bu_threshold.b_time = x->bm_threshold.b_time; + u->bu_threshold.b_packets = x->bm_threshold.b_packets; + u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; + u->bu_measured.b_time = delta; + u->bu_measured.b_packets = x->bm_measured.b_packets; + u->bu_measured.b_bytes = x->bm_measured.b_bytes; + u->bu_flags = 0; + if (x->bm_flags & BW_METER_UNIT_PACKETS) + u->bu_flags |= BW_UPCALL_UNIT_PACKETS; + if (x->bm_flags & BW_METER_UNIT_BYTES) + u->bu_flags |= BW_UPCALL_UNIT_BYTES; + if (x->bm_flags & BW_METER_GEQ) + u->bu_flags |= BW_UPCALL_GEQ; + if (x->bm_flags & BW_METER_LEQ) + u->bu_flags |= BW_UPCALL_LEQ; +} + +/* + * Send the pending bandwidth-related upcalls + */ +static void +bw_upcalls_send(void) +{ + struct mbuf *m; + int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); + struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + static struct igmpmsg igmpmsg = { 0, /* unused1 */ + 0, /* unused2 */ + IGMPMSG_BW_UPCALL,/* im_msgtype */ + 0, /* im_mbz */ + 0, /* im_vif */ + 0, /* unused3 */ + { 0 }, /* im_src */ + { 0 } }; /* im_dst */ + + MFC_LOCK_ASSERT(); + + if (V_bw_upcalls_n == 0) + return; /* No pending upcalls */ + + V_bw_upcalls_n = 0; + + /* + * Allocate a new mbuf, initialize it with the header and + * the payload for the pending calls. + */ + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); + return; + } + + m->m_len = m->m_pkthdr.len = 0; + m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); + m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); + + /* + * Send the upcalls + * XXX do we need to set the address in k_igmpsrc ? 
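+ * + * Added commentary (not part of the original source): the mbuf built + * here is a struct igmpmsg with im_msgtype == IGMPMSG_BW_UPCALL, + * immediately followed by up to BW_UPCALLS_MAX struct bw_upcall records, + * so a daemon that has read len bytes into buf can walk it as: + * + *	struct igmpmsg *im = (struct igmpmsg *)buf; + *	struct bw_upcall *bu = (struct bw_upcall *)(im + 1); + *	int n = (len - sizeof(*im)) / sizeof(*bu); + *	for (; n > 0; n--, bu++) + *		... process one bw_upcall ...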
+ */ + MRTSTAT_INC(mrts_upcalls); + if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { + log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); + MRTSTAT_INC(mrts_upq_sockfull); + } +} + +/* + * Compute the timeout hash value for the bw_meter entries + */ +#define BW_METER_TIMEHASH(bw_meter, hash) \ + do { \ + struct timeval next_timeval = (bw_meter)->bm_start_time; \ + \ + BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ + (hash) = next_timeval.tv_sec; \ + if (next_timeval.tv_usec) \ + (hash)++; /* XXX: make sure we don't timeout early */ \ + (hash) %= BW_METER_BUCKETS; \ + } while (0) + +/* + * Schedule a timer to process periodically bw_meter entry of type "<=" + * by linking the entry in the proper hash bucket. + */ +static void +schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) +{ + int time_hash; + + MFC_LOCK_ASSERT(); + + if (!(x->bm_flags & BW_METER_LEQ)) + return; /* XXX: we schedule timers only for "<=" entries */ + + /* + * Reset the bw_meter entry + */ + x->bm_start_time = *nowp; + x->bm_measured.b_packets = 0; + x->bm_measured.b_bytes = 0; + x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; + + /* + * Compute the timeout hash value and insert the entry + */ + BW_METER_TIMEHASH(x, time_hash); + x->bm_time_next = V_bw_meter_timers[time_hash]; + V_bw_meter_timers[time_hash] = x; + x->bm_time_hash = time_hash; +} + +/* + * Unschedule the periodic timer that processes bw_meter entry of type "<=" + * by removing the entry from the proper hash bucket. + */ +static void +unschedule_bw_meter(struct bw_meter *x) +{ + int time_hash; + struct bw_meter *prev, *tmp; + + MFC_LOCK_ASSERT(); + + if (!(x->bm_flags & BW_METER_LEQ)) + return; /* XXX: we schedule timers only for "<=" entries */ + + /* + * Compute the timeout hash value and delete the entry + */ + time_hash = x->bm_time_hash; + if (time_hash >= BW_METER_BUCKETS) + return; /* Entry was not scheduled */ + + for (prev = NULL, tmp = V_bw_meter_timers[time_hash]; + tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) + if (tmp == x) + break; + + if (tmp == NULL) + panic("unschedule_bw_meter: bw_meter entry not found"); + + if (prev != NULL) + prev->bm_time_next = x->bm_time_next; + else + V_bw_meter_timers[time_hash] = x->bm_time_next; + + x->bm_time_next = NULL; + x->bm_time_hash = BW_METER_BUCKETS; +} + + +/* + * Process all "<=" type of bw_meter that should be processed now, + * and for each entry prepare an upcall if necessary. Each processed + * entry is rescheduled again for the (periodic) processing. + * + * This is run periodically (once per second normally). On each round, + * all the potentially matching entries are in the hash slot that we are + * looking at. + */ +static void +bw_meter_process() +{ + uint32_t loops; + int i; + struct timeval now, process_endtime; + + microtime(&now); + if (V_last_tv_sec == now.tv_sec) + return; /* nothing to do */ + + loops = now.tv_sec - V_last_tv_sec; + V_last_tv_sec = now.tv_sec; + if (loops > BW_METER_BUCKETS) + loops = BW_METER_BUCKETS; + + MFC_LOCK(); + /* + * Process all bins of bw_meter entries from the one after the last + * processed to the current one. On entry, i points to the last bucket + * visited, so we need to increment i at the beginning of the loop. 
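+ * + * (Added worked example, not part of the original source: with + * BW_METER_TIMEHASH above, a meter whose interval started at t = 100.3s + * with a 3s threshold gets next_timeval = 103.3s; the non-zero tv_usec + * rounds the hash up to 104, so the entry lands in bucket + * 104 % BW_METER_BUCKETS and is not visited before its interval has + * really expired.)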
+ */ + for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { + struct bw_meter *x, *tmp_list; + + if (++i >= BW_METER_BUCKETS) + i = 0; + + /* Disconnect the list of bw_meter entries from the bin */ + tmp_list = V_bw_meter_timers[i]; + V_bw_meter_timers[i] = NULL; + + /* Process the list of bw_meter entries */ + while (tmp_list != NULL) { + x = tmp_list; + tmp_list = tmp_list->bm_time_next; + + /* Test if the time interval is over */ + process_endtime = x->bm_start_time; + BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); + if (BW_TIMEVALCMP(&process_endtime, &now, >)) { + /* Not yet: reschedule, but don't reset */ + int time_hash; + + BW_METER_TIMEHASH(x, time_hash); + if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { + /* + * XXX: somehow the bin processing is a bit ahead of time. + * Put the entry in the next bin. + */ + if (++time_hash >= BW_METER_BUCKETS) + time_hash = 0; + } + x->bm_time_next = V_bw_meter_timers[time_hash]; + V_bw_meter_timers[time_hash] = x; + x->bm_time_hash = time_hash; + + continue; + } + + /* + * Test if we should deliver an upcall + */ + if (((x->bm_flags & BW_METER_UNIT_PACKETS) && + (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || + ((x->bm_flags & BW_METER_UNIT_BYTES) && + (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { + /* Prepare an upcall for delivery */ + bw_meter_prepare_upcall(x, &now); + } + + /* + * Reschedule for next processing + */ + schedule_bw_meter(x, &now); + } + } + + /* Send all upcalls that are pending delivery */ + bw_upcalls_send(); + + MFC_UNLOCK(); +} + +/* + * A periodic function for sending all upcalls that are pending delivery + */ +static void +expire_bw_upcalls_send(void *arg) +{ + CURVNET_SET((struct vnet *) arg); + + MFC_LOCK(); + bw_upcalls_send(); + MFC_UNLOCK(); + + callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, + curvnet); + CURVNET_RESTORE(); +} + +/* + * A periodic function for periodic scanning of the multicast forwarding + * table for processing all "<=" bw_meter entries. + */ +static void +expire_bw_meter_process(void *arg) +{ + CURVNET_SET((struct vnet *) arg); + + if (V_mrt_api_config & MRT_MFC_BW_UPCALL) + bw_meter_process(); + + callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, + curvnet); + CURVNET_RESTORE(); +} + +/* + * End of bandwidth monitoring code + */ + +/* + * Send the packet up to the user daemon, or eventually do kernel encapsulation + * + */ +static int +pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, + struct mfc *rt) +{ + struct mbuf *mb_copy, *mm; + + /* + * Do not send IGMP_WHOLEPKT notifications to userland, if the + * rendezvous point was unspecified, and we were told not to. + */ + if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) && + in_nullhost(rt->mfc_rp)) + return 0; + + mb_copy = pim_register_prepare(ip, m); + if (mb_copy == NULL) + return ENOBUFS; + + /* + * Send all the fragments. Note that the mbuf for each fragment + * is freed by the sending machinery. + */ + for (mm = mb_copy; mm; mm = mb_copy) { + mb_copy = mm->m_nextpkt; + mm->m_nextpkt = 0; + mm = m_pullup(mm, sizeof(struct ip)); + if (mm != NULL) { + ip = mtod(mm, struct ip *); + if ((V_mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { + pim_register_send_rp(ip, vifp, mm, rt); + } else { + pim_register_send_upcall(ip, vifp, mm, rt); + } + } + } + + return 0; +} + +/* + * Return a copy of the data packet that is ready for PIM Register + * encapsulation. 
+ * XXX: Note that in the returned copy the IP header is a valid one. + */ +static struct mbuf * +pim_register_prepare(struct ip *ip, struct mbuf *m) +{ + struct mbuf *mb_copy = NULL; + int mtu; + + /* Take care of delayed checksums */ + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + + /* + * Copy the old packet & pullup its IP header into the + * new mbuf so we can modify it. + */ + mb_copy = m_copypacket(m, M_DONTWAIT); + if (mb_copy == NULL) + return NULL; + mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); + if (mb_copy == NULL) + return NULL; + + /* take care of the TTL */ + ip = mtod(mb_copy, struct ip *); + --ip->ip_ttl; + + /* Compute the MTU after the PIM Register encapsulation */ + mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); + + if (ip->ip_len <= mtu) { + /* Turn the IP header into a valid one */ + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); + } else { + /* Fragment the packet */ + if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { + m_freem(mb_copy); + return NULL; + } + } + return mb_copy; +} + +/* + * Send an upcall with the data packet to the user-level process. + */ +static int +pim_register_send_upcall(struct ip *ip, struct vif *vifp, + struct mbuf *mb_copy, struct mfc *rt) +{ + struct mbuf *mb_first; + int len = ntohs(ip->ip_len); + struct igmpmsg *im; + struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + + VIF_LOCK_ASSERT(); + + /* + * Add a new mbuf with an upcall header + */ + MGETHDR(mb_first, M_DONTWAIT, MT_DATA); + if (mb_first == NULL) { + m_freem(mb_copy); + return ENOBUFS; + } + mb_first->m_data += max_linkhdr; + mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); + mb_first->m_len = sizeof(struct igmpmsg); + mb_first->m_next = mb_copy; + + /* Send message to routing daemon */ + im = mtod(mb_first, struct igmpmsg *); + im->im_msgtype = IGMPMSG_WHOLEPKT; + im->im_mbz = 0; + im->im_vif = vifp - V_viftable; + im->im_src = ip->ip_src; + im->im_dst = ip->ip_dst; + + k_igmpsrc.sin_addr = ip->ip_src; + + MRTSTAT_INC(mrts_upcalls); + + if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) { + CTR1(KTR_IPMF, "%s: socket queue full", __func__); + MRTSTAT_INC(mrts_upq_sockfull); + return ENOBUFS; + } + + /* Keep statistics */ + PIMSTAT_INC(pims_snd_registers_msgs); + PIMSTAT_ADD(pims_snd_registers_bytes, len); + + return 0; +} + +/* + * Encapsulate the data packet in PIM Register message and send it to the RP. 
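+ * + * Added commentary (not part of the original source): the frame built + * below is laid out as + * + *	outer struct ip		-- pim_encap_iphdr; ip_src is the parent + *				   vif's local address, ip_dst is the RP + *	pim_encap_pimhdr	-- PIM Register; Border bit set when the + *				   parent vif is a border vif + *	inner packet		-- the original multicast datagram, TTL + *				   already decremented in + *				   pim_register_prepare()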
+ */ +static int +pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, + struct mfc *rt) +{ + struct mbuf *mb_first; + struct ip *ip_outer; + struct pim_encap_pimhdr *pimhdr; + int len = ntohs(ip->ip_len); + vifi_t vifi = rt->mfc_parent; + + VIF_LOCK_ASSERT(); + + if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { + m_freem(mb_copy); + return EADDRNOTAVAIL; /* The iif vif is invalid */ + } + + /* + * Add a new mbuf with the encapsulating header + */ + MGETHDR(mb_first, M_DONTWAIT, MT_DATA); + if (mb_first == NULL) { + m_freem(mb_copy); + return ENOBUFS; + } + mb_first->m_data += max_linkhdr; + mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); + mb_first->m_next = mb_copy; + + mb_first->m_pkthdr.len = len + mb_first->m_len; + + /* + * Fill in the encapsulating IP and PIM header + */ + ip_outer = mtod(mb_first, struct ip *); + *ip_outer = pim_encap_iphdr; + ip_outer->ip_id = ip_newid(); + ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); + ip_outer->ip_src = V_viftable[vifi].v_lcl_addr; + ip_outer->ip_dst = rt->mfc_rp; + /* + * Copy the inner header TOS to the outer header, and take care of the + * IP_DF bit. + */ + ip_outer->ip_tos = ip->ip_tos; + if (ntohs(ip->ip_off) & IP_DF) + ip_outer->ip_off |= IP_DF; + pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer + + sizeof(pim_encap_iphdr)); + *pimhdr = pim_encap_pimhdr; + /* If the iif crosses a border, set the Border-bit */ + if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config) + pimhdr->flags |= htonl(PIM_BORDER_REGISTER); + + mb_first->m_data += sizeof(pim_encap_iphdr); + pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); + mb_first->m_data -= sizeof(pim_encap_iphdr); + + send_packet(vifp, mb_first); + + /* Keep statistics */ + PIMSTAT_INC(pims_snd_registers_msgs); + PIMSTAT_ADD(pims_snd_registers_bytes, len); + + return 0; +} + +/* + * pim_encapcheck() is called by the encap4_input() path at runtime to + * determine if a packet is for PIM, allowing PIM to be dynamically loaded + * into the kernel. + */ +static int +pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) +{ + +#ifdef DIAGNOSTIC + KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); +#endif + if (proto != IPPROTO_PIM) + return 0; /* not for us; reject the datagram. */ + + return 64; /* claim the datagram. */ +} + +/* + * PIM-SMv2 and PIM-DM message processing. + * Receives and verifies the PIM control messages, and passes them + * up to the listening socket, using rip_input(). + * The only message with special processing is the PIM_REGISTER message + * (used by PIM-SM): the PIM header is stripped off, and the inner packet + * is passed to if_simloop(). + */ +void +pim_input(struct mbuf *m, int off) +{ + struct ip *ip = mtod(m, struct ip *); + struct pim *pim; + int minlen; + int datalen = ip->ip_len; + int ip_tos; + int iphlen = off; + + /* Keep statistics */ + PIMSTAT_INC(pims_rcv_total_msgs); + PIMSTAT_ADD(pims_rcv_total_bytes, datalen); + + /* + * Validate lengths + */ + if (datalen < PIM_MINLEN) { + PIMSTAT_INC(pims_rcv_tooshort); + CTR3(KTR_IPMF, "%s: short packet (%d) from %s", + __func__, datalen, inet_ntoa(ip->ip_src)); + m_freem(m); + return; + } + + /* + * If the packet is at least as big as a REGISTER, go ahead + * and grab the PIM REGISTER header size, to avoid another + * possible m_pullup() later.
+ * + * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 + * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 + */ + minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); + /* + * Get the IP and PIM headers in contiguous memory, and + * possibly the PIM REGISTER header. + */ + if ((m->m_flags & M_EXT || m->m_len < minlen) && + (m = m_pullup(m, minlen)) == 0) { + CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); + return; + } + + /* m_pullup() may have given us a new mbuf so reset ip. */ + ip = mtod(m, struct ip *); + ip_tos = ip->ip_tos; + + /* adjust mbuf to point to the PIM header */ + m->m_data += iphlen; + m->m_len -= iphlen; + pim = mtod(m, struct pim *); + + /* + * Validate checksum. If PIM REGISTER, exclude the data packet. + * + * XXX: some older PIMv2 implementations don't make this distinction, + * so for compatibility reasons perform the checksum over part of the + * message, and if that fails, then over the whole message. + */ + if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { + /* do nothing, checksum okay */ + } else if (in_cksum(m, datalen)) { + PIMSTAT_INC(pims_rcv_badsum); + CTR1(KTR_IPMF, "%s: invalid checksum", __func__); + m_freem(m); + return; + } + + /* PIM version check */ + if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { + PIMSTAT_INC(pims_rcv_badversion); + CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, + (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); + m_freem(m); + return; + } + + /* restore mbuf back to the outer IP */ + m->m_data -= iphlen; + m->m_len += iphlen; + + if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { + /* + * Since this is a REGISTER, we'll make a copy of the register + * headers ip + pim + u_int32 + encap_ip, to be passed up to the + * routing daemon. + */ + struct sockaddr_in dst = { sizeof(dst), AF_INET }; + struct mbuf *mcp; + struct ip *encap_ip; + u_int32_t *reghdr; + struct ifnet *vifp; + + VIF_LOCK(); + if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { + VIF_UNLOCK(); + CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, + (int)V_reg_vif_num); + m_freem(m); + return; + } + /* XXX need refcnt? */ + vifp = V_viftable[V_reg_vif_num].v_ifp; + VIF_UNLOCK(); + + /* + * Validate length + */ + if (datalen < PIM_REG_MINLEN) { + PIMSTAT_INC(pims_rcv_tooshort); + PIMSTAT_INC(pims_rcv_badregisters); + CTR1(KTR_IPMF, "%s: register packet size too small", __func__); + m_freem(m); + return; + } + + reghdr = (u_int32_t *)(pim + 1); + encap_ip = (struct ip *)(reghdr + 1); + + CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", + __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); + + /* verify the version number of the inner packet */ + if (encap_ip->ip_v != IPVERSION) { + PIMSTAT_INC(pims_rcv_badregisters); + CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); + m_freem(m); + return; + } + + /* verify the inner packet is destined to a mcast group */ + if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { + PIMSTAT_INC(pims_rcv_badregisters); + CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, + inet_ntoa(encap_ip->ip_dst)); + m_freem(m); + return; + } + + /* If a NULL_REGISTER, pass it to the daemon */ + if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) + goto pim_input_to_daemon; + + /* + * Copy the TOS from the outer IP header to the inner IP header. + */ + if (encap_ip->ip_tos != ip_tos) { + /* Outer TOS -> inner TOS */ + encap_ip->ip_tos = ip_tos; + /* Recompute the inner header checksum. Sigh...
*/ + + /* adjust mbuf to point to the inner IP header */ + m->m_data += (iphlen + PIM_MINLEN); + m->m_len -= (iphlen + PIM_MINLEN); + + encap_ip->ip_sum = 0; + encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); + + /* restore mbuf to point back to the outer IP header */ + m->m_data -= (iphlen + PIM_MINLEN); + m->m_len += (iphlen + PIM_MINLEN); + } + + /* + * Decapsulate the inner IP packet and loopback to forward it + * as a normal multicast packet. Also, make a copy of the + * outer_iphdr + pimhdr + reghdr + encap_iphdr + * to pass to the daemon later, so it can take the appropriate + * actions (e.g., send back PIM_REGISTER_STOP). + * XXX: here m->m_data points to the outer IP header. + */ + mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); + if (mcp == NULL) { + CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); + m_freem(m); + return; + } + + /* Keep statistics */ + /* XXX: registers_bytes include only the encap. mcast pkt */ + PIMSTAT_INC(pims_rcv_registers_msgs); + PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); + + /* + * forward the inner ip packet; point m_data at the inner ip. + */ + m_adj(m, iphlen + PIM_MINLEN); + + CTR4(KTR_IPMF, + "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", + __func__, + (u_long)ntohl(encap_ip->ip_src.s_addr), + (u_long)ntohl(encap_ip->ip_dst.s_addr), + (int)V_reg_vif_num); + + /* NB: vifp was collected above; can it change on us? */ + if_simloop(vifp, m, dst.sin_family, 0); + + /* prepare the register head to send to the mrouting daemon */ + m = mcp; + } + +pim_input_to_daemon: + /* + * Pass the PIM message up to the daemon; if it is a Register message, + * pass the 'head' only up to the daemon. This includes the + * outer IP header, PIM header, PIM-Register header and the + * inner IP header. + * XXX: the outer IP header pkt size of a Register is not adjusted to + * reflect the fact that the inner multicast data is truncated.
+ */ + rip_input(m, iphlen); + + return; +} + +static int +sysctl_mfctable(SYSCTL_HANDLER_ARGS) +{ + struct mfc *rt; + int error, i; + + if (req->newptr) + return (EPERM); + if (V_mfchashtbl == NULL) /* XXX unlocked */ + return (0); + error = sysctl_wire_old_buffer(req, 0); + if (error) + return (error); + + MFC_LOCK(); + for (i = 0; i < mfchashsize; i++) { + LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { + error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); + if (error) + goto out_locked; + } + } +out_locked: + MFC_UNLOCK(); + return (error); +} + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, + "IPv4 Multicast Forwarding Table (struct *mfc[mfchashsize], " + "netinet/ip_mroute.h)"); + +static void +vnet_mroute_init(const void *unused __unused) +{ + + MALLOC(V_nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); + bzero(V_bw_meter_timers, sizeof(V_bw_meter_timers)); + callout_init(&V_expire_upcalls_ch, CALLOUT_MPSAFE); + callout_init(&V_bw_upcalls_ch, CALLOUT_MPSAFE); + callout_init(&V_bw_meter_ch, CALLOUT_MPSAFE); +} + +VNET_SYSINIT(vnet_mroute_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, vnet_mroute_init, + NULL); + +static void +vnet_mroute_uninit(const void *unused __unused) +{ + + FREE(V_nexpire, M_MRTABLE); + V_nexpire = NULL; +} + +VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, + vnet_mroute_uninit, NULL); + +static int +ip_mroute_modevent(module_t mod, int type, void *unused) +{ + + switch (type) { + case MOD_LOAD: + MROUTER_LOCK_INIT(); + + if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, + if_detached_event, NULL, EVENTHANDLER_PRI_ANY); + if (if_detach_event_tag == NULL) { + printf("ip_mroute: unable to register ifnet_departure_event handler\n"); + MROUTER_LOCK_DESTROY(); + return (EINVAL); + } + + MFC_LOCK_INIT(); + VIF_LOCK_INIT(); + + mfchashsize = MFCHASHSIZE; +#ifndef __rtems__ + if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && + !powerof2(mfchashsize)) { + printf("WARNING: %s not a power of 2; using default\n", + "net.inet.ip.mfchashsize"); + mfchashsize = MFCHASHSIZE; + } +#endif + + pim_squelch_wholepkt = 0; + TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", + &pim_squelch_wholepkt); + + pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, + pim_encapcheck, &in_pim_protosw, NULL); + if (pim_encap_cookie == NULL) { + printf("ip_mroute: unable to attach pim encap\n"); + VIF_LOCK_DESTROY(); + MFC_LOCK_DESTROY(); + MROUTER_LOCK_DESTROY(); + return (EINVAL); + } + + ip_mcast_src = X_ip_mcast_src; + ip_mforward = X_ip_mforward; + ip_mrouter_done = X_ip_mrouter_done; + ip_mrouter_get = X_ip_mrouter_get; + ip_mrouter_set = X_ip_mrouter_set; + + ip_rsvp_force_done = X_ip_rsvp_force_done; + ip_rsvp_vif = X_ip_rsvp_vif; + + legal_vif_num = X_legal_vif_num; + mrt_ioctl = X_mrt_ioctl; + rsvp_input_p = X_rsvp_input; + break; + + case MOD_UNLOAD: + /* + * Typically module unload happens after the user-level + * process has shut down the kernel services (the check + * below ensures someone can't just yank the module out + * from under a running process). But if the module is + * just loaded and then unloaded w/o starting up a user + * process we still need to clean up.
+ */ + MROUTER_LOCK(); + if (ip_mrouter_cnt != 0) { + MROUTER_UNLOCK(); + return (EINVAL); + } + ip_mrouter_unloading = 1; + MROUTER_UNLOCK(); + + EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); + + if (pim_encap_cookie) { + encap_detach(pim_encap_cookie); + pim_encap_cookie = NULL; + } + + ip_mcast_src = NULL; + ip_mforward = NULL; + ip_mrouter_done = NULL; + ip_mrouter_get = NULL; + ip_mrouter_set = NULL; + + ip_rsvp_force_done = NULL; + ip_rsvp_vif = NULL; + + legal_vif_num = NULL; + mrt_ioctl = NULL; + rsvp_input_p = NULL; + + VIF_LOCK_DESTROY(); + MFC_LOCK_DESTROY(); + MROUTER_LOCK_DESTROY(); + break; + + default: + return EOPNOTSUPP; + } + return 0; +} + +static moduledata_t ip_mroutemod = { + "ip_mroute", + ip_mroute_modevent, + 0 +}; + +DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/freebsd/sys/netinet/ip_mroute.h b/freebsd/sys/netinet/ip_mroute.h new file mode 100644 index 00000000..3bc7f52f --- /dev/null +++ b/freebsd/sys/netinet/ip_mroute.h @@ -0,0 +1,359 @@ +/*- + * Copyright (c) 1989 Stephen Deering. + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Stephen Deering of Stanford University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_mroute.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_MROUTE_HH_ +#define _NETINET_IP_MROUTE_HH_ + +/* + * Definitions for IP multicast forwarding. + * + * Written by David Waitzman, BBN Labs, August 1988. + * Modified by Steve Deering, Stanford, February 1989. + * Modified by Ajit Thyagarajan, PARC, August 1993. + * Modified by Ajit Thyagarajan, PARC, August 1994. + * Modified by Ahmed Helmy, SGI, June 1996. + * Modified by Pavlin Radoslavov, ICSI, October 2002. + * + * MROUTING Revision: 3.3.1.3 + * and PIM-SMv2 and PIM-DM support, advanced API support, + * bandwidth metering and signaling. + */ + +/* + * Multicast Routing set/getsockopt commands. 
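+ * + * Added commentary (not part of the original header): all of these are + * issued with [gs]etsockopt(2) at level IPPROTO_IP on the raw IGMP + * socket that performed MRT_INIT. A hypothetical example installing + * vif 0 on local address 192.0.2.1 (vifc_rate_limit must stay 0, as + * rate limiting is no longer supported): + * + *	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 }; + *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1"); + *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));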
+ */ +#define MRT_INIT 100 /* initialize forwarder */ +#define MRT_DONE 101 /* shut down forwarder */ +#define MRT_ADD_VIF 102 /* create virtual interface */ +#define MRT_DEL_VIF 103 /* delete virtual interface */ +#define MRT_ADD_MFC 104 /* insert forwarding cache entry */ +#define MRT_DEL_MFC 105 /* delete forwarding cache entry */ +#define MRT_VERSION 106 /* get kernel version number */ +#define MRT_ASSERT 107 /* enable assert processing */ +#define MRT_PIM MRT_ASSERT /* enable PIM processing */ +#define MRT_API_SUPPORT 109 /* supported MRT API */ +#define MRT_API_CONFIG 110 /* config MRT API */ +#define MRT_ADD_BW_UPCALL 111 /* create bandwidth monitor */ +#define MRT_DEL_BW_UPCALL 112 /* delete bandwidth monitor */ + +/* + * Types and macros for handling bitmaps with one bit per virtual interface. + */ +#define MAXVIFS 32 +typedef u_long vifbitmap_t; +typedef u_short vifi_t; /* type of a vif index */ +#define ALL_VIFS (vifi_t)-1 + +#define VIFM_SET(n, m) ((m) |= (1 << (n))) +#define VIFM_CLR(n, m) ((m) &= ~(1 << (n))) +#define VIFM_ISSET(n, m) ((m) & (1 << (n))) +#define VIFM_CLRALL(m) ((m) = 0x00000000) +#define VIFM_COPY(mfrom, mto) ((mto) = (mfrom)) +#define VIFM_SAME(m1, m2) ((m1) == (m2)) + +struct mfc; + +/* + * Argument structure for MRT_ADD_VIF. + * (MRT_DEL_VIF takes a single vifi_t argument.) + */ +struct vifctl { + vifi_t vifc_vifi; /* the index of the vif to be added */ + u_char vifc_flags; /* VIFF_ flags defined below */ + u_char vifc_threshold; /* min ttl required to forward on vif */ + u_int vifc_rate_limit; /* max rate */ + struct in_addr vifc_lcl_addr; /* local interface address */ + struct in_addr vifc_rmt_addr; /* remote address (tunnels only) */ +}; + +#define VIFF_TUNNEL 0x1 /* no-op; retained for old source */ +#define VIFF_SRCRT 0x2 /* no-op; retained for old source */ +#define VIFF_REGISTER 0x4 /* used for PIM Register encap/decap */ + +/* + * Argument structure for MRT_ADD_MFC and MRT_DEL_MFC + * XXX if you change this, make sure to change struct mfcctl2 as well. + */ +struct mfcctl { + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ +}; + +/* + * The new argument structure for MRT_ADD_MFC and MRT_DEL_MFC overlays + * and extends the old struct mfcctl. + */ +struct mfcctl2 { + /* the mfcctl fields */ + struct in_addr mfcc_origin; /* ip origin of mcasts */ + struct in_addr mfcc_mcastgrp; /* multicast group associated*/ + vifi_t mfcc_parent; /* incoming vif */ + u_char mfcc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + + /* extension fields */ + uint8_t mfcc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */ + struct in_addr mfcc_rp; /* the RP address */ +}; +/* + * The advanced-API flags. + * + * The MRT_MFC_FLAGS_XXX API flags are also used as flags + * for the mfcc_flags field. + */ +#define MRT_MFC_FLAGS_DISABLE_WRONGVIF (1 << 0) /* disable WRONGVIF signals */ +#define MRT_MFC_FLAGS_BORDER_VIF (1 << 1) /* border vif */ +#define MRT_MFC_RP (1 << 8) /* enable RP address */ +#define MRT_MFC_BW_UPCALL (1 << 9) /* enable bw upcalls */ +#define MRT_MFC_FLAGS_ALL (MRT_MFC_FLAGS_DISABLE_WRONGVIF | \ + MRT_MFC_FLAGS_BORDER_VIF) +#define MRT_API_FLAGS_ALL (MRT_MFC_FLAGS_ALL | \ + MRT_MFC_RP | \ + MRT_MFC_BW_UPCALL) + +/* + * Structure for installing or delivering an upcall if the + * measured bandwidth is above or below a threshold. + * + * User programs (e.g. 
daemons) may have a need to know when the + * bandwidth used by some data flow is above or below some threshold. + * This interface allows the userland to specify the threshold (in + * bytes and/or packets) and the measurement interval. Flows are + * all packet with the same source and destination IP address. + * At the moment the code is only used for multicast destinations + * but there is nothing that prevents its use for unicast. + * + * The measurement interval cannot be shorter than some Tmin (currently, 3s). + * The threshold is set in packets and/or bytes per_interval. + * + * Measurement works as follows: + * + * For >= measurements: + * The first packet marks the start of a measurement interval. + * During an interval we count packets and bytes, and when we + * pass the threshold we deliver an upcall and we are done. + * The first packet after the end of the interval resets the + * count and restarts the measurement. + * + * For <= measurement: + * We start a timer to fire at the end of the interval, and + * then for each incoming packet we count packets and bytes. + * When the timer fires, we compare the value with the threshold, + * schedule an upcall if we are below, and restart the measurement + * (reschedule timer and zero counters). + */ + +struct bw_data { + struct timeval b_time; + uint64_t b_packets; + uint64_t b_bytes; +}; + +struct bw_upcall { + struct in_addr bu_src; /* source address */ + struct in_addr bu_dst; /* destination address */ + uint32_t bu_flags; /* misc flags (see below) */ +#define BW_UPCALL_UNIT_PACKETS (1 << 0) /* threshold (in packets) */ +#define BW_UPCALL_UNIT_BYTES (1 << 1) /* threshold (in bytes) */ +#define BW_UPCALL_GEQ (1 << 2) /* upcall if bw >= threshold */ +#define BW_UPCALL_LEQ (1 << 3) /* upcall if bw <= threshold */ +#define BW_UPCALL_DELETE_ALL (1 << 4) /* delete all upcalls for s,d*/ + struct bw_data bu_threshold; /* the bw threshold */ + struct bw_data bu_measured; /* the measured bw */ +}; + +/* max. number of upcalls to deliver together */ +#define BW_UPCALLS_MAX 128 +/* min. threshold time interval for bandwidth measurement */ +#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3 +#define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0 + +/* + * The kernel's multicast routing statistics. + */ +struct mrtstat { + u_long mrts_mfc_lookups; /* # forw. cache hash table hits */ + u_long mrts_mfc_misses; /* # forw. 
cache hash table misses */ + u_long mrts_upcalls; /* # calls to multicast routing daemon */ + u_long mrts_no_route; /* no route for packet's origin */ + u_long mrts_bad_tunnel; /* malformed tunnel options */ + u_long mrts_cant_tunnel; /* no room for tunnel options */ + u_long mrts_wrong_if; /* arrived on wrong interface */ + u_long mrts_upq_ovflw; /* upcall Q overflow */ + u_long mrts_cache_cleanups; /* # entries with no upcalls */ + u_long mrts_drop_sel; /* pkts dropped selectively */ + u_long mrts_q_overflow; /* pkts dropped - Q overflow */ + u_long mrts_pkt2large; /* pkts dropped - size > BKT SIZE */ + u_long mrts_upq_sockfull; /* upcalls dropped - socket full */ +}; + +#ifdef _KERNEL +#define MRTSTAT_ADD(name, val) V_mrtstat.name += (val) +#define MRTSTAT_INC(name) MRTSTAT_ADD(name, 1) +#endif + +/* + * Argument structure used by mrouted to get src-grp pkt counts + */ +struct sioc_sg_req { + struct in_addr src; + struct in_addr grp; + u_long pktcnt; + u_long bytecnt; + u_long wrong_if; +}; + +/* + * Argument structure used by mrouted to get vif pkt counts + */ +struct sioc_vif_req { + vifi_t vifi; /* vif number */ + u_long icount; /* Input packet count on vif */ + u_long ocount; /* Output packet count on vif */ + u_long ibytes; /* Input byte count on vif */ + u_long obytes; /* Output byte count on vif */ +}; + + +/* + * The kernel's virtual-interface structure. + */ +struct vif { + u_char v_flags; /* VIFF_ flags defined above */ + u_char v_threshold; /* min ttl required to forward on vif*/ + struct in_addr v_lcl_addr; /* local interface address */ + struct in_addr v_rmt_addr; /* remote address (tunnels only) */ + struct ifnet *v_ifp; /* pointer to interface */ + u_long v_pkt_in; /* # pkts in on interface */ + u_long v_pkt_out; /* # pkts out on interface */ + u_long v_bytes_in; /* # bytes in on interface */ + u_long v_bytes_out; /* # bytes out on interface */ + struct route v_route; /* cached route */ +}; + +#ifdef _KERNEL +/* + * The kernel's multicast forwarding cache entry structure + */ +struct mfc { + LIST_ENTRY(mfc) mfc_hash; + struct in_addr mfc_origin; /* IP origin of mcasts */ + struct in_addr mfc_mcastgrp; /* multicast group associated*/ + vifi_t mfc_parent; /* incoming vif */ + u_char mfc_ttls[MAXVIFS]; /* forwarding ttls on vifs */ + u_long mfc_pkt_cnt; /* pkt count for src-grp */ + u_long mfc_byte_cnt; /* byte count for src-grp */ + u_long mfc_wrong_if; /* wrong if for src-grp */ + int mfc_expire; /* time to clean entry up */ + struct timeval mfc_last_assert; /* last time I sent an assert*/ + uint8_t mfc_flags[MAXVIFS]; /* the MRT_MFC_FLAGS_* flags */ + struct in_addr mfc_rp; /* the RP address */ + struct bw_meter *mfc_bw_meter; /* list of bandwidth meters */ + u_long mfc_nstall; /* # of packets awaiting mfc */ + TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */ +}; +#endif /* _KERNEL */ + +/* + * Struct used to communicate from kernel to multicast router + * note the convenient similarity to an IP packet + */ +struct igmpmsg { + uint32_t unused1; + uint32_t unused2; + u_char im_msgtype; /* what type of message */ +#define IGMPMSG_NOCACHE 1 /* no MFC in the kernel */ +#define IGMPMSG_WRONGVIF 2 /* packet came from wrong interface */ +#define IGMPMSG_WHOLEPKT 3 /* PIM pkt for user level encap. */ +#define IGMPMSG_BW_UPCALL 4 /* BW monitoring upcall */ + u_char im_mbz; /* must be zero */ + u_char im_vif; /* vif rec'd on */ + u_char unused3; + struct in_addr im_src, im_dst; +}; + +#ifdef _KERNEL +/* + * Argument structure used for pkt info. 
while upcall is made + */ +struct rtdetq { + TAILQ_ENTRY(rtdetq) rte_link; + struct mbuf *m; /* A copy of the packet */ + struct ifnet *ifp; /* Interface pkt came in on */ + vifi_t xmt_vif; /* Saved copy of imo_multicast_vif */ +}; +#define MAX_UPQ 4 /* max. no of pkts in upcall Q */ +#endif /* _KERNEL */ + +/* + * Structure for measuring the bandwidth and sending an upcall if the + * measured bandwidth is above or below a threshold. + */ +struct bw_meter { + struct bw_meter *bm_mfc_next; /* next bw meter (same mfc) */ + struct bw_meter *bm_time_next; /* next bw meter (same time) */ + uint32_t bm_time_hash; /* the time hash value */ + struct mfc *bm_mfc; /* the corresponding mfc */ + uint32_t bm_flags; /* misc flags (see below) */ +#define BW_METER_UNIT_PACKETS (1 << 0) /* threshold (in packets) */ +#define BW_METER_UNIT_BYTES (1 << 1) /* threshold (in bytes) */ +#define BW_METER_GEQ (1 << 2) /* upcall if bw >= threshold */ +#define BW_METER_LEQ (1 << 3) /* upcall if bw <= threshold */ +#define BW_METER_USER_FLAGS (BW_METER_UNIT_PACKETS | \ + BW_METER_UNIT_BYTES | \ + BW_METER_GEQ | \ + BW_METER_LEQ) + +#define BW_METER_UPCALL_DELIVERED (1 << 24) /* upcall was delivered */ + + struct bw_data bm_threshold; /* the upcall threshold */ + struct bw_data bm_measured; /* the measured bw */ + struct timeval bm_start_time; /* abs. time */ +}; + +#ifdef _KERNEL + +struct sockopt; + +extern int (*ip_mrouter_set)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_get)(struct socket *, struct sockopt *); +extern int (*ip_mrouter_done)(void); +extern int (*mrt_ioctl)(u_long, caddr_t, int); + +#endif /* _KERNEL */ + +#endif /* _NETINET_IP_MROUTE_HH_ */ diff --git a/freebsd/sys/netinet/ip_options.c b/freebsd/sys/netinet/ip_options.c new file mode 100644 index 00000000..f8b31607 --- /dev/null +++ b/freebsd/sys/netinet/ip_options.c @@ -0,0 +1,747 @@ +#include + +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. + * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +static int ip_dosourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, + &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); + +static int ip_acceptsourceroute = 0; +SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, + CTLFLAG_RW, &ip_acceptsourceroute, 0, + "Enable accepting source routed IP packets"); + +int ip_doopts = 1; /* 0 = ignore, 1 = process, 2 = reject */ +SYSCTL_INT(_net_inet_ip, OID_AUTO, process_options, CTLFLAG_RW, + &ip_doopts, 0, "Enable IP options processing ([LS]SRR, RR, TS)"); + +static void save_rte(struct mbuf *m, u_char *, struct in_addr); + +/* + * Do option processing on a datagram, possibly discarding it if bad options + * are encountered, or forwarding it if source-routed. + * + * The pass argument is used when operating in the IPSTEALTH mode to tell + * what options to process: [LS]SRR (pass 0) or the others (pass 1). The + * reason for as many as two passes is that when doing IPSTEALTH, non-routing + * options should be processed only if the packet is for us. + * + * Returns 1 if packet has been forwarded/freed, 0 if the packet should be + * processed further. + */ +int +ip_dooptions(struct mbuf *m, int pass) +{ + struct ip *ip = mtod(m, struct ip *); + u_char *cp; + struct in_ifaddr *ia; + int opt, optlen, cnt, off, code, type = ICMP_PARAMPROB, forward = 0; + struct in_addr *sin, dst; + uint32_t ntime; + struct sockaddr_in ipaddr = { sizeof(ipaddr), AF_INET }; + + /* Ignore or reject packets with IP options. */ + if (ip_doopts == 0) + return 0; + else if (ip_doopts == 2) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_FILTER_PROHIB; + goto bad; + } + + dst = ip->ip_dst; + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + } + switch (opt) { + + default: + break; + + /* + * Source routing with record. Find interface with current + * destination address. If none on this machine then drop if + * strictly routed, or do nothing if loosely routed. Record + * interface address and bring up next address component. If + * strictly routed make sure next address is on directly + * accessible net. 
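+ */
+
+	/*
+	 * Editorial sketch, illustration only: the option layout walked
+	 * by the cases below is the RFC 791 one -- type, length,
+	 * 1-origin pointer, then 4-byte addresses.  srr_next_hop(), a
+	 * hypothetical helper, shows the same bounds checks in
+	 * isolation.
+	 */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+/* Returns 1 and stores the next hop, 0 when the recorded route is
+ * used up (the packet is then for us), -1 on a malformed pointer. */
+static int
+srr_next_hop(const uint8_t *opt, uint32_t *hop)
+{
+	int olen = opt[1];	/* IPOPT_OLEN */
+	int off = opt[2];	/* IPOPT_OFFSET, 1-origin */
+
+	if (off < 4)		/* IPOPT_MINOFF */
+		return (-1);
+	off--;			/* 0 origin, as below */
+	if (off > olen - (int)sizeof(*hop))
+		return (0);	/* end of source route */
+	memcpy(hop, opt + off, sizeof(*hop));
+	return (1);
+}
+#endif
+
+	/*
+	 * The cases themselves: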
+ */ + case IPOPT_LSRR: + case IPOPT_SSRR: +#ifdef IPSTEALTH + if (V_ipstealth && pass > 0) + break; +#endif + if (optlen < IPOPT_OFFSET + sizeof(*cp)) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = ip->ip_dst; + if (ifa_ifwithaddr_check((struct sockaddr *)&ipaddr) + == 0) { + if (opt == IPOPT_SSRR) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + if (!ip_dosourceroute) + goto nosourcerouting; + /* + * Loose routing, and not at next destination + * yet; nothing to do except forward. + */ + break; + } + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) { + /* + * End of source route. Should be for us. + */ + if (!ip_acceptsourceroute) + goto nosourcerouting; + save_rte(m, cp, ip->ip_src); + break; + } +#ifdef IPSTEALTH + if (V_ipstealth) + goto dropit; +#endif + if (!ip_dosourceroute) { + if (V_ipforwarding) { + char buf[16]; /* aaa.bbb.ccc.ddd\0 */ + /* + * Acting as a router, so generate + * ICMP + */ +nosourcerouting: + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_WARNING, + "attempted source route from %s to %s\n", + inet_ntoa(ip->ip_src), buf); + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } else { + /* + * Not acting as a router, so + * silently drop. + */ +#ifdef IPSTEALTH +dropit: +#endif + IPSTAT_INC(ips_cantforward); + m_freem(m); + return (1); + } + } + + /* + * locate outgoing interface + */ + (void)memcpy(&ipaddr.sin_addr, cp + off, + sizeof(ipaddr.sin_addr)); + + if (opt == IPOPT_SSRR) { +#define INA struct in_ifaddr * +#define SA struct sockaddr * + if ((ia = (INA)ifa_ifwithdstaddr((SA)&ipaddr)) == NULL) + ia = (INA)ifa_ifwithnet((SA)&ipaddr, 0); + } else +/* XXX MRT 0 for routing */ + ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m)); + if (ia == NULL) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_SRCFAIL; + goto bad; + } + ip->ip_dst = ipaddr.sin_addr; + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + ifa_free(&ia->ia_ifa); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + /* + * Let ip_intr's mcast routing check handle mcast pkts + */ + forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr)); + break; + + case IPOPT_RR: +#ifdef IPSTEALTH + if (V_ipstealth && pass == 0) + break; +#endif + if (optlen < IPOPT_OFFSET + sizeof(*cp)) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + /* + * If no space remains, ignore. + */ + off--; /* 0 origin */ + if (off > optlen - (int)sizeof(struct in_addr)) + break; + (void)memcpy(&ipaddr.sin_addr, &ip->ip_dst, + sizeof(ipaddr.sin_addr)); + /* + * Locate outgoing interface; if we're the + * destination, use the incoming interface (should be + * same). 
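+ */
+
+	/*
+	 * Editorial illustration, not in the original: a maximally
+	 * sized record-route option as a sender builds it -- type 7
+	 * (IPOPT_RR), length 39 (room for nine addresses), pointer 4 --
+	 * and the append step each hop performs; rr_append() is a
+	 * hypothetical helper.
+	 */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+static uint8_t rr_opt[39] = { 7, 39, 4 };	/* type, len, 1-origin ptr */
+
+static void
+rr_append(uint8_t *opt, uint32_t addr)
+{
+	int off;
+
+	if (opt[2] < 4)
+		return;			/* malformed pointer */
+	off = opt[2] - 1;		/* back to 0 origin */
+	if (off > opt[1] - (int)sizeof(addr))
+		return;			/* no space remains: ignore */
+	memcpy(opt + off, &addr, sizeof(addr));
+	opt[2] += sizeof(addr);		/* advance the pointer */
+}
+#endif
+
+	/*
+	 * Do the lookup: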
+ */ + if ((ia = (INA)ifa_ifwithaddr((SA)&ipaddr)) == NULL && + (ia = ip_rtaddr(ipaddr.sin_addr, M_GETFIB(m))) == NULL) { + type = ICMP_UNREACH; + code = ICMP_UNREACH_HOST; + goto bad; + } + (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), + sizeof(struct in_addr)); + ifa_free(&ia->ia_ifa); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + break; + + case IPOPT_TS: +#ifdef IPSTEALTH + if (V_ipstealth && pass == 0) + break; +#endif + code = cp - (u_char *)ip; + if (optlen < 4 || optlen > 40) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if ((off = cp[IPOPT_OFFSET]) < 5) { + code = &cp[IPOPT_OLEN] - (u_char *)ip; + goto bad; + } + if (off > optlen - (int)sizeof(int32_t)) { + cp[IPOPT_OFFSET + 1] += (1 << 4); + if ((cp[IPOPT_OFFSET + 1] & 0xf0) == 0) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + break; + } + off--; /* 0 origin */ + sin = (struct in_addr *)(cp + off); + switch (cp[IPOPT_OFFSET + 1] & 0x0f) { + + case IPOPT_TS_TSONLY: + break; + + case IPOPT_TS_TSANDADDR: + if (off + sizeof(uint32_t) + + sizeof(struct in_addr) > optlen) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + ipaddr.sin_addr = dst; + ia = (INA)ifaof_ifpforaddr((SA)&ipaddr, + m->m_pkthdr.rcvif); + if (ia == NULL) + continue; + (void)memcpy(sin, &IA_SIN(ia)->sin_addr, + sizeof(struct in_addr)); + ifa_free(&ia->ia_ifa); + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + off += sizeof(struct in_addr); + break; + + case IPOPT_TS_PRESPEC: + if (off + sizeof(uint32_t) + + sizeof(struct in_addr) > optlen) { + code = &cp[IPOPT_OFFSET] - (u_char *)ip; + goto bad; + } + (void)memcpy(&ipaddr.sin_addr, sin, + sizeof(struct in_addr)); + if (ifa_ifwithaddr_check((SA)&ipaddr) == 0) + continue; + cp[IPOPT_OFFSET] += sizeof(struct in_addr); + off += sizeof(struct in_addr); + break; + + default: + code = &cp[IPOPT_OFFSET + 1] - (u_char *)ip; + goto bad; + } + ntime = iptime(); + (void)memcpy(cp + off, &ntime, sizeof(uint32_t)); + cp[IPOPT_OFFSET] += sizeof(uint32_t); + } + } + if (forward && V_ipforwarding) { + ip_forward(m, 1); + return (1); + } + return (0); +bad: + icmp_error(m, type, code, 0, 0); + IPSTAT_INC(ips_badoptions); + return (1); +} + +/* + * Save incoming source route for use in replies, to be picked up later by + * ip_srcroute if the receiver is interested. + */ +static void +save_rte(struct mbuf *m, u_char *option, struct in_addr dst) +{ + unsigned olen; + struct ipopt_tag *opts; + + opts = (struct ipopt_tag *)m_tag_get(PACKET_TAG_IPOPTIONS, + sizeof(struct ipopt_tag), M_NOWAIT); + if (opts == NULL) + return; + + olen = option[IPOPT_OLEN]; + if (olen > sizeof(opts->ip_srcrt) - (1 + sizeof(dst))) { + m_tag_free((struct m_tag *)opts); + return; + } + bcopy(option, opts->ip_srcrt.srcopt, olen); + opts->ip_nhops = (olen - IPOPT_OFFSET - 1) / sizeof(struct in_addr); + opts->ip_srcrt.dst = dst; + m_tag_prepend(m, (struct m_tag *)opts); +} + +/* + * Retrieve incoming source route for use in replies, in the same form used + * by setsockopt. The first hop is placed before the options, will be + * removed later. 
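+ */
+
+/*
+ * Editorial sketch of the reversal performed below, on plain arrays:
+ * the reply's source route is the recorded route walked backwards,
+ * with the original sender as final destination.  reverse_route() is
+ * a hypothetical helper; it assumes nhops >= 1.
+ */
+#if 0
+#include <stddef.h>
+#include <stdint.h>
+
+static void
+reverse_route(const uint32_t *rec, size_t nhops, uint32_t orig_src,
+    uint32_t *first_hop, uint32_t *out)
+{
+	size_t i;
+
+	/* The last recorded hop becomes the reply's first hop... */
+	*first_hop = rec[nhops - 1];
+	/* ...the remaining hops are emitted in reverse order... */
+	for (i = 1; i < nhops; i++)
+		out[i - 1] = rec[nhops - 1 - i];
+	/* ...and the original sender is the final destination. */
+	out[nhops - 1] = orig_src;
+}
+#endif
+
+/*
+ * ip_srcroute itself: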
+ */ +struct mbuf * +ip_srcroute(struct mbuf *m0) +{ + struct in_addr *p, *q; + struct mbuf *m; + struct ipopt_tag *opts; + + opts = (struct ipopt_tag *)m_tag_find(m0, PACKET_TAG_IPOPTIONS, NULL); + if (opts == NULL) + return (NULL); + + if (opts->ip_nhops == 0) + return (NULL); + m = m_get(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (NULL); + +#define OPTSIZ (sizeof(opts->ip_srcrt.nop) + sizeof(opts->ip_srcrt.srcopt)) + + /* length is (nhops+1)*sizeof(addr) + sizeof(nop + srcrt header) */ + m->m_len = opts->ip_nhops * sizeof(struct in_addr) + + sizeof(struct in_addr) + OPTSIZ; + + /* + * First, save first hop for return route. + */ + p = &(opts->ip_srcrt.route[opts->ip_nhops - 1]); + *(mtod(m, struct in_addr *)) = *p--; + + /* + * Copy option fields and padding (nop) to mbuf. + */ + opts->ip_srcrt.nop = IPOPT_NOP; + opts->ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF; + (void)memcpy(mtod(m, caddr_t) + sizeof(struct in_addr), + &(opts->ip_srcrt.nop), OPTSIZ); + q = (struct in_addr *)(mtod(m, caddr_t) + + sizeof(struct in_addr) + OPTSIZ); +#undef OPTSIZ + /* + * Record return path as an IP source route, reversing the path + * (pointers are now aligned). + */ + while (p >= opts->ip_srcrt.route) { + *q++ = *p--; + } + /* + * Last hop goes to final destination. + */ + *q = opts->ip_srcrt.dst; + m_tag_delete(m0, (struct m_tag *)opts); + return (m); +} + +/* + * Strip out IP options, at higher level protocol in the kernel. Second + * argument is buffer to which options will be moved, and return value is + * their length. + * + * XXX should be deleted; last arg currently ignored. + */ +void +ip_stripoptions(struct mbuf *m, struct mbuf *mopt) +{ + int i; + struct ip *ip = mtod(m, struct ip *); + caddr_t opts; + int olen; + + olen = (ip->ip_hl << 2) - sizeof (struct ip); + opts = (caddr_t)(ip + 1); + i = m->m_len - (sizeof (struct ip) + olen); + bcopy(opts + olen, opts, (unsigned)i); + m->m_len -= olen; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len -= olen; + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(struct ip) >> 2; +} + +/* + * Insert IP options into preformed packet. Adjust IP destination as + * required for IP source routing, as indicated by a non-zero in_addr at the + * start of the options. + * + * XXX This routine assumes that the packet has no options in place. 
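+ */
+
+/*
+ * Editorial note with a small worked check, not in the original: the
+ * header length written back below is sizeof(struct ip) plus the
+ * option length, and ip_hl carries it in 32-bit words, so options
+ * must be padded to a multiple of 4 and cannot exceed 40 bytes
+ * (ip_hl is a 4-bit field: at most 15 words, a 60-byte header).
+ */
+#if 0
+#include <assert.h>
+
+static unsigned
+ipv4_header_len(unsigned optlen)
+{
+	assert(optlen % 4 == 0 && optlen <= 40);
+	return (20 + optlen);		/* ip_hl = (20 + optlen) >> 2 */
+}
+#endif
+
+/*
+ * ip_insertoptions itself: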
+ */ +struct mbuf * +ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) +{ + struct ipoption *p = mtod(opt, struct ipoption *); + struct mbuf *n; + struct ip *ip = mtod(m, struct ip *); + unsigned optlen; + + optlen = opt->m_len - sizeof(p->ipopt_dst); + if (optlen + ip->ip_len > IP_MAXPACKET) { + *phlen = 0; + return (m); /* XXX should fail */ + } + if (p->ipopt_dst.s_addr) + ip->ip_dst = p->ipopt_dst; + if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { + MGETHDR(n, M_DONTWAIT, MT_DATA); + if (n == NULL) { + *phlen = 0; + return (m); + } + M_MOVE_PKTHDR(n, m); + n->m_pkthdr.rcvif = NULL; + n->m_pkthdr.len += optlen; + m->m_len -= sizeof(struct ip); + m->m_data += sizeof(struct ip); + n->m_next = m; + m = n; + m->m_len = optlen + sizeof(struct ip); + m->m_data += max_linkhdr; + bcopy(ip, mtod(m, void *), sizeof(struct ip)); + } else { + m->m_data -= optlen; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + bcopy(ip, mtod(m, void *), sizeof(struct ip)); + } + ip = mtod(m, struct ip *); + bcopy(p->ipopt_list, ip + 1, optlen); + *phlen = sizeof(struct ip) + optlen; + ip->ip_v = IPVERSION; + ip->ip_hl = *phlen >> 2; + ip->ip_len += optlen; + return (m); +} + +/* + * Copy options from ip to jp, omitting those not copied during + * fragmentation. + */ +int +ip_optcopy(struct ip *ip, struct ip *jp) +{ + u_char *cp, *dp; + int opt, optlen, cnt; + + cp = (u_char *)(ip + 1); + dp = (u_char *)(jp + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) { + /* Preserve for IP mcast tunnel's LSRR alignment. */ + *dp++ = IPOPT_NOP; + optlen = 1; + continue; + } + + KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp), + ("ip_optcopy: malformed ipv4 option")); + optlen = cp[IPOPT_OLEN]; + KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen <= cnt, + ("ip_optcopy: malformed ipv4 option")); + + /* Bogus lengths should have been caught by ip_dooptions. */ + if (optlen > cnt) + optlen = cnt; + if (IPOPT_COPIED(opt)) { + bcopy(cp, dp, optlen); + dp += optlen; + } + } + for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) + *dp++ = IPOPT_EOL; + return (optlen); +} + +/* + * Set up IP options in pcb for insertion in output packets. Store in mbuf + * with pointer in pcbopt, adding pseudo-option with destination address if + * source routed. + */ +int +ip_pcbopts(struct inpcb *inp, int optname, struct mbuf *m) +{ + int cnt, optlen; + u_char *cp; + struct mbuf **pcbopt; + u_char opt; + + INP_WLOCK_ASSERT(inp); + + pcbopt = &inp->inp_options; + + /* turn off any old options */ + if (*pcbopt) + (void)m_free(*pcbopt); + *pcbopt = 0; + if (m == NULL || m->m_len == 0) { + /* + * Only turning off any previous options. + */ + if (m != NULL) + (void)m_free(m); + return (0); + } + + if (m->m_len % sizeof(int32_t)) + goto bad; + /* + * IP first-hop destination address will be stored before actual + * options; move other options back and clear it when none present. 
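+ */
+
+	/*
+	 * Editorial sketch, illustration only: the buffer a user
+	 * process hands to setsockopt(IP_OPTIONS) for the "->A->B->C->D"
+	 * case in the comment further down -- a NOP for alignment, then
+	 * an LSRR option carrying every hop including first hop A
+	 * (which this routine pulls out in front of the options).
+	 * build_lsrr() is a hypothetical helper; the total length is
+	 * 4 + 4*nhops, satisfying the m_len % sizeof(int32_t) check
+	 * above.
+	 */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+static int
+build_lsrr(uint8_t *buf, const uint32_t *hops, int nhops)
+{
+	buf[0] = 1;			/* IPOPT_NOP, for alignment */
+	buf[1] = 131;			/* IPOPT_LSRR */
+	buf[2] = 3 + 4 * nhops;		/* option length */
+	buf[3] = 4;			/* IPOPT_MINOFF, 1-origin pointer */
+	memcpy(buf + 4, hops, 4 * nhops);
+	return (4 + 4 * nhops);		/* multiple of sizeof(int32_t) */
+}
+#endif
+
+	/*
+	 * Make room for the first-hop address: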
+ */ + if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN]) + goto bad; + cnt = m->m_len; + m->m_len += sizeof(struct in_addr); + cp = mtod(m, u_char *) + sizeof(struct in_addr); + bcopy(mtod(m, void *), cp, (unsigned)cnt); + bzero(mtod(m, void *), sizeof(struct in_addr)); + + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + if (cnt < IPOPT_OLEN + sizeof(*cp)) + goto bad; + optlen = cp[IPOPT_OLEN]; + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + goto bad; + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + case IPOPT_SSRR: + /* + * User process specifies route as: + * + * ->A->B->C->D + * + * D must be our final destination (but we can't + * check that since we may not have connected yet). + * A is first hop destination, which doesn't appear + * in actual IP option, but is stored before the + * options. + */ + /* XXX-BZ PRIV_NETINET_SETHDROPTS? */ + if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) + goto bad; + m->m_len -= sizeof(struct in_addr); + cnt -= sizeof(struct in_addr); + optlen -= sizeof(struct in_addr); + cp[IPOPT_OLEN] = optlen; + /* + * Move first hop before start of options. + */ + bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t), + sizeof(struct in_addr)); + /* + * Then copy rest of options back + * to close up the deleted entry. + */ + bcopy((&cp[IPOPT_OFFSET+1] + sizeof(struct in_addr)), + &cp[IPOPT_OFFSET+1], + (unsigned)cnt - (IPOPT_MINOFF - 1)); + break; + } + } + if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) + goto bad; + *pcbopt = m; + return (0); + +bad: + (void)m_free(m); + return (EINVAL); +} + +/* + * Check for the presence of the IP Router Alert option [RFC2113] + * in the header of an IPv4 datagram. + * + * This call is not intended for use from the forwarding path; it is here + * so that protocol domains may check for the presence of the option. + * Given how FreeBSD's IPv4 stack is currently structured, the Router Alert + * option does not have much relevance to the implementation, though this + * may change in future. + * Router alert options SHOULD be passed if running in IPSTEALTH mode and + * we are not the endpoint. + * Length checks on individual options should already have been peformed + * by ip_dooptions() therefore they are folded under INVARIANTS here. + * + * Return zero if not present or options are invalid, non-zero if present. 
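+ */
+
+/*
+ * Editorial illustration, not in the original: the RFC 2113 Router
+ * Alert option this routine looks for is exactly four bytes -- type
+ * 148 (copied, class 0, number 20), length 4, and a two-octet value
+ * of zero ("router shall examine packet"); any other value fails the
+ * INVARIANTS check below.
+ */
+#if 0
+#include <stdint.h>
+
+static const uint8_t ipopt_ra[4] = { 148, 4, 0, 0 };
+#endif
+
+/*
+ * The scan itself: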
+ */ +int +ip_checkrouteralert(struct mbuf *m) +{ + struct ip *ip = mtod(m, struct ip *); + u_char *cp; + int opt, optlen, cnt, found_ra; + + found_ra = 0; + cp = (u_char *)(ip + 1); + cnt = (ip->ip_hl << 2) - sizeof (struct ip); + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[IPOPT_OPTVAL]; + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { +#ifdef INVARIANTS + if (cnt < IPOPT_OLEN + sizeof(*cp)) + break; +#endif + optlen = cp[IPOPT_OLEN]; +#ifdef INVARIANTS + if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) + break; +#endif + } + switch (opt) { + case IPOPT_RA: +#ifdef INVARIANTS + if (optlen != IPOPT_OFFSET + sizeof(uint16_t) || + (*((uint16_t *)&cp[IPOPT_OFFSET]) != 0)) + break; + else +#endif + found_ra = 1; + break; + default: + break; + } + } + + return (found_ra); +} diff --git a/freebsd/sys/netinet/ip_options.h b/freebsd/sys/netinet/ip_options.h new file mode 100644 index 00000000..9c08004d --- /dev/null +++ b/freebsd/sys/netinet/ip_options.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * Copyright (c) 2005 Andre Oppermann, Internet Business Solutions AG. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_OPTIONS_HH_ +#define _NETINET_IP_OPTIONS_HH_ + +struct ipoptrt { + struct in_addr dst; /* final destination */ + char nop; /* one NOP to align */ + char srcopt[IPOPT_OFFSET + 1]; /* OPTVAL, OLEN and OFFSET */ + struct in_addr route[MAX_IPOPTLEN/sizeof(struct in_addr)]; +}; + +struct ipopt_tag { + struct m_tag tag; /* m_tag */ + int ip_nhops; + struct ipoptrt ip_srcrt; +}; + +extern int ip_doopts; /* process or ignore IP options */ + +int ip_checkrouteralert(struct mbuf *); +int ip_dooptions(struct mbuf *, int); +struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); +int ip_optcopy(struct ip *, struct ip *); +int ip_pcbopts(struct inpcb *, int, struct mbuf *); +void ip_stripoptions(struct mbuf *, struct mbuf *); +struct mbuf *ip_srcroute(struct mbuf *); + +#endif /* !_NETINET_IP_OPTIONS_HH_ */ diff --git a/freebsd/sys/netinet/ip_output.c b/freebsd/sys/netinet/ip_output.c new file mode 100644 index 00000000..51132333 --- /dev/null +++ b/freebsd/sys/netinet/ip_output.c @@ -0,0 +1,1284 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef RADIX_MPATH +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef SCTP +#include +#include +#endif + +#ifdef IPSEC +#include +#include +#endif /* IPSEC*/ + +#include + +#include + +#define print_ip(x, a, y) printf("%s %d.%d.%d.%d%s",\ + x, (ntohl(a.s_addr)>>24)&0xFF,\ + (ntohl(a.s_addr)>>16)&0xFF,\ + (ntohl(a.s_addr)>>8)&0xFF,\ + (ntohl(a.s_addr))&0xFF, y); + +VNET_DEFINE(u_short, ip_id); + +#ifdef MBUF_STRESS_TEST +int mbuf_frag_size = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, + &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); +#endif + +static void ip_mloopback + (struct ifnet *, struct mbuf *, struct sockaddr_in *, int); + + +extern int in_mcast_loop; +extern struct protosw inetsw[]; + +/* + * IP output. The packet in mbuf chain m contains a skeletal IP + * header (with len, off, ttl, proto, tos, src, dst). + * The mbuf chain containing the packet will be freed. + * The mbuf opt, if present, will not be freed. + * In the IP forwarding case, the packet will arrive with options already + * inserted, so must have a NULL opt pointer. + */ +int +ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, + struct ip_moptions *imo, struct inpcb *inp) +{ + struct ip *ip; + struct ifnet *ifp = NULL; /* keep compiler happy */ + struct mbuf *m0; + int hlen = sizeof (struct ip); + int mtu; + int len, error = 0; + int nortfree = 0; + struct sockaddr_in *dst = NULL; /* keep compiler happy */ + struct in_ifaddr *ia = NULL; + int isbroadcast, sw_csum; + struct route iproute; + struct rtentry *rte; /* cache for ro->ro_rt */ + struct in_addr odst; +#ifdef IPFIREWALL_FORWARD + struct m_tag *fwd_tag = NULL; +#endif +#ifdef IPSEC + int no_route_but_check_spd = 0; +#endif + M_ASSERTPKTHDR(m); + + if (inp != NULL) { + INP_LOCK_ASSERT(inp); + M_SETFIB(m, inp->inp_inc.inc_fibnum); + if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + m->m_pkthdr.flowid = inp->inp_flowid; + m->m_flags |= M_FLOWID; + } + } + + if (ro == NULL) { + ro = &iproute; + bzero(ro, sizeof (*ro)); + +#ifdef FLOWTABLE + { + struct flentry *fle; + + /* + * The flow table returns route entries valid for up to 30 + * seconds; we rely on the remainder of ip_output() taking no + * longer than that long for the stability of ro_rt. The + * flow ID assignment must have happened before this point. + */ + if ((fle = flowtable_lookup_mbuf(V_ip_ft, m, AF_INET)) != NULL) { + flow_to_route(fle, ro); + nortfree = 1; + } + } +#endif + } + + if (opt) { + len = 0; + m = ip_insertoptions(m, opt, &len); + if (len != 0) + hlen = len; + } + ip = mtod(m, struct ip *); + + /* + * Fill in IP header. If we are not allowing fragmentation, + * then the ip_id field is meaningless, but we don't set it + * to zero. Doing so causes various problems when devices along + * the path (routers, load balancers, firewalls, etc.) illegally + * disable DF on our packet. Note that a 16-bit counter + * will wrap around in less than 10 seconds at 100 Mbit/s on a + * medium with MTU 1500. See Steven M. Bellovin, "A Technique + * for Counting NATted Hosts", Proc. IMW'02, available at + * . 
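+ */
+
+	/*
+	 * Editorial aside, a back-of-the-envelope check of the wrap
+	 * claim above; ip_id_wrap_seconds() is a hypothetical helper.
+	 */
+#if 0
+static double
+ip_id_wrap_seconds(double bits_per_sec, double bytes_per_pkt)
+{
+	double pkts_per_sec = bits_per_sec / (8.0 * bytes_per_pkt);
+
+	/* 100e6 / (8 * 1500) ~= 8333 pkt/s; 65536 / 8333 ~= 7.9 s. */
+	return (65536.0 / pkts_per_sec);
+}
+#endif
+
+	/*
+	 * Fill in the header: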
+ */ + if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { + ip->ip_v = IPVERSION; + ip->ip_hl = hlen >> 2; + ip->ip_id = ip_newid(); + IPSTAT_INC(ips_localout); + } else { + hlen = ip->ip_hl << 2; + } + + dst = (struct sockaddr_in *)&ro->ro_dst; +again: + /* + * If there is a cached route, + * check that it is to the same destination + * and is still up. If not, free it and try again. + * The address family should also be checked in case of sharing the + * cache with IPv6. + */ + rte = ro->ro_rt; + if (rte && ((rte->rt_flags & RTF_UP) == 0 || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp) || + dst->sin_family != AF_INET || + dst->sin_addr.s_addr != ip->ip_dst.s_addr)) { + if (!nortfree) + RTFREE(rte); + rte = ro->ro_rt = (struct rtentry *)NULL; + ro->ro_lle = (struct llentry *)NULL; + } +#ifdef IPFIREWALL_FORWARD + if (rte == NULL && fwd_tag == NULL) { +#else + if (rte == NULL) { +#endif + bzero(dst, sizeof(*dst)); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = ip->ip_dst; + } + /* + * If routing to interface only, short circuit routing lookup. + * The use of an all-ones broadcast address implies this; an + * interface is specified by the broadcast address of an interface, + * or the destination address of a ptp interface. + */ + if (flags & IP_SENDONES) { + if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst)))) == NULL && + (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) { + IPSTAT_INC(ips_noroute); + error = ENETUNREACH; + goto bad; + } + ip->ip_dst.s_addr = INADDR_BROADCAST; + dst->sin_addr = ip->ip_dst; + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = 1; + } else if (flags & IP_ROUTETOIF) { + if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL && + (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0))) == NULL) { + IPSTAT_INC(ips_noroute); + error = ENETUNREACH; + goto bad; + } + ifp = ia->ia_ifp; + ip->ip_ttl = 1; + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + imo != NULL && imo->imo_multicast_ifp != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + ifp = imo->imo_multicast_ifp; + IFP_TO_IA(ifp, ia); + isbroadcast = 0; /* fool gcc */ + } else { + /* + * We want to do any cloning requested by the link layer, + * as this is probably required in all cases for correct + * operation (as it is for ARP). + */ + if (rte == NULL) { +#ifdef RADIX_MPATH + rtalloc_mpath_fib(ro, + ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), + inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); +#else + in_rtalloc_ign(ro, 0, + inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m)); +#endif + rte = ro->ro_rt; + } + if (rte == NULL || + rte->rt_ifp == NULL || + !RT_LINK_IS_UP(rte->rt_ifp)) { +#ifdef IPSEC + /* + * There is no route for this packet, but it is + * possible that a matching SPD entry exists. + */ + no_route_but_check_spd = 1; + mtu = 0; /* Silence GCC warning. */ + goto sendit; +#endif + IPSTAT_INC(ips_noroute); + error = EHOSTUNREACH; + goto bad; + } + ia = ifatoia(rte->rt_ifa); + ifa_ref(&ia->ia_ifa); + ifp = rte->rt_ifp; + rte->rt_rmx.rmx_pksent++; + if (rte->rt_flags & RTF_GATEWAY) + dst = (struct sockaddr_in *)rte->rt_gateway; + if (rte->rt_flags & RTF_HOST) + isbroadcast = (rte->rt_flags & RTF_BROADCAST); + else + isbroadcast = in_broadcast(dst->sin_addr, ifp); + } + /* + * Calculate MTU. If we have a route that is up, use that, + * otherwise use the interface's MTU. 
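+ */
+
+	/*
+	 * Editorial sketch of the selection rule, illustration only:
+	 * prefer the route's learned MTU but never exceed the interface
+	 * MTU, since the cached value can go stale when if_mtu is
+	 * lowered afterwards.  select_mtu() is a hypothetical helper.
+	 */
+#if 0
+static int
+select_mtu(int route_mtu, int if_mtu)
+{
+	if (route_mtu > 0 && route_mtu < if_mtu)
+		return (route_mtu);	/* e.g. a learned path MTU */
+	return (if_mtu);
+}
+#endif
+
+	/*
+	 * Select it: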
+ */ + if (rte != NULL && (rte->rt_flags & (RTF_UP|RTF_HOST))) { + /* + * This case can happen if the user changed the MTU + * of an interface after enabling IP on it. Because + * most netifs don't keep track of routes pointing to + * them, there is no way for one to update all its + * routes when the MTU is changed. + */ + if (rte->rt_rmx.rmx_mtu > ifp->if_mtu) + rte->rt_rmx.rmx_mtu = ifp->if_mtu; + mtu = rte->rt_rmx.rmx_mtu; + } else { + mtu = ifp->if_mtu; + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + m->m_flags |= M_MCAST; + /* + * IP destination address is multicast. Make sure "dst" + * still points to the address in "ro". (It may have been + * changed to point to a gateway address, above.) + */ + dst = (struct sockaddr_in *)&ro->ro_dst; + /* + * See if the caller provided any multicast options + */ + if (imo != NULL) { + ip->ip_ttl = imo->imo_multicast_ttl; + if (imo->imo_multicast_vif != -1) + ip->ip_src.s_addr = + ip_mcast_src ? + ip_mcast_src(imo->imo_multicast_vif) : + INADDR_ANY; + } else + ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + /* + * Confirm that the outgoing interface supports multicast. + */ + if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + IPSTAT_INC(ips_noroute); + error = ENETUNREACH; + goto bad; + } + } + /* + * If source address not specified yet, use address + * of outgoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + /* Interface may have no addresses. */ + if (ia != NULL) + ip->ip_src = IA_SIN(ia)->sin_addr; + } + + if ((imo == NULL && in_mcast_loop) || + (imo && imo->imo_multicast_loop)) { + /* + * Loop back multicast datagram if not expressly + * forbidden to do so, even if we are not a member + * of the group; ip_input() will filter it later, + * thus deferring a hash lookup and mutex acquisition + * at the expense of a cheap copy using m_copym(). + */ + ip_mloopback(ifp, m, dst, hlen); + } else { + /* + * If we are acting as a multicast router, perform + * multicast forwarding as if the packet had just + * arrived on the interface to which we are about + * to send. The multicast forwarding function + * recursively calls this function, using the + * IP_FORWARDING flag to prevent infinite recursion. + * + * Multicasts that are looped back by ip_mloopback(), + * above, will be forwarded by the ip_input() routine, + * if necessary. + */ + if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { + /* + * If rsvp daemon is not running, do not + * set ip_moptions. This ensures that the packet + * is multicast and not just sent down one link + * as prescribed by rsvpd. + */ + if (!V_rsvp_on) + imo = NULL; + if (ip_mforward && + ip_mforward(ip, ifp, m, imo) != 0) { + m_freem(m); + goto done; + } + } + } + + /* + * Multicasts with a time-to-live of zero may be looped- + * back, above, but must not be transmitted on a network. + * Also, multicasts addressed to the loopback interface + * are not sent -- the above call to ip_mloopback() will + * loop back a copy. ip_input() will drop the copy if + * this host does not belong to the destination group on + * the loopback interface. + */ + if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { + m_freem(m); + goto done; + } + + goto sendit; + } + + /* + * If the source address is not specified yet, use the address + * of the outoing interface. + */ + if (ip->ip_src.s_addr == INADDR_ANY) { + /* Interface may have no addresses. 
*/ + if (ia != NULL) { + ip->ip_src = IA_SIN(ia)->sin_addr; + } + } + + /* + * Verify that we have any chance at all of being able to queue the + * packet or packet fragments, unless ALTQ is enabled on the given + * interface in which case packetdrop should be done by queueing. + */ +#ifdef ALTQ + if ((!ALTQ_IS_ENABLED(&ifp->if_snd)) && + ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= + ifp->if_snd.ifq_maxlen)) +#else + if ((ifp->if_snd.ifq_len + ip->ip_len / mtu + 1) >= + ifp->if_snd.ifq_maxlen) +#endif /* ALTQ */ + { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + ifp->if_snd.ifq_drops += (ip->ip_len / ifp->if_mtu + 1); + goto bad; + } + + /* + * Look for broadcast address and + * verify user is allowed to send + * such a packet. + */ + if (isbroadcast) { + if ((ifp->if_flags & IFF_BROADCAST) == 0) { + error = EADDRNOTAVAIL; + goto bad; + } + if ((flags & IP_ALLOWBROADCAST) == 0) { + error = EACCES; + goto bad; + } + /* don't allow broadcast messages to be fragmented */ + if (ip->ip_len > mtu) { + error = EMSGSIZE; + goto bad; + } + m->m_flags |= M_BCAST; + } else { + m->m_flags &= ~M_BCAST; + } + +sendit: +#ifdef IPSEC + switch(ip_ipsec_output(&m, inp, &flags, &error, &ifp)) { + case 1: + goto bad; + case -1: + goto done; + case 0: + default: + break; /* Continue with packet processing. */ + } + /* + * Check if there was a route for this packet; return error if not. + */ + if (no_route_but_check_spd) { + IPSTAT_INC(ips_noroute); + error = EHOSTUNREACH; + goto bad; + } + /* Update variables that are affected by ipsec4_output(). */ + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; +#endif /* IPSEC */ + + /* Jump over all PFIL processing if hooks are not active. */ + if (!PFIL_HOOKED(&V_inet_pfil_hook)) + goto passout; + + /* Run through list of hooks for output packets. */ + odst.s_addr = ip->ip_dst.s_addr; + error = pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_OUT, inp); + if (error != 0 || m == NULL) + goto done; + + ip = mtod(m, struct ip *); + + /* See if destination IP address was changed by packet filter. */ + if (odst.s_addr != ip->ip_dst.s_addr) { + m->m_flags |= M_SKIP_FIREWALL; + /* If destination is now ourself drop to ip_input(). */ + if (in_localip(ip->ip_dst)) { + m->m_flags |= M_FASTFWD_OURS; + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + error = netisr_queue(NETISR_IP, m); + goto done; + } else + goto again; /* Redo the routing table lookup. */ + } + +#ifdef IPFIREWALL_FORWARD + /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ + if (m->m_flags & M_FASTFWD_OURS) { + if (m->m_pkthdr.rcvif == NULL) + m->m_pkthdr.rcvif = V_loif; + if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xffff; + } +#ifdef SCTP + if (m->m_pkthdr.csum_flags & CSUM_SCTP) + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; +#endif + m->m_pkthdr.csum_flags |= + CSUM_IP_CHECKED | CSUM_IP_VALID; + + error = netisr_queue(NETISR_IP, m); + goto done; + } + /* Or forward to some other address? 
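+	 * If a firewall rule attached a PACKET_TAG_IPFORWARD tag, the
+	 * tag's sockaddr_in replaces the destination and the routing
+	 * lookup is redone from "again"; M_SKIP_FIREWALL keeps the
+	 * rewritten packet from being run through the hooks twice.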
*/ + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag) { + dst = (struct sockaddr_in *)&ro->ro_dst; + bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); + m->m_flags |= M_SKIP_FIREWALL; + m_tag_delete(m, fwd_tag); + goto again; + } +#endif /* IPFIREWALL_FORWARD */ + +passout: + /* 127/8 must not appear on wire - RFC1122. */ + if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || + (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + if ((ifp->if_flags & IFF_LOOPBACK) == 0) { + IPSTAT_INC(ips_badaddr); + error = EADDRNOTAVAIL; + goto bad; + } + } + + m->m_pkthdr.csum_flags |= CSUM_IP; + sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_hwassist; + if (sw_csum & CSUM_DELAY_DATA) { + in_delayed_cksum(m); + sw_csum &= ~CSUM_DELAY_DATA; + } +#ifdef SCTP + if (sw_csum & CSUM_SCTP) { + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + sw_csum &= ~CSUM_SCTP; + } +#endif + m->m_pkthdr.csum_flags &= ifp->if_hwassist; + + /* + * If small enough for interface, or the interface will take + * care of the fragmentation for us, we can just send directly. + */ + if (ip->ip_len <= mtu || + (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 || + ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) { + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) + ip->ip_sum = in_cksum(m, hlen); + + /* + * Record statistics for this interface address. + * With CSUM_TSO the byte/packet count will be slightly + * incorrect because we count the IP+TCP headers only + * once instead of for every generated packet. + */ + if (!(flags & IP_FORWARDING) && ia) { + if (m->m_pkthdr.csum_flags & CSUM_TSO) + ia->ia_ifa.if_opackets += + m->m_pkthdr.len / m->m_pkthdr.tso_segsz; + else + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } +#ifdef MBUF_STRESS_TEST + if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) + m = m_fragment(m, M_DONTWAIT, mbuf_frag_size); +#endif + /* + * Reset layer specific mbuf flags + * to avoid confusing lower layers. + */ + m->m_flags &= ~(M_PROTOFLAGS); + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro); + goto done; + } + + /* Balk when DF bit is set or the interface didn't support TSO. */ + if ((ip->ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { + error = EMSGSIZE; + IPSTAT_INC(ips_cantfrag); + goto bad; + } + + /* + * Too large for interface; fragment if possible. If successful, + * on return, m will point to a list of packets to be sent. + */ + error = ip_fragment(ip, &m, mtu, ifp->if_hwassist, sw_csum); + if (error) + goto bad; + for (; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = 0; + if (error == 0) { + /* Record statistics for this interface address. */ + if (ia != NULL) { + ia->ia_ifa.if_opackets++; + ia->ia_ifa.if_obytes += m->m_pkthdr.len; + } + /* + * Reset layer specific mbuf flags + * to avoid confusing upper layers. + */ + m->m_flags &= ~(M_PROTOFLAGS); + + error = (*ifp->if_output)(ifp, m, + (struct sockaddr *)dst, ro); + } else + m_freem(m); + } + + if (error == 0) + IPSTAT_INC(ips_fragmented); + +done: + if (ro == &iproute && ro->ro_rt && !nortfree) { + RTFREE(ro->ro_rt); + } + if (ia != NULL) + ifa_free(&ia->ia_ifa); + return (error); +bad: + m_freem(m); + goto done; +} + +/* + * Create a chain of fragments which fit the given mtu. m_frag points to the + * mbuf to be fragmented; on return it points to the chain with the fragments. + * Return 0 if no error. 
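+ */
+
+/*
+ * Editorial aside with the length math worked out, illustration
+ * only: the payload per fragment is (mtu - hlen) rounded down to a
+ * multiple of 8, because fragment offsets are carried in 8-byte
+ * units.  For mtu 1500 and a 20-byte header that is 1480 bytes, so a
+ * 4020-byte datagram leaves as three fragments at offsets 0, 185 and
+ * 370 (in 8-byte units).  frag_payload() is a hypothetical helper.
+ */
+#if 0
+static int
+frag_payload(int mtu, int hlen)
+{
+	return ((mtu - hlen) & ~7);	/* 1480 for mtu 1500, hlen 20 */
+}
+#endif
+
+/*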
If error, m_frag may contain a partially built + * chain of fragments that should be freed by the caller. + * + * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) + * sw_csum contains the delayed checksums flags (e.g., CSUM_DELAY_IP). + */ +int +ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, + u_long if_hwassist_flags, int sw_csum) +{ + int error = 0; + int hlen = ip->ip_hl << 2; + int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ + int off; + struct mbuf *m0 = *m_frag; /* the original packet */ + int firstlen; + struct mbuf **mnext; + int nfrags; + + if (ip->ip_off & IP_DF) { /* Fragmentation not allowed */ + IPSTAT_INC(ips_cantfrag); + return EMSGSIZE; + } + + /* + * Must be able to put at least 8 bytes per fragment. + */ + if (len < 8) + return EMSGSIZE; + + /* + * If the interface will not calculate checksums on + * fragmented packets, then do it here. + */ + if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA && + (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { + in_delayed_cksum(m0); + m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +#ifdef SCTP + if (m0->m_pkthdr.csum_flags & CSUM_SCTP && + (if_hwassist_flags & CSUM_IP_FRAGS) == 0) { + sctp_delayed_cksum(m0, hlen); + m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; + } +#endif + if (len > PAGE_SIZE) { + /* + * Fragment large datagrams such that each segment + * contains a multiple of PAGE_SIZE amount of data, + * plus headers. This enables a receiver to perform + * page-flipping zero-copy optimizations. + * + * XXX When does this help given that sender and receiver + * could have different page sizes, and also mtu could + * be less than the receiver's page size ? + */ + int newlen; + struct mbuf *m; + + for (m = m0, off = 0; m && (off+m->m_len) <= mtu; m = m->m_next) + off += m->m_len; + + /* + * firstlen (off - hlen) must be aligned on an + * 8-byte boundary + */ + if (off < hlen) + goto smart_frag_failure; + off = ((off - hlen) & ~7) + hlen; + newlen = (~PAGE_MASK) & mtu; + if ((newlen + sizeof (struct ip)) > mtu) { + /* we failed, go back the default */ +smart_frag_failure: + newlen = len; + off = hlen + len; + } + len = newlen; + + } else { + off = hlen + len; + } + + firstlen = off - hlen; + mnext = &m0->m_nextpkt; /* pointer to next packet */ + + /* + * Loop through length of segment after first fragment, + * make new header and copy data of each part and link onto chain. + * Here, m0 is the original packet, m is the fragment being created. + * The fragments are linked off the m_nextpkt of the original + * packet, which after processing serves as the first fragment. + */ + for (nfrags = 1; off < ip->ip_len; off += len, nfrags++) { + struct ip *mhip; /* ip header on the fragment */ + struct mbuf *m; + int mhlen = sizeof (struct ip); + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } + m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG; + /* + * In the first mbuf, leave room for the link header, then + * copy the original IP header including options. The payload + * goes into an additional mbuf chain returned by m_copym(). + */ + m->m_data += max_linkhdr; + mhip = mtod(m, struct ip *); + *mhip = *ip; + if (hlen > sizeof (struct ip)) { + mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); + mhip->ip_v = IPVERSION; + mhip->ip_hl = mhlen >> 2; + } + m->m_len = mhlen; + /* XXX do we need to add ip->ip_off below ? 
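+		 * (Editor's note: adding ip->ip_off appears necessary so
+		 * that re-fragmenting a datagram that already arrived as
+		 * a fragment comes out right, since fragment offsets are
+		 * absolute within the original datagram.)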
*/ + mhip->ip_off = ((off - hlen) >> 3) + ip->ip_off; + if (off + len >= ip->ip_len) { /* last fragment */ + len = ip->ip_len - off; + m->m_flags |= M_LASTFRAG; + } else + mhip->ip_off |= IP_MF; + mhip->ip_len = htons((u_short)(len + mhlen)); + m->m_next = m_copym(m0, off, len, M_DONTWAIT); + if (m->m_next == NULL) { /* copy failed */ + m_free(m); + error = ENOBUFS; /* ??? */ + IPSTAT_INC(ips_odropped); + goto done; + } + m->m_pkthdr.len = mhlen + len; + m->m_pkthdr.rcvif = NULL; +#ifdef MAC + mac_netinet_fragment(m0, m); +#endif + m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags; + mhip->ip_off = htons(mhip->ip_off); + mhip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) + mhip->ip_sum = in_cksum(m, mhlen); + *mnext = m; + mnext = &m->m_nextpkt; + } + IPSTAT_ADD(ips_ofragments, nfrags); + + /* set first marker for fragment chain */ + m0->m_flags |= M_FIRSTFRAG | M_FRAG; + m0->m_pkthdr.csum_data = nfrags; + + /* + * Update first fragment by trimming what's been copied out + * and updating header. + */ + m_adj(m0, hlen + firstlen - ip->ip_len); + m0->m_pkthdr.len = hlen + firstlen; + ip->ip_len = htons((u_short)m0->m_pkthdr.len); + ip->ip_off |= IP_MF; + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (sw_csum & CSUM_DELAY_IP) + ip->ip_sum = in_cksum(m0, hlen); + +done: + *m_frag = m0; + return error; +} + +void +in_delayed_cksum(struct mbuf *m) +{ + struct ip *ip; + u_short csum, offset; + + ip = mtod(m, struct ip *); + offset = ip->ip_hl << 2 ; + csum = in_cksum_skip(m, ip->ip_len, offset); + if (m->m_pkthdr.csum_flags & CSUM_UDP && csum == 0) + csum = 0xffff; + offset += m->m_pkthdr.csum_data; /* checksum offset */ + + if (offset + sizeof(u_short) > m->m_len) { + printf("delayed m_pullup, m->len: %d off: %d p: %d\n", + m->m_len, offset, ip->ip_p); + /* + * XXX + * this shouldn't happen, but if it does, the + * correct behavior may be to insert the checksum + * in the appropriate next mbuf in the chain. + */ + return; + } + *(u_short *)(m->m_data + offset) = csum; +} + +/* + * IP socket option processing. + */ +int +ip_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + error = optval = 0; + if (sopt->sopt_level != IPPROTO_IP) { + if ((sopt->sopt_level == SOL_SOCKET) && + (sopt->sopt_name == SO_SETFIB)) { + inp->inp_inc.inc_fibnum = so->so_fibnum; + return (0); + } + return (EINVAL); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_OPTIONS: +#ifdef notyet + case IP_RETOPTS: +#endif + { + struct mbuf *m; + if (sopt->sopt_valsize > MLEN) { + error = EMSGSIZE; + break; + } + MGET(m, sopt->sopt_td ? 
M_WAIT : M_DONTWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + break; + } + m->m_len = sopt->sopt_valsize; + error = sooptcopyin(sopt, mtod(m, char *), m->m_len, + m->m_len); + if (error) { + m_free(m); + break; + } + INP_WLOCK(inp); + error = ip_pcbopts(inp, sopt->sopt_name, m); + INP_WUNLOCK(inp); + return (error); + } + + case IP_BINDANY: + if (sopt->sopt_td != NULL) { + error = priv_check(sopt->sopt_td, + PRIV_NETINET_BINDANY); + if (error) + break; + } + /* FALLTHROUGH */ + case IP_TOS: + case IP_TTL: + case IP_MINTTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVTTL: + case IP_RECVIF: + case IP_FAITH: + case IP_ONESBCAST: + case IP_DONTFRAG: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + switch (sopt->sopt_name) { + case IP_TOS: + inp->inp_ip_tos = optval; + break; + + case IP_TTL: + inp->inp_ip_ttl = optval; + break; + + case IP_MINTTL: + if (optval >= 0 && optval <= MAXTTL) + inp->inp_ip_minttl = optval; + else + error = EINVAL; + break; + +#define OPTSET(bit) do { \ + INP_WLOCK(inp); \ + if (optval) \ + inp->inp_flags |= bit; \ + else \ + inp->inp_flags &= ~bit; \ + INP_WUNLOCK(inp); \ +} while (0) + + case IP_RECVOPTS: + OPTSET(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + OPTSET(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + OPTSET(INP_RECVDSTADDR); + break; + + case IP_RECVTTL: + OPTSET(INP_RECVTTL); + break; + + case IP_RECVIF: + OPTSET(INP_RECVIF); + break; + + case IP_FAITH: + OPTSET(INP_FAITH); + break; + + case IP_ONESBCAST: + OPTSET(INP_ONESBCAST); + break; + case IP_DONTFRAG: + OPTSET(INP_DONTFRAG); + break; + case IP_BINDANY: + OPTSET(INP_BINDANY); + break; + } + break; +#undef OPTSET + + /* + * Multicast socket options are processed by the in_mcast + * module. + */ + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); + break; + + case IP_PORTRANGE: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + + INP_WLOCK(inp); + switch (optval) { + case IP_PORTRANGE_DEFAULT: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags &= ~(INP_HIGHPORT); + break; + + case IP_PORTRANGE_HIGH: + inp->inp_flags &= ~(INP_LOWPORT); + inp->inp_flags |= INP_HIGHPORT; + break; + + case IP_PORTRANGE_LOW: + inp->inp_flags &= ~(INP_HIGHPORT); + inp->inp_flags |= INP_LOWPORT; + break; + + default: + error = EINVAL; + break; + } + INP_WUNLOCK(inp); + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + caddr_t req; + struct mbuf *m; + + if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ + break; + if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ + break; + req = mtod(m, caddr_t); + error = ipsec_set_policy(inp, sopt->sopt_name, req, + m->m_len, (sopt->sopt_td != NULL) ? 
+ sopt->sopt_td->td_ucred : NULL); + m_freem(m); + break; + } +#endif /* IPSEC */ + + default: + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_OPTIONS: + case IP_RETOPTS: + if (inp->inp_options) + error = sooptcopyout(sopt, + mtod(inp->inp_options, + char *), + inp->inp_options->m_len); + else + sopt->sopt_valsize = 0; + break; + + case IP_TOS: + case IP_TTL: + case IP_MINTTL: + case IP_RECVOPTS: + case IP_RECVRETOPTS: + case IP_RECVDSTADDR: + case IP_RECVTTL: + case IP_RECVIF: + case IP_PORTRANGE: + case IP_FAITH: + case IP_ONESBCAST: + case IP_DONTFRAG: + case IP_BINDANY: + switch (sopt->sopt_name) { + + case IP_TOS: + optval = inp->inp_ip_tos; + break; + + case IP_TTL: + optval = inp->inp_ip_ttl; + break; + + case IP_MINTTL: + optval = inp->inp_ip_minttl; + break; + +#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) + + case IP_RECVOPTS: + optval = OPTBIT(INP_RECVOPTS); + break; + + case IP_RECVRETOPTS: + optval = OPTBIT(INP_RECVRETOPTS); + break; + + case IP_RECVDSTADDR: + optval = OPTBIT(INP_RECVDSTADDR); + break; + + case IP_RECVTTL: + optval = OPTBIT(INP_RECVTTL); + break; + + case IP_RECVIF: + optval = OPTBIT(INP_RECVIF); + break; + + case IP_PORTRANGE: + if (inp->inp_flags & INP_HIGHPORT) + optval = IP_PORTRANGE_HIGH; + else if (inp->inp_flags & INP_LOWPORT) + optval = IP_PORTRANGE_LOW; + else + optval = 0; + break; + + case IP_FAITH: + optval = OPTBIT(INP_FAITH); + break; + + case IP_ONESBCAST: + optval = OPTBIT(INP_ONESBCAST); + break; + case IP_DONTFRAG: + optval = OPTBIT(INP_DONTFRAG); + break; + case IP_BINDANY: + optval = OPTBIT(INP_BINDANY); + break; + } + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + /* + * Multicast socket options are processed by the in_mcast + * module. + */ + case IP_MULTICAST_IF: + case IP_MULTICAST_VIF: + case IP_MULTICAST_TTL: + case IP_MULTICAST_LOOP: + case IP_MSFILTER: + error = inp_getmoptions(inp, sopt); + break; + +#ifdef IPSEC + case IP_IPSEC_POLICY: + { + struct mbuf *m = NULL; + caddr_t req = NULL; + size_t len = 0; + + if (m != 0) { + req = mtod(m, caddr_t); + len = m->m_len; + } + error = ipsec_get_policy(sotoinpcb(so), req, len, &m); + if (error == 0) + error = soopt_mcopyout(sopt, m); /* XXX */ + if (error == 0) + m_freem(m); + break; + } +#endif /* IPSEC */ + + default: + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +/* + * Routine called from ip_output() to loop back a copy of an IP multicast + * packet to the input queue of a specified interface. Note that this + * calls the output routine of the loopback "driver", but with an interface + * pointer that might NOT be a loopback interface -- evil, but easier than + * replicating that code here. + */ +static void +ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, + int hlen) +{ + register struct ip *ip; + struct mbuf *copym; + + /* + * Make a deep copy of the packet because we're going to + * modify the pack in order to generate checksums. + */ + copym = m_dup(m, M_DONTWAIT); + if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen)) + copym = m_pullup(copym, hlen); + if (copym != NULL) { + /* If needed, compute the checksum and mark it as valid. 
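+ * (The 0xffff stored in csum_data below is the value the transport input paths expect for an already-verified packet: they xor csum_data with 0xffff, so a stored 0xffff reads back as zero, i.e. a valid checksum. Sketch of intent only.)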
*/ + if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + in_delayed_cksum(copym); + copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + copym->m_pkthdr.csum_flags |= + CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + copym->m_pkthdr.csum_data = 0xffff; + } + /* + * We don't bother to fragment if the IP length is greater + * than the interface's MTU. Can this possibly matter? + */ + ip = mtod(copym, struct ip *); + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + ip->ip_sum = in_cksum(copym, hlen); +#if 1 /* XXX */ + if (dst->sin_family != AF_INET) { + printf("ip_mloopback: bad address family %d\n", + dst->sin_family); + dst->sin_family = AF_INET; + } +#endif + if_simloop(ifp, copym, dst->sin_family, 0); + } +} diff --git a/freebsd/sys/netinet/ip_var.h b/freebsd/sys/netinet/ip_var.h new file mode 100644 index 00000000..2902174d --- /dev/null +++ b/freebsd/sys/netinet/ip_var.h @@ -0,0 +1,315 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ip_var.h 8.2 (Berkeley) 1/9/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_IP_VAR_HH_ +#define _NETINET_IP_VAR_HH_ + +#include + +/* + * Overlay for ip header used by other protocols (tcp, udp). + */ +struct ipovly { + u_char ih_x1[9]; /* (unused) */ + u_char ih_pr; /* protocol */ + u_short ih_len; /* protocol length */ + struct in_addr ih_src; /* source internet address */ + struct in_addr ih_dst; /* destination internet address */ +}; + +#ifdef _KERNEL +/* + * Ip reassembly queue structure. Each fragment + * being reassembled is attached to one of these structures. + * They are timed out after ipq_ttl drops to 0, and may also + * be reclaimed if memory becomes tight. 
+ */ +struct ipq { + TAILQ_ENTRY(ipq) ipq_list; /* to other reass headers */ + u_char ipq_ttl; /* time for reass q to live */ + u_char ipq_p; /* protocol of this fragment */ + u_short ipq_id; /* sequence id for reassembly */ + struct mbuf *ipq_frags; /* to ip headers of fragments */ + struct in_addr ipq_src,ipq_dst; + u_char ipq_nfrags; /* # frags in this packet */ + struct label *ipq_label; /* MAC label */ +}; +#endif /* _KERNEL */ + +/* + * Structure stored in mbuf in inpcb.ip_options + * and passed to ip_output when ip options are in use. + * The actual length of the options (including ipopt_dst) + * is in m_len. + */ +#define MAX_IPOPTLEN 40 + +struct ipoption { + struct in_addr ipopt_dst; /* first-hop dst if source routed */ + char ipopt_list[MAX_IPOPTLEN]; /* options proper */ +}; + +/* + * Structure attached to inpcb.ip_moptions and + * passed to ip_output when IP multicast options are in use. + * This structure is lazy-allocated. + */ +struct ip_moptions { + struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ + struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ + u_long imo_multicast_vif; /* vif num outgoing multicasts */ + u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ + u_char imo_multicast_loop; /* 1 => hear sends if a member */ + u_short imo_num_memberships; /* no. memberships this socket */ + u_short imo_max_memberships; /* max memberships this socket */ + struct in_multi **imo_membership; /* group memberships */ + struct in_mfilter *imo_mfilters; /* source filters */ +}; + +struct ipstat { + u_long ips_total; /* total packets received */ + u_long ips_badsum; /* checksum bad */ + u_long ips_tooshort; /* packet too short */ + u_long ips_toosmall; /* not enough data */ + u_long ips_badhlen; /* ip header length < data size */ + u_long ips_badlen; /* ip length < ip header length */ + u_long ips_fragments; /* fragments received */ + u_long ips_fragdropped; /* frags dropped (dups, out of space) */ + u_long ips_fragtimeout; /* fragments timed out */ + u_long ips_forward; /* packets forwarded */ + u_long ips_fastforward; /* packets fast forwarded */ + u_long ips_cantforward; /* packets rcvd for unreachable dest */ + u_long ips_redirectsent; /* packets forwarded on same net */ + u_long ips_noproto; /* unknown or unsupported protocol */ + u_long ips_delivered; /* datagrams delivered to upper level*/ + u_long ips_localout; /* total ip packets generated here */ + u_long ips_odropped; /* lost packets due to nobufs, etc. */ + u_long ips_reassembled; /* total packets reassembled ok */ + u_long ips_fragmented; /* datagrams successfully fragmented */ + u_long ips_ofragments; /* output fragments created */ + u_long ips_cantfrag; /* don't fragment flag was set, etc. */ + u_long ips_badoptions; /* error in option processing */ + u_long ips_noroute; /* packets discarded due to no route */ + u_long ips_badvers; /* ip version != 4 */ + u_long ips_rawout; /* total raw ip packets generated */ + u_long ips_toolong; /* ip length > max ip packet size */ + u_long ips_notmember; /* multicasts for unregistered grps */ + u_long ips_nogif; /* no match gif found */ + u_long ips_badaddr; /* invalid address on header */ +}; + +#ifdef _KERNEL + +#include + +/* + * In-kernel consumers can use these accessor macros directly to update + * stats. 
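+ * For example, IPSTAT_INC(ips_total) on the input path expands to + * V_ipstat.ips_total += 1, picking up the per-vnet instance of the + * counters when VIMAGE is in effect.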
+ */ +#define IPSTAT_ADD(name, val) V_ipstat.name += (val) +#define IPSTAT_SUB(name, val) V_ipstat.name -= (val) +#define IPSTAT_INC(name) IPSTAT_ADD(name, 1) +#define IPSTAT_DEC(name) IPSTAT_SUB(name, 1) + +/* + * Kernel module consumers must use this accessor macro. + */ +void kmod_ipstat_inc(int statnum); +#define KMOD_IPSTAT_INC(name) \ + kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(u_long)) +void kmod_ipstat_dec(int statnum); +#define KMOD_IPSTAT_DEC(name) \ + kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(u_long)) + +/* flags passed to ip_output as last parameter */ +#define IP_FORWARDING 0x1 /* most of ip header exists */ +#define IP_RAWOUTPUT 0x2 /* raw ip header exists */ +#define IP_SENDONES 0x4 /* send all-ones broadcast */ +#define IP_SENDTOIF 0x8 /* send on specific ifnet */ +#define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ +#define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ + +/* + * mbuf flag used by ip_fastfwd + */ +#define M_FASTFWD_OURS M_PROTO1 /* changed dst to local */ + +#ifdef __NO_STRICT_ALIGNMENT +#define IP_HDR_ALIGNED_P(ip) 1 +#else +#define IP_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) +#endif + +struct ip; +struct inpcb; +struct route; +struct sockopt; + +VNET_DECLARE(struct ipstat, ipstat); +VNET_DECLARE(u_short, ip_id); /* ip packet ctr, for ids */ +VNET_DECLARE(int, ip_defttl); /* default IP ttl */ +VNET_DECLARE(int, ipforwarding); /* ip forwarding */ +#ifdef IPSTEALTH +VNET_DECLARE(int, ipstealth); /* stealth forwarding */ +#endif +extern u_char ip_protox[]; +VNET_DECLARE(struct socket *, ip_rsvpd); /* reservation protocol daemon*/ +VNET_DECLARE(struct socket *, ip_mrouter); /* multicast routing daemon */ +extern int (*legal_vif_num)(int); +extern u_long (*ip_mcast_src)(int); +VNET_DECLARE(int, rsvp_on); +extern struct pr_usrreqs rip_usrreqs; + +#define V_ipstat VNET(ipstat) +#define V_ip_id VNET(ip_id) +#define V_ip_defttl VNET(ip_defttl) +#define V_ipforwarding VNET(ipforwarding) +#ifdef IPSTEALTH +#define V_ipstealth VNET(ipstealth) +#endif +#define V_ip_rsvpd VNET(ip_rsvpd) +#define V_ip_mrouter VNET(ip_mrouter) +#define V_rsvp_on VNET(rsvp_on) + +void inp_freemoptions(struct ip_moptions *); +int inp_getmoptions(struct inpcb *, struct sockopt *); +int inp_setmoptions(struct inpcb *, struct sockopt *); + +int ip_ctloutput(struct socket *, struct sockopt *sopt); +void ip_drain(void); +void ip_fini(void *xtp); +int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, + u_long if_hwassist_flags, int sw_csum); +void ip_forward(struct mbuf *m, int srcrt); +void ip_init(void); +#ifdef VIMAGE +void ip_destroy(void); +#endif +extern int + (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +int ip_output(struct mbuf *, + struct mbuf *, struct route *, int, struct ip_moptions *, + struct inpcb *); +int ipproto_register(short); +int ipproto_unregister(short); +struct mbuf * + ip_reass(struct mbuf *); +struct in_ifaddr * + ip_rtaddr(struct in_addr, u_int fibnum); +void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, + struct mbuf *); +void ip_slowtimo(void); +u_int16_t ip_randomid(void); +int rip_ctloutput(struct socket *, struct sockopt *); +void rip_ctlinput(int, struct sockaddr *, void *); +void rip_init(void); +#ifdef VIMAGE +void rip_destroy(void); +#endif +void rip_input(struct mbuf *, int); +int rip_output(struct mbuf *, struct socket *, u_long); +void ipip_input(struct mbuf *, int); +void rsvp_input(struct mbuf *, int); +int 
ip_rsvp_init(struct socket *); +int ip_rsvp_done(void); +extern int (*ip_rsvp_vif)(struct socket *, struct sockopt *); +extern void (*ip_rsvp_force_done)(struct socket *); +extern void (*rsvp_input_p)(struct mbuf *m, int off); + +VNET_DECLARE(struct pfil_head, inet_pfil_hook); /* packet filter hooks */ +#define V_inet_pfil_hook VNET(inet_pfil_hook) + +void in_delayed_cksum(struct mbuf *m); + +/* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */ +/* + * Reference to an ipfw or packet filter rule that can be carried + * outside critical sections. + * A rule is identified by rulenum:rule_id which is ordered. + * In version chain_id the rule can be found in slot 'slot', so + * we don't need a lookup if chain_id == chain->id. + * + * On exit from the firewall this structure refers to the rule after + * the matching one (slot points to the new rule; rulenum:rule_id-1 + * is the matching rule), and additional info (e.g. info often contains + * the insn argument or tablearg in the low 16 bits, in host format). + * On entry, the structure is valid if slot>0, and refers to the starting + * rules. 'info' contains the reason for reinject, e.g. divert port, + * divert direction, and so on. + */ +struct ipfw_rule_ref { + uint32_t slot; /* slot for matching rule */ + uint32_t rulenum; /* matching rule number */ + uint32_t rule_id; /* matching rule id */ + uint32_t chain_id; /* ruleset id */ + uint32_t info; /* see below */ +}; + +enum { + IPFW_INFO_MASK = 0x0000ffff, + IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ + IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ + IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ + IPFW_IS_MASK = 0x30000000, /* which source ? */ + IPFW_IS_DIVERT = 0x20000000, + IPFW_IS_DUMMYNET =0x10000000, + IPFW_IS_PIPE = 0x08000000, /* pip1=1, queue = 0 */ +}; +#define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ +#define MTAG_IPFW_RULE 1262273568 /* rule reference */ + +struct ip_fw_args; +typedef int (*ip_fw_chk_ptr_t)(struct ip_fw_args *args); +typedef int (*ip_fw_ctl_ptr_t)(struct sockopt *); +VNET_DECLARE(ip_fw_chk_ptr_t, ip_fw_chk_ptr); +VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr); +#define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) +#define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) + +/* Divert hooks. */ +extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); +/* ng_ipfw hooks -- XXX make it the same as divert and dummynet */ +extern int (*ng_ipfw_input_p)(struct mbuf **, int, + struct ip_fw_args *, int); + +extern int (*ip_dn_ctl_ptr)(struct sockopt *); +extern int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); + +VNET_DECLARE(int, ip_do_randomid); +#define V_ip_do_randomid VNET(ip_do_randomid) +#define ip_newid() ((V_ip_do_randomid != 0) ? ip_randomid() : \ + htons(V_ip_id++)) + +#endif /* _KERNEL */ + +#endif /* !_NETINET_IP_VAR_HH_ */ diff --git a/freebsd/sys/netinet/ipfw/dn_heap.c b/freebsd/sys/netinet/ipfw/dn_heap.c new file mode 100644 index 00000000..1e6133bc --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_heap.c @@ -0,0 +1,552 @@ +#include + +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, used in dummynet + * + * $FreeBSD$ + */ + +#include +#include +#ifdef _KERNEL +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#ifndef log +#define log(x, arg...) +#endif + +#else /* !_KERNEL */ + +#include +#include +#include +#include + +#include "dn_heap.h" +#define log(x, arg...) fprintf(stderr, ## arg) +#define panic(x...) fprintf(stderr, ## x), exit(1) +#define MALLOC_DEFINE(a, b, c) +static void *my_malloc(int s) { return malloc(s); } +static void my_free(void *p) { free(p); } +#define malloc(s, t, w) my_malloc(s) +#define free(p, t) my_free(p) +#endif /* !_KERNEL */ + +MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); + +/* + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + * + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 16 entries. + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( (x)+(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 15 + +static int +heap_resize(struct dn_heap *h, unsigned int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) /* have enough room */ + return 0; +#if 1 /* round to the next power of 2 */ + new_size |= new_size >> 1; + new_size |= new_size >> 2; + new_size |= new_size >> 4; + new_size |= new_size >> 8; + new_size |= new_size >> 16; +#else + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; +#endif + p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); + if (p == NULL) { + printf("--- %s, resize %d failed\n", __func__, new_size ); + return 1; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_DN_HEAP); + } + h->p = p; + h->size = new_size; + return 0; +} + +int +heap_init(struct dn_heap *h, int size, int ofs) +{ + if (heap_resize(h, size)) + return 1; + h->elements = 0; + h->ofs = ofs; + return 0; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + * + * If ofs > 0 the position (index, int) of the element in the heap is + * also stored in the element itself at the given offset in bytes. 
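+ * + * A typical hookup, with a hypothetical object type (names not from + * this file): + * struct my_ev { uint64_t when; int32_t heap_pos; }; + * heap_init(&h, 16, offsetof(struct my_ev, heap_pos)); + * heap_insert(&h, ev->when, ev); + * heap_extract(&h, ev); removes ev from the middle via heap_pos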
+ */ +#define SET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ + } while (0) +/* + * RESET_OFFSET is used for sanity checks. It sets ofs + * to an invalid value. + */ +#define RESET_OFFSET(h, i) do { \ + if (h->ofs > 0) \ + *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ + } while (0) + +int +heap_insert(struct dn_heap *h, uint64_t key1, void *p) +{ + int son = h->elements; + + //log("%s key %llu p %p\n", __FUNCTION__, key1, p); + if (p == NULL) { /* data already there, set starting point */ + son = key1; + } else { /* insert new element at the end, possibly resize */ + son = h->elements; + if (son == h->size) /* need resize... */ + // XXX expand by 16 or so + if (heap_resize(h, h->elements+16) ) + return 1; /* failure... */ + h->p[son].object = p; + h->p[son].key = key1; + h->elements++; + } + /* make sure that son >= father along the path */ + while (son > 0) { + int father = HEAP_FATHER(son); + struct dn_heap_entry tmp; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break; /* found right position */ + /* son smaller than father, swap and repeat */ + HEAP_SWAP(h->p[son], h->p[father], tmp); + SET_OFFSET(h, son); + son = father; + } + SET_OFFSET(h, son); + return 0; +} + +/* + * remove top element from heap, or obj if obj != NULL + */ +void +heap_extract(struct dn_heap *h, void *obj) +{ + int child, father, max = h->elements - 1; + + if (max < 0) { + printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); + return; + } + if (obj == NULL) + father = 0; /* default: move up smallest child */ + else { /* extract specific element, index is at offset */ + if (h->ofs <= 0) + panic("%s: extract from middle not set on %p\n", + __FUNCTION__, h); + father = *((int *)((char *)obj + h->ofs)); + if (father < 0 || father >= h->elements) { + panic("%s: father %d out of bound 0..%d\n", + __FUNCTION__, father, h->elements); + } + } + /* + * below, father is the index of the empty element, which + * we replace at each step with the smallest child until we + * reach the bottom level. + */ + // XXX why removing RESET_OFFSET increases runtime by 10% ? + RESET_OFFSET(h, father); + while ( (child = HEAP_LEFT(father)) <= max ) { + if (child != max && + DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child++; /* take right child, otherwise left */ + h->p[father] = h->p[child]; + SET_OFFSET(h, father); + father = child; + } + h->elements--; + if (father != max) { + /* + * Fill hole with last entry and bubble up, + * reusing the insert code + */ + h->p[father] = h->p[max]; + heap_insert(h, father, NULL); + } +} + +#if 0 +/* + * change object position and update references + * XXX this one is never used! 
+ */ +static void +heap_move(struct dn_heap *h, uint64_t new_key, void *object) +{ + int temp, i, max = h->elements-1; + struct dn_heap_entry *p, buf; + + if (h->ofs <= 0) + panic("cannot move items on this heap"); + p = h->p; /* shortcut */ + + i = *((int *)((char *)object + h->ofs)); + if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ + p[i].key = new_key; + for (; i>0 && + DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); + i = temp ) { /* bubble up */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } + } else { /* must move down */ + p[i].key = new_key; + while ( (temp = HEAP_LEFT(i)) <= max ) { + /* found left child */ + if (temp != max && + DN_KEY_LT(p[temp+1].key, p[temp].key)) + temp++; /* select child with min key */ + if (DN_KEY_LT(p[temp].key, new_key)) { + /* go down */ + HEAP_SWAP(p[i], p[temp], buf); + SET_OFFSET(h, i); + } else + break; + i = temp; + } + } + SET_OFFSET(h, i); +} +#endif /* heap_move, unused */ + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) +{ + int i; + + for (i = 0; i < h->elements; i++ ) + heap_insert(h, i , NULL); +} + +int +heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), + uintptr_t arg) +{ + int i, ret, found; + + for (i = found = 0 ; i < h->elements ;) { + ret = fn(h->p[i].object, arg); + if (ret & HEAP_SCAN_DEL) { + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + if (ret & HEAP_SCAN_END) + break; + } + if (found) + heapify(h); + return found; +} + +/* + * clean up the heap and free the data structure + */ +void +heap_free(struct dn_heap *h) +{ + if (h->size >0 ) + free(h->p, M_DN_HEAP); + bzero(h, sizeof(*h) ); +} + +/* + * hash table support. + */ + +struct dn_ht { + int buckets; /* how many buckets, really buckets - 1*/ + int entries; /* how many entries */ + int ofs; /* offset of link field */ + uint32_t (*hash)(uintptr_t, int, void *arg); + int (*match)(void *_el, uintptr_t key, int, void *); + void *(*newh)(uintptr_t, int, void *); + void **ht; /* bucket heads */ +}; +/* + * Initialize, allocating bucket pointers inline. + * Recycle previous record if possible. + * If the 'newh' function is not supplied, we assume that the + * key passed to ht_find is the same object to be stored in. + */ +struct dn_ht * +dn_ht_init(struct dn_ht *ht, int buckets, int ofs, + uint32_t (*h)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)) +{ + int l; + + /* + * Notes about rounding bucket size to a power of two. + * Given the original bucket size, we compute the nearest lower and + * higher power of two, minus 1 (respectively b_min and b_max) because + * this value will be used to do an AND with the index returned + * by the hash function. + * To choose between these two values, the original bucket size is + * compared with b_min. If the original size is greater than 4/3 b_min, + * we round the bucket size to b_max, else to b_min. + * This ratio tries to round to the nearest power of two, favoring + * the larger size when the difference between the two powers is + * relatively big. + * Rounding the bucket size to a power of two avoids a modulo + * operation when calculating the correct bucket. + * The ht->buckets variable stores the bucket size - 1, so the + * correct bucket is selected by ANDing the index returned by the + * hash function with ht->buckets instead of taking a modulo. 
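+ * For example, a request for 100 buckets smears to b_max = 127 and + * b_min = 63; since 63 * 4/3 = 84 < 100, the size is rounded up to + * 127. A request for 70 rounds down to 63, because 84 >= 70.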
+ */ + int b_min; /* min buckets */ + int b_max; /* max buckets */ + int b_ori; /* original buckets */ + + if (h == NULL || match == NULL) { + printf("--- missing hash or match function"); + return NULL; + } + if (buckets < 1 || buckets > 65536) + return NULL; + + b_ori = buckets; + /* calculate next power of 2, - 1*/ + buckets |= buckets >> 1; + buckets |= buckets >> 2; + buckets |= buckets >> 4; + buckets |= buckets >> 8; + buckets |= buckets >> 16; + + b_max = buckets; /* Next power */ + b_min = buckets >> 1; /* Previous power */ + + /* Calculate the 'nearest' bucket size */ + if (b_min * 4000 / 3000 < b_ori) + buckets = b_max; + else + buckets = b_min; + + if (ht) { /* see if we can reuse */ + if (buckets <= ht->buckets) { + ht->buckets = buckets; + } else { + /* free pointers if not allocated inline */ + if (ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + ht = NULL; + } + } + if (ht == NULL) { + /* Allocate buckets + 1 entries because buckets is used to + * do the AND with the index returned by the hash function + */ + l = sizeof(*ht) + (buckets + 1) * sizeof(void **); + ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); + } + if (ht) { + ht->ht = (void **)(ht + 1); + ht->buckets = buckets; + ht->ofs = ofs; + ht->hash = h; + ht->match = match; + ht->newh = newh; + } + return ht; +} + +/* dummy callback for dn_ht_free to unlink all */ +static int +do_del(void *obj, void *arg) +{ + return DNHT_SCAN_DEL; +} + +void +dn_ht_free(struct dn_ht *ht, int flags) +{ + if (ht == NULL) + return; + if (flags & DNHT_REMOVE) { + (void)dn_ht_scan(ht, do_del, NULL); + } else { + if (ht->ht && ht->ht != (void *)(ht + 1)) + free(ht->ht, M_DN_HEAP); + free(ht, M_DN_HEAP); + } +} + +int +dn_ht_entries(struct dn_ht *ht) +{ + return ht ? ht->entries : 0; +} + +/* lookup and optionally create or delete element */ +void * +dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) +{ + int i; + void **pp, *p; + + if (ht == NULL) /* easy on an empty hash */ + return NULL; + i = (ht->buckets == 1) ? 0 : + (ht->hash(key, flags, arg) & ht->buckets); + + for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { + if (flags & DNHT_MATCH_PTR) { + if (key == (uintptr_t)p) + break; + } else if (ht->match(p, key, flags, arg)) /* found match */ + break; + } + if (p) { + if (flags & DNHT_REMOVE) { + /* link in the next element */ + *pp = *(void **)((char *)p + ht->ofs); + *(void **)((char *)p + ht->ofs) = NULL; + ht->entries--; + } + } else if (flags & DNHT_INSERT) { + // printf("%s before calling new, bucket %d ofs %d\n", + // __FUNCTION__, i, ht->ofs); + p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; + // printf("%s newh returns %p\n", __FUNCTION__, p); + if (p) { + ht->entries++; + *(void **)((char *)p + ht->ofs) = ht->ht[i]; + ht->ht[i] = p; + } + } + return p; +} + +/* + * do a scan with the option to delete the object. Extract next before + * running the callback because the element may be destroyed there. 
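+ * A deletion callback is then as simple as (hypothetical element + * type, sketch only): + * static int expired(void *obj, void *arg) + * { return ((struct my_el *)obj)->dead ? DNHT_SCAN_DEL : 0; } + * invoked as dn_ht_scan(ht, expired, NULL);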
+ */ +int +dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + for (i = 0; i <= ht->buckets; i++) { + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + } + return found; +} + +/* + * Similar to dn_ht_scan(), except that the scan is performed only + * in the bucket 'bucket'. A correct bucket number is written back + * through 'bucket' if the original is invalid. + */ +int +dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), + void *arg) +{ + int i, ret, found = 0; + void **curp, *cur, *next; + + if (ht == NULL || fn == NULL) + return 0; + if (*bucket > ht->buckets) + *bucket = 0; + i = *bucket; + + curp = &ht->ht[i]; + while ( (cur = *curp) != NULL) { + next = *(void **)((char *)cur + ht->ofs); + ret = fn(cur, arg); + if (ret & DNHT_SCAN_DEL) { + found++; + ht->entries--; + *curp = next; + } else { + curp = (void **)((char *)cur + ht->ofs); + } + if (ret & DNHT_SCAN_END) + return found; + } + return found; +} + diff --git a/freebsd/sys/netinet/ipfw/dn_heap.h b/freebsd/sys/netinet/ipfw/dn_heap.h new file mode 100644 index 00000000..c95473ad --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_heap.h @@ -0,0 +1,191 @@ +/*- + * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Binary heap and hash tables, header file + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_HEAP_H +#define _IP_DN_HEAP_H + +#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) + +/* + * This module implements a binary heap supporting random extraction. + * + * A heap entry contains a uint64_t key and a pointer to an object. + * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b'. + * + * The heap is a struct dn_heap plus a dynamically allocated + * array of dn_heap_entry entries. 'size' represents the size of + * the array, 'elements' counts entries in use. 
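(DN_KEY_LT() is written as a signed 64-bit difference, so the + * ordering survives key wraparound: with a = 2^64 - 1 and b = 1, + * (int64_t)(a - b) is -2 and 'a' still sorts first.) 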
The topmost + element has the smallest key. + The heap supports ordered insert, and extract from the top. + To extract an object from the middle of the heap, the object + must reserve an 'int32_t' to store the position of the object + in the heap itself, and the location of this field must be + passed as an argument to heap_init() -- use -1 if the feature + is not used. + */ +struct dn_heap_entry { + uint64_t key; /* sorting key, smallest comes first */ + void *object; /* object pointer */ +}; + +struct dn_heap { + int size; /* the size of the array */ + int elements; /* elements in use */ + int ofs; /* offset in the object of heap index */ + struct dn_heap_entry *p; /* array of "size" entries */ +}; + +enum { + HEAP_SCAN_DEL = 1, + HEAP_SCAN_END = 2, +}; + +/* + * heap_init() reinitializes the heap setting the size and the offset + * of the index for random extraction (use -1 if not used). + * The 'elements' counter is set to 0. + * + * SET_HEAP_OFS() indicates where the index for random extractions + * from the heap is stored in the object. + * + * heap_free() frees the memory associated with a heap. + * + * heap_insert() adds a key-pointer pair to the heap. + * + * HEAP_TOP() returns a pointer to the top element of the heap, + * but makes no checks on its existence (XXX should we change ?) + * + * heap_extract() removes the entry at the top, returning the pointer. + * (the key should have been read before). + * + * heap_scan() invokes a callback on each entry of the heap. + * The callback can return a combination of HEAP_SCAN_DEL and + * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must + * be removed, and HEAP_SCAN_END means to terminate the scan. + * heap_scan() returns the number of elements removed. + * Because the order is not guaranteed, we should use heap_scan() + * only as a last resort mechanism. + */ +#define HEAP_TOP(h) ((h)->p) +#define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) +int heap_init(struct dn_heap *h, int size, int ofs); +int heap_insert(struct dn_heap *h, uint64_t key1, void *p); +void heap_extract(struct dn_heap *h, void *obj); +void heap_free(struct dn_heap *h); +int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); + +/*------------------------------------------------------ + * This module implements a generic hash table with support for + * running callbacks on the entire table. To avoid allocating + * memory during hash table operations, objects must reserve + * space for a link field. XXX if the heap is moderately full, + * an SLIST suffices, and we can tolerate the cost of a hash + * computation on each removal. + * + * dn_ht_init() initializes the table, setting the number of + * buckets, the offset of the link field, and the main callbacks. + * Callbacks are: + * + * hash(key, flags, arg) called to return a bucket index. + * match(obj, key, flags, arg) called to determine if key + * matches the current 'obj' in the heap + * newh(key, flags, arg) optional, used to allocate a new + * object during insertions. + * + * dn_ht_free() frees the heap or unlinks elements. + * DNHT_REMOVE unlinks elements, 0 frees the heap. + * You need two calls to do both. + * + * dn_ht_find() is the main lookup function, which can also be + * used to insert or delete elements in the hash table. + * The final 'arg' is passed to all callbacks. + * + * dn_ht_scan() is used to invoke a callback on all entries of + * the heap, or possibly on just one bucket. 
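(A minimal use of the table, with a hypothetical object embedding a + * 'next' link; sketch only: + * ht = dn_ht_init(NULL, 100, offsetof(struct my_obj, next), + * my_hash, my_match, NULL); + * dn_ht_find(ht, (uintptr_t)o, DNHT_INSERT, NULL); links 'o' in place + * dn_ht_find(ht, (uintptr_t)o, DNHT_REMOVE | DNHT_MATCH_PTR, NULL); + * dn_ht_free(ht, DNHT_REMOVE); dn_ht_free(ht, 0); unlink, then free.) 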
The callback + * is invoked with a pointer to the object, and must return + * one of DNHT_SCAN_DEL or DNHT_SCAN_END to request the + * removal of the object from the heap and the end of the + * scan, respectively. + * + * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans + * only the specific bucket of the table. The bucket is an in-out + * parameter, and is reset to a valid bucket number if the original + * is invalid. + * + * A combination of flags can be used to modify the operation + * of dn_ht_find() and of the callbacks: + * + * DNHT_KEY_IS_OBJ means the key is the object pointer. + * It is usually of interest for the hash and match functions. + * + * DNHT_MATCH_PTR during a lookup, match pointers instead + * of calling match(). Normally used when removing specific + * entries. Does not imply KEY_IS_OBJ as the latter _is_ used + * by the match function. + * + * DNHT_INSERT insert the element if not found. + * Calls newh() to allocate a new object unless + * DNHT_KEY_IS_OBJ is set. + * + * DNHT_UNIQUE only insert if object not found. + * XXX should it imply DNHT_INSERT ? + * + * DNHT_REMOVE remove objects if we find them. + */ +struct dn_ht; /* should be opaque */ + +struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, + uint32_t (*hash)(uintptr_t, int, void *), + int (*match)(void *, uintptr_t, int, void *), + void *(*newh)(uintptr_t, int, void *)); +void dn_ht_free(struct dn_ht *, int flags); + +void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); +int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); +int dn_ht_scan_bucket(struct dn_ht *, int *, int (*)(void *, void *), void *); +int dn_ht_entries(struct dn_ht *); + +enum { /* flags values. + * first two are returned by the scan callback to indicate + * to delete the matching element or to end the scan + */ + DNHT_SCAN_DEL = 0x0001, + DNHT_SCAN_END = 0x0002, + DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ + DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ + DNHT_INSERT = 0x0010, /* insert if not found */ + DNHT_UNIQUE = 0x0020, /* report error if already there */ + DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ +}; + +#endif /* _IP_DN_HEAP_H */ diff --git a/freebsd/sys/netinet/ipfw/dn_sched.h b/freebsd/sys/netinet/ipfw/dn_sched.h new file mode 100644 index 00000000..fe54b020 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_sched.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * The API to write a packet scheduling algorithm for dummynet. + * + * $FreeBSD$ + */ + +#ifndef _DN_SCHED_H +#define _DN_SCHED_H + +#define DN_MULTIQUEUE 0x01 +/* + * Descriptor for a scheduling algorithm. + * Contains all function pointers for a given scheduler + * This is typically created when a module is loaded, and stored + * in a global list of schedulers. + */ +struct dn_alg { + uint32_t type; /* the scheduler type */ + const char *name; /* scheduler name */ + uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ + + /* + * The following define the size of 3 optional data structures + * that may need to be allocated at runtime, and are appended + * to each of the base data structures: scheduler, sched.inst, + * and queue. We don't have a per-flowset structure. + */ + /* + parameters attached to the template, e.g. + * default queue sizes, weights, quantum size, and so on; + */ + size_t schk_datalen; + + /* + per-instance parameters, such as timestamps, + * containers for queues, etc; + */ + size_t si_datalen; + + size_t q_datalen; /* per-queue parameters (e.g. S,F) */ + + /* + * Methods implemented by the scheduler: + * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. + * q is NULL for !MULTIQUEUE. + * Return 0 on success, 1 on drop (packet consumed anyways). + * Note that q should be interpreted only as a hint + * on the flow that the mbuf belongs to: while a + * scheduler will normally enqueue m into q, it is ok + * to leave q alone and put the mbuf elsewhere. + * This function is called in two cases: + * - when a new packet arrives to the scheduler; + * - when a scheduler is reconfigured. In this case the + * call is issued by the new_queue callback, with a + * non empty queue (q) and m pointing to the first + * mbuf in the queue. For this reason, the function + * should internally check for (m != q->mq.head) + * before calling dn_enqueue(). + * + * dequeue Called when scheduler instance 's' can + * dequeue a packet. Return NULL if none are available. + * XXX what about non work-conserving ? + * + * config called on 'sched X config ...', normally writes + * in the area of size sch_arg + * + * destroy called on 'sched delete', frees everything + * in sch_arg (other parts are handled by more specific + * functions) + * + * new_sched called when a new instance is created, e.g. + * to create the local queue for !MULTIQUEUE, set V or + * copy parameters for WFQ, and so on. + * + * free_sched called when deleting an instance, cleans + * extra data in the per-instance area. + * + * new_fsk called when a flowset is linked to a scheduler, + * e.g. to validate parameters such as weights etc. + * free_fsk when a flowset is unlinked from a scheduler. + * (probably unnecessary) + * + * new_queue called to set the per-queue parameters, + * e.g. S and F, adjust sum of weights in the parent, etc. + * + * The new_queue callback is normally called from when + * creating a new queue. 
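(For the enqueue case above, the guard is typically written as: + * if (m != q->mq.head && dn_enqueue(q, m, 0)) + * return 1; + * so a queue head re-submitted during reconfiguration is not + * accounted twice; sketch only.) 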
In some cases (such as a + * scheduler change or reconfiguration) it can be called + * with a non empty queue. In that case the new_queue callback may + * need to call the enqueue function; the callback should then call + * enqueue() passing as m the first element in the queue. + * + * free_queue actions related to a queue removal, e.g. undo + * all the above. If the queue has data in it, also remove + * from the scheduler. This can e.g. happen during a reconfigure. + */ + int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, + struct mbuf *); + struct mbuf * (*dequeue)(struct dn_sch_inst *); + + int (*config)(struct dn_schk *); + int (*destroy)(struct dn_schk*); + int (*new_sched)(struct dn_sch_inst *); + int (*free_sched)(struct dn_sch_inst *); + int (*new_fsk)(struct dn_fsk *f); + int (*free_fsk)(struct dn_fsk *f); + int (*new_queue)(struct dn_queue *q); + int (*free_queue)(struct dn_queue *q); + + /* run-time fields */ + int ref_count; /* XXX number of instances in the system */ + SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ +}; + +/* MSVC does not support initializers so we need this ugly macro */ +#ifdef _WIN32 +#define _SI(fld) +#else +#define _SI(fld) fld +#endif + +/* + * Additionally, dummynet exports some functions and macros + * to be used by schedulers: + */ + +void dn_free_pkts(struct mbuf *mnext); +int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); +/* bound a variable between min and max */ +int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); + +/* + * Extract the head of a queue, update stats. Must be the very last + * thing done on a dequeue as the queue itself may go away. + */ +static __inline struct mbuf* +dn_dequeue(struct dn_queue *q) +{ + struct mbuf *m = q->mq.head; + if (m == NULL) + return NULL; + q->mq.head = m->m_nextpkt; + q->ni.length--; + q->ni.len_bytes -= m->m_pkthdr.len; + if (q->_si) { + q->_si->ni.length--; + q->_si->ni.len_bytes -= m->m_pkthdr.len; + } + if (q->ni.length == 0) /* queue is now idle */ + q->q_time = dn_cfg.curr_time; + return m; +} + +int dn_sched_modevent(module_t mod, int cmd, void *arg); + +#define DECLARE_DNSCHED_MODULE(name, dnsched) \ + static moduledata_t name##_mod = { \ + #name, dn_sched_modevent, dnsched \ + }; \ + DECLARE_MODULE(name, name##_mod, \ + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ + MODULE_DEPEND(name, dummynet, 3, 3, 3); +#endif /* _DN_SCHED_H */ diff --git a/freebsd/sys/netinet/ipfw/dn_sched_fifo.c b/freebsd/sys/netinet/ipfw/dn_sched_fifo.c new file mode 100644 index 00000000..6d5a4a12 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_sched_fifo.c @@ -0,0 +1,122 @@ +#include + +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +/* + * This file implements a FIFO scheduler for a single queue. + * The queue is allocated as part of the scheduler instance, + * and there is a single flowset in the template which stores + * queue size and policy. + * Enqueue and dequeue use the default library functions. + */ +static int +fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) +{ + /* XXX if called with q != NULL and m=NULL, this is a + * re-enqueue from an existing scheduler, which we should + * handle. + */ + return dn_enqueue((struct dn_queue *)(si+1), m, 0); +} + +static struct mbuf * +fifo_dequeue(struct dn_sch_inst *si) +{ + return dn_dequeue((struct dn_queue *)(si + 1)); +} + +static int +fifo_new_sched(struct dn_sch_inst *si) +{ + /* This scheduler instance contains the queue */ + struct dn_queue *q = (struct dn_queue *)(si + 1); + + set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); + q->_si = si; + q->fs = si->sched->fs; + return 0; +} + +static int +fifo_free_sched(struct dn_sch_inst *si) +{ + struct dn_queue *q = (struct dn_queue *)(si + 1); + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); + return 0; +} + +/* + * FIFO scheduler descriptor + * contains the type of the scheduler, the name, the size of extra + * data structures, and function pointers. + */ +static struct dn_alg fifo_desc = { + _SI( .type = ) DN_SCHED_FIFO, + _SI( .name = ) "FIFO", + _SI( .flags = ) 0, + + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct dn_queue), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) fifo_enqueue, + _SI( .dequeue = ) fifo_dequeue, + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) fifo_new_sched, + _SI( .free_sched = ) fifo_free_sched, + _SI( .new_fsk = ) NULL, + _SI( .free_fsk = ) NULL, + _SI( .new_queue = ) NULL, + _SI( .free_queue = ) NULL, +}; + +DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); diff --git a/freebsd/sys/netinet/ipfw/dn_sched_prio.c b/freebsd/sys/netinet/ipfw/dn_sched_prio.c new file mode 100644 index 00000000..c6b6027c --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_sched_prio.c @@ -0,0 +1,231 @@ +#include + +/* + * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#define DN_SCHED_PRIO 5 //XXX + +#if !defined(_KERNEL) || !defined(__linux__) +#define test_bit(ix, pData) ((*pData) & (1<<(ix))) +#define __set_bit(ix, pData) (*pData) |= (1<<(ix)) +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +#ifdef __MIPSEL__ +#define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) +#endif + +/* Size of the array of queue pointers. */ +#define BITMAP_T unsigned long +#define MAXPRIO (sizeof(BITMAP_T) * 8) + +/* + * The scheduler instance contains an array of pointers to queues, + * one for each priority, and a bitmap listing backlogged queues. + */ +struct prio_si { + BITMAP_T bitmap; /* array bitmap */ + struct dn_queue *q_array[MAXPRIO]; /* Array of queue pointers */ +}; + +/* + * If a queue with the same priority is already backlogged, use + * that one instead of the queue passed as argument. + */ +static int +prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + int prio = q->fs->fs.par[0]; + + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else { /* use the existing queue */ + q = si->q_array[prio]; + } + if (dn_enqueue(q, m, 0)) + return 1; + return 0; +} + +/* + * Packets are dequeued only from the highest priority queue. + * The function ffs() returns the position of the lowest set bit in the + * bitmap; that position, minus 1, is the array index which contains + * the pointer to the highest priority queue. + * After the dequeue, if this queue becomes empty, its index is removed + * from the bitmap. 
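+ * For example, with bitmap = 0x28 (priorities 3 and 5 backlogged), + * ffs() returns 4 and q_array[3] is served first.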
+ * Scheduler is idle if the bitmap is empty + * + * NOTE: highest priority is 0, lowest is sched->max_prio_q + */ +static struct mbuf * +prio_dequeue(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + struct mbuf *m; + struct dn_queue *q; + int prio; + + if (si->bitmap == 0) /* scheduler idle */ + return NULL; + + prio = ffs(si->bitmap) - 1; + + /* Take the highest priority queue in the scheduler */ + q = si->q_array[prio]; + // assert(q) + + m = dn_dequeue(q); + if (q->mq.head == NULL) { + /* Queue is now empty, remove from scheduler + * and mark it + */ + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return m; +} + +static int +prio_new_sched(struct dn_sch_inst *_si) +{ + struct prio_si *si = (struct prio_si *)(_si + 1); + + bzero(si->q_array, sizeof(si->q_array)); + si->bitmap = 0; + + return 0; +} + +static int +prio_new_fsk(struct dn_fsk *fs) +{ + /* Check if the prioritiy is between 0 and MAXPRIO-1 */ + ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); + return 0; +} + +static int +prio_new_queue(struct dn_queue *q) +{ + struct prio_si *si = (struct prio_si *)(q->_si + 1); + int prio = q->fs->fs.par[0]; + struct dn_queue *oldq; + + q->ni.oid.subtype = DN_SCHED_PRIO; + + if (q->mq.head == NULL) + return 0; + + /* Queue already full, must insert in the scheduler or append + * mbufs to existing queue. This partly duplicates prio_enqueue + */ + if (test_bit(prio, &si->bitmap) == 0) { + /* No queue with this priority, insert */ + __set_bit(prio, &si->bitmap); + si->q_array[prio] = q; + } else if ( (oldq = si->q_array[prio]) != q) { + /* must append to the existing queue. + * can simply append q->mq.head to q2->... + * and add the counters to those of q2 + */ + oldq->mq.tail->m_nextpkt = q->mq.head; + oldq->mq.tail = q->mq.tail; + oldq->ni.length += q->ni.length; + q->ni.length = 0; + oldq->ni.len_bytes += q->ni.len_bytes; + q->ni.len_bytes = 0; + q->mq.tail = q->mq.head = NULL; + } + return 0; +} + +static int +prio_free_queue(struct dn_queue *q) +{ + int prio = q->fs->fs.par[0]; + struct prio_si *si = (struct prio_si *)(q->_si + 1); + + if (si->q_array[prio] == q) { + si->q_array[prio] = NULL; + __clear_bit(prio, &si->bitmap); + } + return 0; +} + + +static struct dn_alg prio_desc = { + _SI( .type = ) DN_SCHED_PRIO, + _SI( .name = ) "PRIO", + _SI( .flags = ) DN_MULTIQUEUE, + + /* we need extra space in the si and the queue */ + _SI( .schk_datalen = ) 0, + _SI( .si_datalen = ) sizeof(struct prio_si), + _SI( .q_datalen = ) 0, + + _SI( .enqueue = ) prio_enqueue, + _SI( .dequeue = ) prio_dequeue, + + _SI( .config = ) NULL, + _SI( .destroy = ) NULL, + _SI( .new_sched = ) prio_new_sched, + _SI( .free_sched = ) NULL, + + _SI( .new_fsk = ) prio_new_fsk, + _SI( .free_fsk = ) NULL, + + _SI( .new_queue = ) prio_new_queue, + _SI( .free_queue = ) prio_free_queue, +}; + + +DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); diff --git a/freebsd/sys/netinet/ipfw/dn_sched_qfq.c b/freebsd/sys/netinet/ipfw/dn_sched_qfq.c new file mode 100644 index 00000000..23890199 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/dn_sched_qfq.c @@ -0,0 +1,866 @@ +#include + +/* + * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ */ +#include +#include /* ipfw_rule_ref */ +#include /* flow_id */ +#include +#include +#include +#include +#else +#include +#endif + +#ifdef QFQ_DEBUG +struct qfq_sched; +static void dump_sched(struct qfq_sched *q, const char *msg); +#define NO(x) x +#else +#define NO(x) +#endif +#define DN_SCHED_QFQ 4 // XXX Where? +typedef unsigned long bitmap; + +/* + * bitmaps ops are critical. Some linux versions have __fls + * and the bitmap ops. Some machines have ffs + */ +#if defined(_WIN32) +int fls(unsigned int n) +{ + int i = 0; + for (i = 0; n > 0; n >>= 1, i++) + ; + return i; +} +#endif + +#if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) +static inline unsigned long __fls(unsigned long word) +{ + return fls(word) - 1; +} +#endif + +#if !defined(_KERNEL) || !defined(__linux__) +#ifdef QFQ_DEBUG +int test_bit(int ix, bitmap *p) +{ + if (ix < 0 || ix > 31) + D("bad index %d", ix); + return *p & (1< 31) + D("bad index %d", ix); + *p |= (1< 31) + D("bad index %d", ix); + *p &= ~(1<index = 0 + *.__grp->slot_shift + + where MIN_SLOT_SHIFT is derived by difference from the others. + +The max group index corresponds to Lmax/w_min, where +Lmax=1<group mapping. Class weights are + * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the + * group with the smallest index that can support the L_i / r_i + * configured for the class. + * + * grp->index is the index of the group; and grp->slot_shift + * is the shift for the corresponding (scaled) sigma_i. + * + * When computing the group index, we do (len<i_wsum) +#define IWSUM ((1< 0; +} + +/* Round a precise timestamp to its slotted value. */ +static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) +{ + return ts & ~((1ULL << shift) - 1); +} + +/* return the pointer to the group with lowest index in the bitmap */ +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, + unsigned long bitmap) +{ + int index = ffs(bitmap) - 1; // zero-based + return &q->groups[index]; +} + +/* + * Calculate a flow index, given its weight and maximum packet length. + * index = log_2(maxlen/weight) but we need to apply the scaling. + * This is used only once at flow creation. 
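+ * Net effect of the code below, as a sketch: index is in effect + * ceil(log2(maxlen * inv_w)) - QFQ_MIN_SLOT_SHIFT, clamped at 0; + * the final subtraction drops back one group when slot_size lands + * exactly on a power of two.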
+ */ +static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) +{ + uint64_t slot_size = (uint64_t)maxlen *inv_w; + unsigned long size_map; + int index = 0; + + size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); + if (!size_map) + goto out; + + index = __fls(size_map) + 1; // basically a log_2() + index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); + + if (index < 0) + index = 0; + +out: + ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); + return index; +} +/*---- end support functions ----*/ + +/*-------- API calls --------------------------------*/ +/* + * Validate and copy parameters from flowset. + */ +static int +qfq_new_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + int i; + uint32_t w; /* approximated weight */ + + /* import parameters from the flowset. They should be correct + * already. + */ + w = _q->fs->fs.par[0]; + cl->lmax = _q->fs->fs.par[1]; + if (!w || w > QFQ_MAX_WEIGHT) { + w = 1; + D("rounding weight to 1"); + } + cl->inv_w = ONE_FP/w; + w = ONE_FP/cl->inv_w; + if (q->wsum + w > QFQ_MAX_WSUM) + return EINVAL; + + i = qfq_calc_index(cl->inv_w, cl->lmax); + cl->grp = &q->groups[i]; + q->wsum += w; + // XXX cl->S = q->V; ? + // XXX compute q->i_wsum + return 0; +} + +/* remove an empty queue */ +static int +qfq_free_queue(struct dn_queue *_q) +{ + struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); + struct qfq_class *cl = (struct qfq_class *)_q; + if (cl->inv_w) { + q->wsum -= ONE_FP/cl->inv_w; + cl->inv_w = 0; /* reset weight to avoid run twice */ + } + return 0; +} + +/* Calculate a mask to mimic what would be ffs_from(). */ +static inline unsigned long +mask_from(unsigned long bitmap, int from) +{ + return bitmap & ~((1UL << from) - 1); +} + +/* + * The state computation relies on ER=0, IR=1, EB=2, IB=3 + * First compute eligibility comparing grp->S, q->V, + * then check if someone is blocking us and possibly add EB + */ +static inline unsigned int +qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp) +{ + /* if S > V we are not eligible */ + unsigned int state = qfq_gt(grp->S, q->V); + unsigned long mask = mask_from(q->bitmaps[ER], grp->index); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (qfq_gt(grp->F, next->F)) + state |= EB; + } + + return state; +} + +/* + * In principle + * q->bitmaps[dst] |= q->bitmaps[src] & mask; + * q->bitmaps[src] &= ~mask; + * but we should make sure that src != dst + */ +static inline void +qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) +{ + q->bitmaps[dst] |= q->bitmaps[src] & mask; + q->bitmaps[src] &= ~mask; +} + +static inline void +qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish) +{ + unsigned long mask = mask_from(q->bitmaps[ER], index + 1); + struct qfq_group *next; + + if (mask) { + next = qfq_ffs(q, mask); + if (!qfq_gt(next->F, old_finish)) + return; + } + + mask = (1UL << index) - 1; + qfq_move_groups(q, mask, EB, ER); + qfq_move_groups(q, mask, IB, IR); +} + +/* + * perhaps + * + old_V ^= q->V; + old_V >>= QFQ_MIN_SLOT_SHIFT; + if (old_V) { + ... 
+ } + * + */ +static inline void +qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) +{ + unsigned long mask, vslot, old_vslot; + + vslot = q->V >> QFQ_MIN_SLOT_SHIFT; + old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; + + if (vslot != old_vslot) { + mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; + qfq_move_groups(q, mask, IR, ER); + qfq_move_groups(q, mask, IB, EB); + } +} + +/* + * XXX we should make sure that slot becomes less than 32. + * This is guaranteed by the input values. + * roundedS is always cl->S rounded on grp->slot_shift bits. + */ +static inline void +qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) +{ + uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; + unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; + + cl->next = grp->slots[i]; + grp->slots[i] = cl; + __set_bit(slot, &grp->full_slots); +} + +/* + * remove the entry from the slot + */ +static inline void +qfq_front_slot_remove(struct qfq_group *grp) +{ + struct qfq_class **h = &grp->slots[grp->front]; + + *h = (*h)->next; + if (!*h) + __clear_bit(0, &grp->full_slots); +} + +/* + * Returns the first full queue in a group. As a side effect, + * adjust the bucket list so the first non-empty bucket is at + * position 0 in full_slots. + */ +static inline struct qfq_class * +qfq_slot_scan(struct qfq_group *grp) +{ + int i; + + ND("grp %d full %x", grp->index, grp->full_slots); + if (!grp->full_slots) + return NULL; + + i = ffs(grp->full_slots) - 1; // zero-based + if (i > 0) { + grp->front = (grp->front + i) % QFQ_MAX_SLOTS; + grp->full_slots >>= i; + } + + return grp->slots[grp->front]; +} + +/* + * adjust the bucket list. When the start time of a group decreases, + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to + * move the objects. The mask of occupied slots must be shifted + * because we use ffs() to find the first non-empty slot. + * This covers decreases in the group's start time, but what about + * increases of the start time ? + * Here too we should make sure that i is less than 32 + */ +static inline void +qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS) +{ + unsigned int i = (grp->S - roundedS) >> grp->slot_shift; + + grp->full_slots <<= i; + grp->front = (grp->front - i) % QFQ_MAX_SLOTS; +} + + +static inline void +qfq_update_eligible(struct qfq_sched *q, uint64_t old_V) +{ + bitmap ineligible; + + ineligible = q->bitmaps[IR] | q->bitmaps[IB]; + if (ineligible) { + if (!q->bitmaps[ER]) { + struct qfq_group *grp; + grp = qfq_ffs(q, ineligible); + if (qfq_gt(grp->S, q->V)) + q->V = grp->S; + } + qfq_make_eligible(q, old_V); + } +} + +/* + * Updates the class, returns true if also the group needs to be updated. 
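+ *
+ * Worked example (illustrative values, not from the original
+ * source): a class with weight 2 has inv_w = ONE_FP/2; if the
+ * next head packet is 1000 bytes, the new finish time is
+ * F = S + 1000*inv_w, i.e. 500 units in the FRAC_BITS fixed
+ * point scale. The group itself needs updating only when the
+ * rounded start time falls outside the current group slot.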
+ */ +static inline int +qfq_update_class(struct qfq_sched *q, struct qfq_group *grp, + struct qfq_class *cl) +{ + + cl->S = cl->F; + if (cl->_q.mq.head == NULL) { + qfq_front_slot_remove(grp); + } else { + unsigned int len; + uint64_t roundedS; + + len = cl->_q.mq.head->m_pkthdr.len; + cl->F = cl->S + (uint64_t)len * cl->inv_w; + roundedS = qfq_round_down(cl->S, grp->slot_shift); + if (roundedS == grp->S) + return 0; + + qfq_front_slot_remove(grp); + qfq_slot_insert(grp, cl, roundedS); + } + return 1; +} + +static struct mbuf * +qfq_dequeue(struct dn_sch_inst *si) +{ + struct qfq_sched *q = (struct qfq_sched *)(si + 1); + struct qfq_group *grp; + struct qfq_class *cl; + struct mbuf *m; + uint64_t old_V; + + NO(q->loops++;) + if (!q->bitmaps[ER]) { + NO(if (q->queued) + dump_sched(q, "start dequeue");) + return NULL; + } + + grp = qfq_ffs(q, q->bitmaps[ER]); + + cl = grp->slots[grp->front]; + /* extract from the first bucket in the bucket list */ + m = dn_dequeue(&cl->_q); + + if (!m) { + D("BUG/* non-workconserving leaf */"); + return NULL; + } + NO(q->queued--;) + old_V = q->V; + q->V += (uint64_t)m->m_pkthdr.len * IWSUM; + ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V); + + if (qfq_update_class(q, grp, cl)) { + uint64_t old_F = grp->F; + cl = qfq_slot_scan(grp); + if (!cl) { /* group gone, remove from ER */ + __clear_bit(grp->index, &q->bitmaps[ER]); + // grp->S = grp->F + 1; // XXX debugging only + } else { + uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift); + unsigned int s; + + if (grp->S == roundedS) + goto skip_unblock; + grp->S = roundedS; + grp->F = roundedS + (2ULL << grp->slot_shift); + /* remove from ER and put in the new set */ + __clear_bit(grp->index, &q->bitmaps[ER]); + s = qfq_calc_state(q, grp); + __set_bit(grp->index, &q->bitmaps[s]); + } + /* we need to unblock even if the group has gone away */ + qfq_unblock_groups(q, grp->index, old_F); + } + +skip_unblock: + qfq_update_eligible(q, old_V); + NO(if (!q->bitmaps[ER] && q->queued) + dump_sched(q, "end dequeue");) + + return m; +} + +/* + * Assign a reasonable start time for a new flow k in group i. + * Admissible values for \hat(F) are multiples of \sigma_i + * no greater than V+\sigma_i . Larger values mean that + * we had a wraparound so we consider the timestamp to be stale. + * + * If F is not stale and F >= V then we set S = F. + * Otherwise we should assign S = V, but this may violate + * the ordering in ER. So, if we have groups in ER, set S to + * the F_j of the first group j which would be blocking us. + * We are guaranteed not to move S backward because + * otherwise our group i would still be blocked. 
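+ *
+ * Illustration (values are ours): with V = 100 and a class whose
+ * stored F = 96, F <= V means the timestamp is stale, so the
+ * class restarts at S = V; if some group in ER has a smaller F
+ * than our rounded F, we start at that group's F instead, which
+ * keeps the ER ordering intact.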
+ */
+static inline void
+qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
+{
+	unsigned long mask;
+	uint32_t limit, roundedF;
+	int slot_shift = cl->grp->slot_shift;
+
+	roundedF = qfq_round_down(cl->F, slot_shift);
+	limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
+
+	if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
+		/* timestamp was stale */
+		mask = mask_from(q->bitmaps[ER], cl->grp->index);
+		if (mask) {
+			struct qfq_group *next = qfq_ffs(q, mask);
+			if (qfq_gt(roundedF, next->F)) {
+				cl->S = next->F;
+				return;
+			}
+		}
+		cl->S = q->V;
+	} else { /* timestamp is not stale */
+		cl->S = cl->F;
+	}
+}
+
+static int
+qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	struct qfq_group *grp;
+	struct qfq_class *cl = (struct qfq_class *)_q;
+	uint64_t roundedS;
+	int s;
+
+	NO(q->loops++;)
+	DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
+		_q, cl->inv_w, cl->grp->index);
+	/* XXX verify that the packet obeys the parameters */
+	if (m != _q->mq.head) {
+		if (dn_enqueue(_q, m, 0)) /* packet was dropped */
+			return 1;
+		NO(q->queued++;)
+		if (m != _q->mq.head)
+			return 0;
+	}
+	/* If we reach this point, queue q was idle */
+	grp = cl->grp;
+	qfq_update_start(q, cl); /* adjust start time */
+	/* compute new finish time and rounded start. */
+	cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+
+	/*
+	 * insert cl in the correct bucket.
+	 * If cl->S >= grp->S we don't need to adjust the
+	 * bucket list and simply go to the insertion phase.
+	 * Otherwise grp->S is decreasing, we must make room
+	 * in the bucket list, and also recompute the group state.
+	 * Finally, if there were no flows in this group and nobody
+	 * was in ER make sure to adjust V.
+	 */
+	if (grp->full_slots) {
+		if (!qfq_gt(grp->S, cl->S))
+			goto skip_update;
+		/* create a slot for this cl->S */
+		qfq_slot_rotate(q, grp, roundedS);
+		/* group was surely ineligible, remove */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+	} else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
+		q->V = roundedS;
+
+	grp->S = roundedS;
+	grp->F = roundedS + (2ULL << grp->slot_shift);	// i.e. 2\sigma_i
+	s = qfq_calc_state(q, grp);
+	__set_bit(grp->index, &q->bitmaps[s]);
+	ND("new state %d 0x%x", s, q->bitmaps[s]);
+	ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
+skip_update:
+	qfq_slot_insert(grp, cl, roundedS);
+
+	return 0;
+}
+
+
+#if 0
+static inline void
+qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
+	struct qfq_class *cl, struct qfq_class **pprev)
+{
+	unsigned int i, offset;
+	uint64_t roundedS;
+
+	roundedS = qfq_round_down(cl->S, grp->slot_shift);
+	offset = (roundedS - grp->S) >> grp->slot_shift;
+	i = (grp->front + offset) % QFQ_MAX_SLOTS;
+
+#ifdef notyet
+	if (!pprev) {
+		pprev = &grp->slots[i];
+		while (*pprev && *pprev != cl)
+			pprev = &(*pprev)->next;
+	}
+#endif
+
+	*pprev = cl->next;
+	if (!grp->slots[i])
+		__clear_bit(offset, &grp->full_slots);
+}
+
+/*
+ * called to forcibly destroy a queue.
+ * If the queue is not in the front bucket, or if it has
+ * other queues in the front bucket, we can simply remove
+ * the queue with no other side effects.
+ * Otherwise we must propagate the event up.
+ * XXX description to be completed.
+ */
+static void
+qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
+	struct qfq_class **pprev)
+{
+	struct qfq_group *grp = &q->groups[cl->index];
+	unsigned long mask;
+	uint64_t roundedS;
+	int s;
+
+	cl->F = cl->S;	// not needed if the class goes away.
+	qfq_slot_remove(q, grp, cl, pprev);
+
+	if (!grp->full_slots) {
+		/* nothing left in the group, remove from all sets.
+		 * Do ER last because if we were blocking other groups
+		 * we must unblock them.
+		 */
+		__clear_bit(grp->index, &q->bitmaps[IR]);
+		__clear_bit(grp->index, &q->bitmaps[EB]);
+		__clear_bit(grp->index, &q->bitmaps[IB]);
+
+		if (test_bit(grp->index, &q->bitmaps[ER]) &&
+		    !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
+			mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
+			if (mask)
+				mask = ~((1UL << __fls(mask)) - 1);
+			else
+				mask = ~0UL;
+			qfq_move_groups(q, mask, EB, ER);
+			qfq_move_groups(q, mask, IB, IR);
+		}
+		__clear_bit(grp->index, &q->bitmaps[ER]);
+	} else if (!grp->slots[grp->front]) {
+		cl = qfq_slot_scan(grp);
+		roundedS = qfq_round_down(cl->S, grp->slot_shift);
+		if (grp->S != roundedS) {
+			__clear_bit(grp->index, &q->bitmaps[ER]);
+			__clear_bit(grp->index, &q->bitmaps[IR]);
+			__clear_bit(grp->index, &q->bitmaps[EB]);
+			__clear_bit(grp->index, &q->bitmaps[IB]);
+			grp->S = roundedS;
+			grp->F = roundedS + (2ULL << grp->slot_shift);
+			s = qfq_calc_state(q, grp);
+			__set_bit(grp->index, &q->bitmaps[s]);
+		}
+	}
+	qfq_update_eligible(q, q->V);
+}
+#endif
+
+static int
+qfq_new_fsk(struct dn_fsk *f)
+{
+	ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
+	ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
+	ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
+	return 0;
+}
+
+/*
+ * initialize a new scheduler instance
+ */
+static int
+qfq_new_sched(struct dn_sch_inst *si)
+{
+	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
+	struct qfq_group *grp;
+	int i;
+
+	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
+		grp = &q->groups[i];
+		grp->index = i;
+		grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
+					(QFQ_MAX_INDEX - i);
+	}
+	return 0;
+}
+
+/*
+ * QFQ scheduler descriptor
+ */
+static struct dn_alg qfq_desc = {
+	_SI( .type = ) DN_SCHED_QFQ,
+	_SI( .name = ) "QFQ",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct qfq_sched),
+	_SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) qfq_enqueue,
+	_SI( .dequeue = ) qfq_dequeue,
+
+	_SI( .config = ) NULL,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) qfq_new_sched,
+	_SI( .free_sched = ) NULL,
+	_SI( .new_fsk = ) qfq_new_fsk,
+	_SI( .free_fsk = ) NULL,
+	_SI( .new_queue = ) qfq_new_queue,
+	_SI( .free_queue = ) qfq_free_queue,
+};
+
+DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
+
+#ifdef QFQ_DEBUG
+static void
+dump_groups(struct qfq_sched *q, uint32_t mask)
+{
+	int i, j;
+
+	for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
+		struct qfq_group *g = &q->groups[i];
+
+		if (0 == (mask & (1<<i)))
+			continue;
+		for (j = 0; j < QFQ_MAX_SLOTS; j++) {
+			if (g->slots[j])
+				D("    bucket %d %p", j, g->slots[j]);
+		}
+		D("full_slots 0x%x", g->full_slots);
+		D("        %2d S 0x%20llx F 0x%llx %c", i,
+			g->S, g->F,
+			mask & (1<<i) ? '1' : '0');
+	}
+}
+
+static void
+dump_sched(struct qfq_sched *q, const char *msg)
+{
+	D("--- in %s: ---", msg);
+	ND("loops %d queued %d V 0x%llx", q->loops, q->queued, q->V);
+	D("    ER 0x%08x", q->bitmaps[ER]);
+	D("    EB 0x%08x", q->bitmaps[EB]);
+	D("    IR 0x%08x", q->bitmaps[IR]);
+	D("    IB 0x%08x", q->bitmaps[IB]);
+	dump_groups(q, 0xffffffff);
+};
+#endif /* QFQ_DEBUG */
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_rr.c b/freebsd/sys/netinet/ipfw/dn_sched_rr.c
new file mode 100644
index 00000000..4aa833f6
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_rr.c
@@ -0,0 +1,309 @@
+#include
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>	/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#define DN_SCHED_RR	3 // XXX Where?
+
+struct rr_queue {
+	struct dn_queue q;		/* Standard queue */
+	int status;			/* 1: queue is in the list */
+	int credit;			/* Number of bytes to transmit */
+	int quantum;			/* quantum * C */
+	struct rr_queue *qnext;		/* */
+};
+
+/* struct rr_schk contains global config parameters
+ * and is right after dn_schk
+ */
+struct rr_schk {
+	int min_q;		/* Min quantum */
+	int max_q;		/* Max quantum */
+	int q_bytes;		/* Bytes per quantum */
+};
+
+/* per-instance round robin list, right after dn_sch_inst */
+struct rr_si {
+	struct rr_queue *head, *tail;	/* Pointer to current queue */
+};
+
+/* Append a queue to the rr list */
+static inline void
+rr_append(struct rr_queue *q, struct rr_si *si)
+{
+	q->status = 1;		/* mark as in-rr_list */
+	q->credit = q->quantum;	/* initialize credit */
+
+	/* append to the tail */
+	if (si->head == NULL)
+		si->head = q;
+	else
+		si->tail->qnext = q;
+	si->tail = q;		/* advance the tail pointer */
+	q->qnext = si->head;	/* make it circular */
+}
+
+/* Remove the head queue from circular list. */
+static inline void
+rr_remove_head(struct rr_si *si)
+{
+	if (si->head == NULL)
+		return; /* empty queue */
+	si->head->status = 0;
+
+	if (si->head == si->tail) {
+		si->head = si->tail = NULL;
+		return;
+	}
+
+	si->head = si->head->qnext;
+	si->tail->qnext = si->head;
+}
+
+/* Remove a queue from circular list.
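+ *
+ * E.g. with the ring A -> B -> C -> A (head = A, tail = C),
+ * removing B walks from the head until prev->qnext == q and
+ * relinks prev->qnext = q->qnext, fixing the tail if q was the
+ * tail; removing the head is delegated to rr_remove_head(),
+ * which also repairs tail->qnext.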
+ * XXX see if it can be merged with remove_queue()
+ */
+static inline void
+remove_queue_q(struct rr_queue *q, struct rr_si *si)
+{
+	struct rr_queue *prev;
+
+	if (q->status != 1)
+		return;
+	if (q == si->head) {
+		rr_remove_head(si);
+		return;
+	}
+
+	for (prev = si->head; prev; prev = prev->qnext) {
+		if (prev->qnext != q)
+			continue;
+		prev->qnext = q->qnext;
+		if (q == si->tail)
+			si->tail = prev;
+		q->status = 0;
+		break;
+	}
+}
+
+
+static inline void
+next_pointer(struct rr_si *si)
+{
+	if (si->head == NULL)
+		return; /* empty queue */
+
+	si->head = si->head->qnext;
+	si->tail = si->tail->qnext;
+}
+
+static int
+rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+	struct rr_si *si;
+	struct rr_queue *rrq;
+
+	if (m != q->mq.head) {
+		if (dn_enqueue(q, m, 0)) /* packet was dropped */
+			return 1;
+		if (m != q->mq.head)
+			return 0;
+	}
+
+	/* If we reach this point, queue q was idle */
+	si = (struct rr_si *)(_si + 1);
+	rrq = (struct rr_queue *)q;
+
+	if (rrq->status == 1) /* Queue is already in the queue list */
+		return 0;
+
+	/* Insert the queue in the queue list */
+	rr_append(rrq, si);
+
+	return 0;
+}
+
+static struct mbuf *
+rr_dequeue(struct dn_sch_inst *_si)
+{
+	/* Access scheduler instance private data */
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+	struct rr_queue *rrq;
+	uint64_t len;
+
+	while ( (rrq = si->head) ) {
+		struct mbuf *m = rrq->q.mq.head;
+		if (m == NULL) {
+			/* empty queue, remove from list */
+			rr_remove_head(si);
+			continue;
+		}
+		len = m->m_pkthdr.len;
+
+		if (len > rrq->credit) {
+			/* Packet too big */
+			rrq->credit += rrq->quantum;
+			/* Try next queue */
+			next_pointer(si);
+		} else {
+			rrq->credit -= len;
+			return dn_dequeue(&rrq->q);
+		}
+	}
+
+	/* no packet to dequeue */
+	return NULL;
+}
+
+static int
+rr_config(struct dn_schk *_schk)
+{
+	struct rr_schk *schk = (struct rr_schk *)(_schk + 1);
+	ND("called");
+
+	/* use reasonable quantums (64..2k bytes, default 1500) */
+	schk->min_q = 64;
+	schk->max_q = 2048;
+	schk->q_bytes = 1500;	/* quantum */
+
+	return 0;
+}
+
+static int
+rr_new_sched(struct dn_sch_inst *_si)
+{
+	struct rr_si *si = (struct rr_si *)(_si + 1);
+
+	ND("called");
+	si->head = si->tail = NULL;
+
+	return 0;
+}
+
+static int
+rr_free_sched(struct dn_sch_inst *_si)
+{
+	ND("called");
+	/* Nothing to do? */
+	return 0;
+}
+
+static int
+rr_new_fsk(struct dn_fsk *fs)
+{
+	struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1);
+	/* par[0] is the weight, par[1] is the quantum step */
+	ipdn_bound_var(&fs->fs.par[0], 1,
+		1, 65536, "RR weight");
+	ipdn_bound_var(&fs->fs.par[1], schk->q_bytes,
+		schk->min_q, schk->max_q, "RR quantum");
+	return 0;
+}
+
+static int
+rr_new_queue(struct dn_queue *_q)
+{
+	struct rr_queue *q = (struct rr_queue *)_q;
+
+	_q->ni.oid.subtype = DN_SCHED_RR;
+
+	q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1];
+	ND("called, q->quantum %d", q->quantum);
+	q->credit = q->quantum;
+	q->status = 0;
+
+	if (_q->mq.head != NULL) {
+		/* Queue NOT empty, insert in the queue list */
+		rr_append(q, (struct rr_si *)(_q->_si + 1));
+	}
+	return 0;
+}
+
+static int
+rr_free_queue(struct dn_queue *_q)
+{
+	struct rr_queue *q = (struct rr_queue *)_q;
+
+	ND("called");
+	if (q->status == 1) {
+		struct rr_si *si = (struct rr_si *)(_q->_si + 1);
+		remove_queue_q(q, si);
+	}
+	return 0;
+}
+
+/*
+ * RR scheduler descriptor
+ * contains the type of the scheduler, the name, the size of the
+ * structures and function pointers.
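+ *
+ * For illustration (numbers are ours): a queue configured with
+ * weight par[0] = 3 and quantum step par[1] = 1500 gets
+ * quantum = 4500 bytes of credit per round, so rr_dequeue()
+ * serves up to that many bytes before the round-robin pointer
+ * advances; a packet larger than the remaining credit makes the
+ * queue skip a round and top up its credit by another quantum.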
+ */
+static struct dn_alg rr_desc = {
+	_SI( .type = ) DN_SCHED_RR,
+	_SI( .name = ) "RR",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct rr_si),
+	_SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) rr_enqueue,
+	_SI( .dequeue = ) rr_dequeue,
+
+	_SI( .config = ) rr_config,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) rr_new_sched,
+	_SI( .free_sched = ) rr_free_sched,
+	_SI( .new_fsk = ) rr_new_fsk,
+	_SI( .free_fsk = ) NULL,
+	_SI( .new_queue = ) rr_new_queue,
+	_SI( .free_queue = ) rr_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc);
diff --git a/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c b/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c
new file mode 100644
index 00000000..c1e4c21d
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/dn_sched_wf2q.c
@@ -0,0 +1,375 @@
+#include
+
+/*
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#ifdef _KERNEL
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/kernel.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <net/if.h>	/* IFNAMSIZ */
+#include <netinet/in.h>
+#include <netinet/ip_var.h>	/* ipfw_rule_ref */
+#include <netinet/ip_fw.h>	/* flow_id */
+#include <netinet/ip_dummynet.h>
+#include <netinet/ipfw/dn_heap.h>
+#include <netinet/ipfw/ip_dn_private.h>
+#include <netinet/ipfw/dn_sched.h>
+#else
+#include <dn_test.h>
+#endif
+
+#ifndef MAX64
+#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
+#endif
+
+/*
+ * timestamps are computed on 64 bit using fixed point arithmetic.
+ * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len
+ * and sum of weights, respectively. FRAC_BITS is the number of
+ * fractional bits. We want FRAC_BITS >> WMAX_BITS to avoid too large
+ * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w
+ * using an unsigned 32-bit division, and to avoid wraparounds we need
+ * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64
+ * As an example
+ * FRAC_BITS = 26, LMAX_BITS = 14, WMAX_BITS = 19
+ */
+#ifndef FRAC_BITS
+#define FRAC_BITS	28	/* shift for fixed point arithmetic */
+#define	ONE_FP	(1UL << FRAC_BITS)
+#endif
+
+/*
+ * Private information for the scheduler instance:
+ * sch_heap (key is Finish time) returns the next queue to serve
+ * ne_heap (key is Start time) stores not-eligible queues
+ * idle_heap (key=start/finish time) stores idle flows. It must
+ *	support extract-from-middle.
+ * A flow is only in 1 of the three heaps.
+ * XXX todo: use a more efficient data structure, e.g. a tree sorted
+ * by F with min_subtree(S) in each node
+ */
+struct wf2qp_si {
+	struct dn_heap sch_heap;	/* top extract - key Finish time */
+	struct dn_heap ne_heap;		/* top extract - key Start time */
+	struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
+	uint64_t V;			/* virtual time */
+	uint32_t inv_wsum;		/* inverse of sum of weights */
+	uint32_t wsum;			/* sum of weights */
+};
+
+struct wf2qp_queue {
+	struct dn_queue _q;
+	uint64_t S, F;		/* start time, finish time */
+	uint32_t inv_w;		/* ONE_FP / weight */
+	int32_t heap_pos;	/* position (index) of struct in heap */
+};
+
+/*
+ * This file implements a WF2Q+ scheduler as it has been in dummynet
+ * since 2000.
+ * The scheduler supports per-flow queues and has O(log N) complexity.
+ *
+ * WF2Q+ needs to drain entries from the idle heap so that we
+ * can keep the sum of weights up to date. We can do it whenever
+ * we get a chance, or periodically, or following some other
+ * strategy. The function idle_check() drains at most N elements
+ * from the idle heap.
+ */
+static void
+idle_check(struct wf2qp_si *si, int n, int force)
+{
+	struct dn_heap *h = &si->idle_heap;
+	while (n-- > 0 && h->elements > 0 &&
+		    (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
+		struct dn_queue *q = HEAP_TOP(h)->object;
+		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
+
+		heap_extract(h, NULL);
+		/* XXX to let the flowset delete the queue we should
+		 * mark it as 'unused' by the scheduler.
+		 */
+		alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
+		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
+		if (si->wsum > 0)
+			si->inv_wsum = ONE_FP/si->wsum;
+	}
+}
+
+static int
+wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
+{
+	struct dn_fsk *fs = q->fs;
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	struct wf2qp_queue *alg_fq;
+	uint64_t len = m->m_pkthdr.len;
+
+	if (m != q->mq.head) {
+		if (dn_enqueue(q, m, 0)) /* packet was dropped */
+			return 1;
+		if (m != q->mq.head)	/* queue was already busy */
+			return 0;
+	}
+
+	/* If we reach this point, queue q was idle */
+	alg_fq = (struct wf2qp_queue *)q;
+
+	if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
+		/* Brand new queue. */
+		alg_fq->S = si->V;		/* init start time */
+		si->wsum += fs->fs.par[0];	/* add weight of new queue. */
+		si->inv_wsum = ONE_FP/si->wsum;
+	} else { /* if it was idle then it was in the idle heap */
+		heap_extract(&si->idle_heap, q);
+		alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
+	}
+	alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
+
+	/* if nothing is backlogged, make sure this flow is eligible */
+	if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
+		si->V = MAX64(alg_fq->S, si->V);
+
+	/*
+	 * Look at eligibility. A flow is not eligible if S>V (when
+	 * this happens, it means that there is some other flow already
+	 * scheduled for the same pipe, so the sch_heap cannot be
+	 * empty). If the flow is not eligible we just store it in the
+	 * ne_heap. Otherwise, we store in the sch_heap.
+	 * Note that for all flows in sch_heap (SCH), S_i <= V,
+	 * and for all flows in ne_heap (NEH), S_i > V.
+	 * So when we need to compute max(V, min(S_i)) forall i in
+	 * SCH+NEH, we only need to look into NEH.
+	 */
+	if (DN_KEY_LT(si->V, alg_fq->S)) {
+		/* S>V means flow Not eligible. */
+		if (si->sch_heap.elements == 0)
+			D("++ ouch! not eligible but empty scheduler!");
+		heap_insert(&si->ne_heap, alg_fq->S, q);
+	} else {
+		heap_insert(&si->sch_heap, alg_fq->F, q);
+	}
+	return 0;
+}
+
+/* XXX invariant: sch > 0 || V >= min(S in neh) */
+static struct mbuf *
+wf2qp_dequeue(struct dn_sch_inst *_si)
+{
+	/* Access scheduler instance private data */
+	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
+	struct mbuf *m;
+	struct dn_queue *q;
+	struct dn_heap *sch = &si->sch_heap;
+	struct dn_heap *neh = &si->ne_heap;
+	struct wf2qp_queue *alg_fq;
+
+	if (sch->elements == 0 && neh->elements == 0) {
+		/* we have nothing to do. We could kill the idle heap
+		 * altogether and reset V
+		 */
+		idle_check(si, 0x7fffffff, 1);
+		si->V = 0;
+		si->wsum = 0;	/* should be set already */
+		return NULL;	/* quick return if nothing to do */
+	}
+	idle_check(si, 1, 0);	/* drain something from the idle heap */
+
+	/* make sure at least one element is eligible, bumping V
+	 * and moving entries that have become eligible.
+	 * We need to repeat the first part twice, before and
+	 * after extracting the candidate, or enqueue() will
+	 * find the data structure in a wrong state.
+	 */
+	m = NULL;
+	for(;;) {
+		/*
+		 * Compute V = max(V, min(S_i)). Remember that all elements
+		 * in sch have by definition S_i <= V so if sch is not empty,
+		 * V is surely the max and we must not update it. Conversely,
+		 * if sch is empty we only need to look at neh.
+		 * We don't need to move the queues, as it will be done at the
+		 * next enqueue
+		 */
+		if (sch->elements == 0 && neh->elements > 0) {
+			si->V = MAX64(si->V, HEAP_TOP(neh)->key);
+		}
+		while (neh->elements > 0 &&
+			    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
+			q = HEAP_TOP(neh)->object;
+			alg_fq = (struct wf2qp_queue *)q;
+			heap_extract(neh, NULL);
+			heap_insert(sch, alg_fq->F, q);
+		}
+		if (m) /* pkt found in previous iteration */
+			break;
+		/* ok we have at least one eligible pkt */
+		q = HEAP_TOP(sch)->object;
+		alg_fq = (struct wf2qp_queue *)q;
+		m = dn_dequeue(q);
+		heap_extract(sch, NULL); /* Remove queue from heap. */
+		si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
+		alg_fq->S = alg_fq->F;	/* Update start time. */
+		if (q->mq.head == 0) {	/* not backlogged any more. */
+			heap_insert(&si->idle_heap, alg_fq->F, q);
+		} else {		/* Still backlogged.
*/ + /* Update F, store in neh or sch */ + uint64_t len = q->mq.head->m_pkthdr.len; + alg_fq->F += len * alg_fq->inv_w; + if (DN_KEY_LEQ(alg_fq->S, si->V)) { + heap_insert(sch, alg_fq->F, q); + } else { + heap_insert(neh, alg_fq->S, q); + } + } + } + return m; +} + +static int +wf2qp_new_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + int ofs = offsetof(struct wf2qp_queue, heap_pos); + + /* all heaps support extract from middle */ + if (heap_init(&si->idle_heap, 16, ofs) || + heap_init(&si->sch_heap, 16, ofs) || + heap_init(&si->ne_heap, 16, ofs)) { + heap_free(&si->ne_heap); + heap_free(&si->sch_heap); + heap_free(&si->idle_heap); + return ENOMEM; + } + return 0; +} + +static int +wf2qp_free_sched(struct dn_sch_inst *_si) +{ + struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1); + + heap_free(&si->sch_heap); + heap_free(&si->ne_heap); + heap_free(&si->idle_heap); + + return 0; +} + +static int +wf2qp_new_fsk(struct dn_fsk *fs) +{ + ipdn_bound_var(&fs->fs.par[0], 1, + 1, 100, "WF2Q+ weight"); + return 0; +} + +static int +wf2qp_new_queue(struct dn_queue *_q) +{ + struct wf2qp_queue *q = (struct wf2qp_queue *)_q; + + _q->ni.oid.subtype = DN_SCHED_WF2QP; + q->F = 0; /* not strictly necessary */ + q->S = q->F + 1; /* mark timestamp as invalid. */ + q->inv_w = ONE_FP / _q->fs->fs.par[0]; + if (_q->mq.head != NULL) { + wf2qp_enqueue(_q->_si, _q, _q->mq.head); + } + return 0; +} + +/* + * Called when the infrastructure removes a queue (e.g. flowset + * is reconfigured). Nothing to do if we did not 'own' the queue, + * otherwise remove it from the right heap and adjust the sum + * of weights. + */ +static int +wf2qp_free_queue(struct dn_queue *q) +{ + struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q; + struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1); + + if (alg_fq->S >= alg_fq->F + 1) + return 0; /* nothing to do, not in any heap */ + si->wsum -= q->fs->fs.par[0]; + if (si->wsum > 0) + si->inv_wsum = ONE_FP/si->wsum; + + /* extract from the heap. XXX TODO we may need to adjust V + * to make sure the invariants hold. + */ + if (q->mq.head == NULL) { + heap_extract(&si->idle_heap, q); + } else if (DN_KEY_LT(si->V, alg_fq->S)) { + heap_extract(&si->ne_heap, q); + } else { + heap_extract(&si->sch_heap, q); + } + return 0; +} + +/* + * WF2Q+ scheduler descriptor + * contains the type of the scheduler, the name, the size of the + * structures and function pointers. 
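+ *
+ * Illustrative behaviour (numbers are ours): with two backlogged
+ * flows of weight 1 and weight 3, wsum = 4 and V advances by
+ * len * inv_wsum for every packet served, so over time the
+ * weight-3 flow obtains 3/4 of the link capacity.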
+ */
+static struct dn_alg wf2qp_desc = {
+	_SI( .type = ) DN_SCHED_WF2QP,
+	_SI( .name = ) "WF2Q+",
+	_SI( .flags = ) DN_MULTIQUEUE,
+
+	/* we need extra space in the si and the queue */
+	_SI( .schk_datalen = ) 0,
+	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
+	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
+				sizeof(struct dn_queue),
+
+	_SI( .enqueue = ) wf2qp_enqueue,
+	_SI( .dequeue = ) wf2qp_dequeue,
+
+	_SI( .config = ) NULL,
+	_SI( .destroy = ) NULL,
+	_SI( .new_sched = ) wf2qp_new_sched,
+	_SI( .free_sched = ) wf2qp_free_sched,
+
+	_SI( .new_fsk = ) wf2qp_new_fsk,
+	_SI( .free_fsk = ) NULL,
+
+	_SI( .new_queue = ) wf2qp_new_queue,
+	_SI( .free_queue = ) wf2qp_free_queue,
+};
+
+
+DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
diff --git a/freebsd/sys/netinet/ipfw/ip_dn_glue.c b/freebsd/sys/netinet/ipfw/ip_dn_glue.c
new file mode 100644
index 00000000..302c4d29
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dn_glue.c
@@ -0,0 +1,847 @@
+#include
+
+/*-
+ * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * $FreeBSD$
+ *
+ * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include
+#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* FREEBSD7.2 ip_dummynet.h r191715*/
+
+struct dn_heap_entry7 {
+	int64_t key;	/* sorting key.
Topmost element is smallest one */ + void *object; /* object pointer */ +}; + +struct dn_heap7 { + int size; + int elements; + int offset; /* XXX if > 0 this is the offset of direct ptr to obj */ + struct dn_heap_entry7 *p; /* really an array of "size" entries */ +}; + +/* Common to 7.2 and 8 */ +struct dn_flow_set { + SLIST_ENTRY(dn_flow_set) next; /* linked list in a hash slot */ + + u_short fs_nr ; /* flow_set number */ + u_short flags_fs; +#define DNOLD_HAVE_FLOW_MASK 0x0001 +#define DNOLD_IS_RED 0x0002 +#define DNOLD_IS_GENTLE_RED 0x0004 +#define DNOLD_QSIZE_IS_BYTES 0x0008 /* queue size is measured in bytes */ +#define DNOLD_NOERROR 0x0010 /* do not report ENOBUFS on drops */ +#define DNOLD_HAS_PROFILE 0x0020 /* the pipe has a delay profile. */ +#define DNOLD_IS_PIPE 0x4000 +#define DNOLD_IS_QUEUE 0x8000 + + struct dn_pipe7 *pipe ; /* pointer to parent pipe */ + u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ + + int weight ; /* WFQ queue weight */ + int qsize ; /* queue size in slots or bytes */ + int plr ; /* pkt loss rate (2^31-1 means 100%) */ + + struct ipfw_flow_id flow_mask ; + + /* hash table of queues onto this flow_set */ + int rq_size ; /* number of slots */ + int rq_elements ; /* active elements */ + struct dn_flow_queue7 **rq; /* array of rq_size entries */ + + u_int32_t last_expired ; /* do not expire too frequently */ + int backlogged ; /* #active queues for this flowset */ + + /* RED parameters */ +#define SCALE_RED 16 +#define SCALE(x) ( (x) << SCALE_RED ) +#define SCALE_VAL(x) ( (x) >> SCALE_RED ) +#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; +SLIST_HEAD(dn_flow_set_head, dn_flow_set); + +#define DN_IS_PIPE 0x4000 +#define DN_IS_QUEUE 0x8000 +struct dn_flow_queue7 { + struct dn_flow_queue7 *next ; + struct ipfw_flow_id id ; + + struct mbuf *head, *tail ; /* queue of packets */ + u_int len ; + u_int len_bytes ; + + u_long numbytes; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + + int hash_slot ; /* debugging/diagnostic */ + + /* RED parameters */ + int avg ; /* average queue length est. (scaled) */ + int count ; /* arrivals since last RED drop */ + int random ; /* random value (scaled) */ + u_int32_t q_time; /* start of queue idle time */ + + /* WF2Q+ support */ + struct dn_flow_set *fs ; /* parent flow set */ + int heap_pos ; /* position (index) of struct in heap */ + int64_t sched_time ; /* current time when queue enters ready_heap */ + + int64_t S,F ; /* start time, finish time */ +}; + +struct dn_pipe7 { /* a pipe */ + SLIST_ENTRY(dn_pipe7) next; /* linked list in a hash slot */ + + int pipe_nr ; /* number */ + int bandwidth; /* really, bytes/tick. 
*/
+	int	delay ;			/* really, ticks */
+
+	struct mbuf *head, *tail ;	/* packets in delay line */
+
+	/* WF2Q+ */
+	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+	struct dn_heap7 idle_heap ;	/* random extract - key Start=Finish time */
+
+	int64_t V ;			/* virtual time */
+	int sum;			/* sum of weights of all active sessions */
+
+	int numbytes;
+
+	int64_t sched_time ;		/* time pipe was scheduled in ready_heap */
+
+	/*
+	 * When the tx clock comes from an interface (if_name[0] != '\0'), its name
+	 * is stored below, whereas the ifp is filled when the rule is configured.
+	 */
+	char if_name[IFNAMSIZ];
+	struct ifnet *ifp ;
+	int ready ; /* set if ifp != NULL and we got a signal from it */
+
+	struct dn_flow_set fs ;	/* used with fixed-rate flows */
+};
+SLIST_HEAD(dn_pipe_head7, dn_pipe7);
+
+
+/* FREEBSD8 ip_dummynet.h r196045 */
+struct dn_flow_queue8 {
+	struct dn_flow_queue8 *next ;
+	struct ipfw_flow_id id ;
+
+	struct mbuf *head, *tail ;	/* queue of packets */
+	u_int len ;
+	u_int len_bytes ;
+
+	uint64_t numbytes ;	/* credit for transmission (dynamic queues) */
+	int64_t extra_bits;	/* extra bits simulating unavailable channel */
+
+	u_int64_t tot_pkts ;	/* statistics counters */
+	u_int64_t tot_bytes ;
+	u_int32_t drops ;
+
+	int hash_slot ;		/* debugging/diagnostic */
+
+	/* RED parameters */
+	int avg ;		/* average queue length est. (scaled) */
+	int count ;		/* arrivals since last RED drop */
+	int random ;		/* random value (scaled) */
+	int64_t idle_time;	/* start of queue idle time */
+
+	/* WF2Q+ support */
+	struct dn_flow_set *fs ; /* parent flow set */
+	int heap_pos ;		/* position (index) of struct in heap */
+	int64_t sched_time ;	/* current time when queue enters ready_heap */
+
+	int64_t S,F ;		/* start time, finish time */
+};
+
+struct dn_pipe8 {	/* a pipe */
+	SLIST_ENTRY(dn_pipe8)	next;	/* linked list in a hash slot */
+
+	int	pipe_nr ;		/* number */
+	int	bandwidth;		/* really, bytes/tick. */
+	int	delay ;			/* really, ticks */
+
+	struct mbuf *head, *tail ;	/* packets in delay line */
+
+	/* WF2Q+ */
+	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
+	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
+	struct dn_heap7 idle_heap ;	/* random extract - key Start=Finish time */
+
+	int64_t V ;			/* virtual time */
+	int sum;			/* sum of weights of all active sessions */
+
+	/* Same as in dn_flow_queue, numbytes can become large */
+	int64_t numbytes;	/* bits I can transmit (more or less). */
+	uint64_t burst;		/* burst size, scaled: bits * hz */
+
+	int64_t sched_time ;	/* time pipe was scheduled in ready_heap */
+	int64_t idle_time;	/* start of pipe idle time */
+
+	char if_name[IFNAMSIZ];
+	struct ifnet *ifp ;
+	int ready ; /* set if ifp != NULL and we got a signal from it */
+
+	struct dn_flow_set fs ;	/* used with fixed-rate flows */
+
+	/* fields to simulate a delay profile */
+#define ED_MAX_NAME_LEN	32
+	char name[ED_MAX_NAME_LEN];
+	int loss_level;
+	int samples_no;
+	int *samples;
+};
+
+#define ED_MAX_SAMPLES_NO	1024
+struct dn_pipe_max8 {
+	struct dn_pipe8 pipe;
+	int samples[ED_MAX_SAMPLES_NO];
+};
+SLIST_HEAD(dn_pipe_head8, dn_pipe8);
+
+/*
+ * Changes from 7.2 to 8:
+ * dn_pipe:
+ *	numbytes from int to int64_t
+ *	add burst (int64_t)
+ *	add idle_time (int64_t)
+ *	add profile
+ *	add struct dn_pipe_max
+ *	add flag DN_HAS_PROFILE
+ *
+ * dn_flow_queue
+ *	numbytes from u_long to int64_t
+ *	add extra_bits (int64_t)
+ *	q_time from u_int32_t to int64_t and renamed to idle_time
+ *
+ * dn_flow_set unchanged
+ *
+ */
+
+/* NOTE:XXX copied from dummynet.c */
+#define O_NEXT(p, len) ((void *)((char *)p + len))
+static void
+oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
+{
+	oid->len = len;
+	oid->type = type;
+	oid->subtype = 0;
+	oid->id = id;
+}
+/* make room in the buffer and move the pointer forward */
+static void *
+o_next(struct dn_id **o, int len, int type)
+{
+	struct dn_id *ret = *o;
+	oid_fill(ret, len, type, 0);
+	*o = O_NEXT(*o, len);
+	return ret;
+}
+
+
+static size_t pipesize7 = sizeof(struct dn_pipe7);
+static size_t pipesize8 = sizeof(struct dn_pipe8);
+static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);
+
+/* Indicate the 'ipfw' version:
+ *  1: from FreeBSD 7.2
+ *  0: from FreeBSD 8
+ * -1: unknown (for now unused)
+ *
+ * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request
+ * arrives.
+ * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
+ * it is assumed to be the FreeBSD 8 version.
+ */
+static int is7 = 0;
+
+static int
+convertflags2new(int src)
+{
+	int dst = 0;
+
+	if (src & DNOLD_HAVE_FLOW_MASK)
+		dst |= DN_HAVE_MASK;
+	if (src & DNOLD_QSIZE_IS_BYTES)
+		dst |= DN_QSIZE_BYTES;
+	if (src & DNOLD_NOERROR)
+		dst |= DN_NOERROR;
+	if (src & DNOLD_IS_RED)
+		dst |= DN_IS_RED;
+	if (src & DNOLD_IS_GENTLE_RED)
+		dst |= DN_IS_GENTLE_RED;
+	if (src & DNOLD_HAS_PROFILE)
+		dst |= DN_HAS_PROFILE;
+
+	return dst;
+}
+
+static int
+convertflags2old(int src)
+{
+	int dst = 0;
+
+	if (src & DN_HAVE_MASK)
+		dst |= DNOLD_HAVE_FLOW_MASK;
+	if (src & DN_IS_RED)
+		dst |= DNOLD_IS_RED;
+	if (src & DN_IS_GENTLE_RED)
+		dst |= DNOLD_IS_GENTLE_RED;
+	if (src & DN_NOERROR)
+		dst |= DNOLD_NOERROR;
+	if (src & DN_HAS_PROFILE)
+		dst |= DNOLD_HAS_PROFILE;
+	if (src & DN_QSIZE_BYTES)
+		dst |= DNOLD_QSIZE_IS_BYTES;
+
+	return dst;
+}
+
+static int
+dn_compat_del(void *v)
+{
+	struct dn_pipe7 *p = (struct dn_pipe7 *) v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *) v;
+	struct {
+		struct dn_id oid;
+		uintptr_t a[1];	/* add more if we want a list */
+	} cmd;
+
+	/* XXX DN_API_VERSION ???
*/ + oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); + + if (is7) { + if (p->pipe_nr == 0 && p->fs.fs_nr == 0) + return EINVAL; + if (p->pipe_nr != 0 && p->fs.fs_nr != 0) + return EINVAL; + } else { + if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) + return EINVAL; + if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) + return EINVAL; + } + + if (p->pipe_nr != 0) { /* pipe x delete */ + cmd.a[0] = p->pipe_nr; + cmd.oid.subtype = DN_LINK; + } else { /* queue x delete */ + cmd.oid.subtype = DN_FS; + cmd.a[0] = (is7) ? p->fs.fs_nr : p8->fs.fs_nr; + } + + return do_config(&cmd, cmd.oid.len); +} + +static int +dn_compat_config_queue(struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + struct dn_flow_set *f; + + if (is7) + f = &p7->fs; + else + f = &p8->fs; + + fs->fs_nr = f->fs_nr; + fs->sched_nr = f->parent_nr; + fs->flow_mask = f->flow_mask; + fs->buckets = f->rq_size; + fs->qsize = f->qsize; + fs->plr = f->plr; + fs->par[0] = f->weight; + fs->flags = convertflags2new(f->flags_fs); + if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { + fs->w_q = f->w_q; + fs->max_th = f->max_th; + fs->min_th = f->min_th; + fs->max_p = f->max_p; + } + + return 0; +} + +static int +dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, + struct dn_fs *fs, void* v) +{ + struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + int i = p7->pipe_nr; + + sch->sched_nr = i; + sch->oid.subtype = 0; + p->link_nr = i; + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Common to 7 and 8 */ + p->bandwidth = p7->bandwidth; + p->delay = p7->delay; + if (!is7) { + /* FreeBSD 8 has burst */ + p->burst = p8->burst; + } + + /* fill the fifo flowset */ + dn_compat_config_queue(fs, v); + fs->fs_nr = i + 2*DN_MAX_ID; + fs->sched_nr = i + DN_MAX_ID; + + /* Move scheduler related parameter from fs to sch */ + sch->buckets = fs->buckets; /*XXX*/ + fs->buckets = 0; + if (fs->flags & DN_HAVE_MASK) { + sch->flags |= DN_HAVE_MASK; + fs->flags &= ~DN_HAVE_MASK; + sch->sched_mask = fs->flow_mask; + bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); + } + + return 0; +} + +static int +dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, + void *v) +{ + struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; + + p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); + + pf->link_nr = p->link_nr; + pf->loss_level = p8->loss_level; +// pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? 
+	pf->samples_no = p8->samples_no;
+	strncpy(pf->name, p8->name, sizeof(pf->name));
+	bcopy(p8->samples, pf->samples, sizeof(pf->samples));
+
+	return 0;
+}
+
+/*
+ * If p->pipe_nr != 0 the command is 'pipe x config', so we need to
+ * create the three main structs; otherwise only a flowset is created.
+ */
+static int
+dn_compat_configure(void *v)
+{
+	struct dn_id *buf = NULL, *base;
+	struct dn_sch *sch = NULL;
+	struct dn_link *p = NULL;
+	struct dn_fs *fs = NULL;
+	struct dn_profile *pf = NULL;
+	int lmax;
+	int error;
+
+	struct dn_pipe7 *p7 = (struct dn_pipe7 *)v;
+	struct dn_pipe8 *p8 = (struct dn_pipe8 *)v;
+
+	int i;	/* number of objects to configure */
+
+	lmax = sizeof(struct dn_id);	/* command header */
+	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
+		sizeof(struct dn_fs) + sizeof(struct dn_profile);
+
+	base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO);
+	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
+	base->id = DN_API_VERSION;
+
+	/* pipe_nr is the same in p7 and p8 */
+	i = p7->pipe_nr;
+	if (i != 0) { /* pipe config */
+		sch = o_next(&buf, sizeof(*sch), DN_SCH);
+		p = o_next(&buf, sizeof(*p), DN_LINK);
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+
+		error = dn_compat_config_pipe(sch, p, fs, v);
+		if (error) {
+			free(buf, M_DUMMYNET);
+			return error;
+		}
+		if (!is7 && p8->samples_no > 0) {
+			/* Add profiles */
+			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
+			error = dn_compat_config_profile(pf, p, v);
+			if (error) {
+				free(buf, M_DUMMYNET);
+				return error;
+			}
+		}
+	} else { /* queue config */
+		fs = o_next(&buf, sizeof(*fs), DN_FS);
+		error = dn_compat_config_queue(fs, v);
+		if (error) {
+			free(buf, M_DUMMYNET);
+			return error;
+		}
+	}
+	error = do_config(base, (char *)buf - (char *)base);
+
+	if (buf)
+		free(buf, M_DUMMYNET);
+	return error;
+}
+
+int
+dn_compat_calc_size(struct dn_parms dn_cfg)
+{
+	int need = 0;
+	/* XXX use FreeBSD 8 struct size */
+	/* NOTE:
+	 * - half scheduler:	schk_count/2
+	 * - all flowset:	fsk_count
+	 * - all flowset queues: queue_count
+	 * - all pipe queue:	si_count
+	 */
+	need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2;
+	need += dn_cfg.fsk_count * sizeof(struct dn_flow_set);
+	need += dn_cfg.si_count * sizeof(struct dn_flow_queue8);
+	need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8);
+
+	return need;
+}
+
+int
+dn_c_copy_q (void *_ni, void *arg)
+{
+	struct copy_args *a = arg;
+	struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start;
+	struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start;
+	struct dn_flow *ni = (struct dn_flow *)_ni;
+	int size = 0;
+
+	/* XXX hash slot not set */
+	/* No difference between 7.2/8 */
+	fq7->len = ni->length;
+	fq7->len_bytes = ni->len_bytes;
+	fq7->id = ni->fid;
+
+	if (is7) {
+		size = sizeof(struct dn_flow_queue7);
+		fq7->tot_pkts = ni->tot_pkts;
+		fq7->tot_bytes = ni->tot_bytes;
+		fq7->drops = ni->drops;
+	} else {
+		size = sizeof(struct dn_flow_queue8);
+		fq8->tot_pkts = ni->tot_pkts;
+		fq8->tot_bytes = ni->tot_bytes;
+		fq8->drops = ni->drops;
+	}
+
+	*a->start += size;
+	return 0;
+}
+
+int
+dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq)
+{
+	struct dn_link *l = &s->link;
+	struct dn_fsk *f = s->fs;
+
+	struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start;
+	struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start;
+	struct dn_flow_set *fs;
+	int size = 0;
+
+	if (is7) {
+		fs = &pipe7->fs;
+		size = sizeof(struct dn_pipe7);
+	} else {
+		fs = &pipe8->fs;
+		size = sizeof(struct dn_pipe8);
+	}
+
+	/* These 4 fields are the same in pipe7 and pipe8 */
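+	/* Note (illustrative, not in the original source): sle_next is
+	 * (ab)used as a type tag here. Old userland code tells pipes
+	 * and queues apart by comparing this field against DN_IS_PIPE
+	 * or DN_IS_QUEUE instead of following it as a pointer.
+	 */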
+	pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE;
+	pipe7->bandwidth = l->bandwidth;
+	pipe7->delay = l->delay;
+	pipe7->pipe_nr = l->link_nr - DN_MAX_ID;
+
+	if (!is7) {
+		if (s->profile) {
+			struct dn_profile *pf = s->profile;
+			strncpy(pipe8->name, pf->name, sizeof(pf->name));
+			pipe8->loss_level = pf->loss_level;
+			pipe8->samples_no = pf->samples_no;
+		}
+		pipe8->burst = div64(l->burst, 8 * hz);
+	}
+
+	fs->flow_mask = s->sch.sched_mask;
+	fs->rq_size = s->sch.buckets ? s->sch.buckets : 1;
+
+	fs->parent_nr = l->link_nr - DN_MAX_ID;
+	fs->qsize = f->fs.qsize;
+	fs->plr = f->fs.plr;
+	fs->w_q = f->fs.w_q;
+	fs->max_th = f->max_th;
+	fs->min_th = f->min_th;
+	fs->max_p = f->fs.max_p;
+	fs->rq_elements = nq;
+
+	fs->flags_fs = convertflags2old(f->fs.flags);
+
+	*a->start += size;
+	return 0;
+}
+
+
+int
+dn_compat_copy_pipe(struct copy_args *a, void *_o)
+{
+	int have = a->end - *a->start;
+	int need = 0;
+	int pipe_size = sizeof(struct dn_pipe8);
+	int queue_size = sizeof(struct dn_flow_queue8);
+	int n_queue = 0; /* number of queues */
+
+	struct dn_schk *s = (struct dn_schk *)_o;
+	/* calculate needed space:
+	 * - struct dn_pipe
+	 * - if there are instances, dn_queue * n_instances
+	 */
+	n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) :
+						(s->siht ? 1 : 0));
+	need = pipe_size + queue_size * n_queue;
+	if (have < need) {
+		D("have %d < need %d", have, need);
+		return 1;
+	}
+	/* copy pipe */
+	dn_c_copy_pipe(s, a, n_queue);
+
+	/* copy queues */
+	if (s->sch.flags & DN_HAVE_MASK)
+		dn_ht_scan(s->siht, dn_c_copy_q, a);
+	else if (s->siht)
+		dn_c_copy_q(s->siht, a);
+	return 0;
+}
+
+int
+dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq)
+{
+	struct dn_flow_set *fs = (struct dn_flow_set *)*a->start;
+
+	fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE;
+	fs->fs_nr = f->fs.fs_nr;
+	fs->qsize = f->fs.qsize;
+	fs->plr = f->fs.plr;
+	fs->w_q = f->fs.w_q;
+	fs->max_th = f->max_th;
+	fs->min_th = f->min_th;
+	fs->max_p = f->fs.max_p;
+	fs->flow_mask = f->fs.flow_mask;
+	fs->rq_elements = nq;
+	fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1);
+	fs->parent_nr = f->fs.sched_nr;
+	fs->weight = f->fs.par[0];
+
+	fs->flags_fs = convertflags2old(f->fs.flags);
+	*a->start += sizeof(struct dn_flow_set);
+	return 0;
+}
+
+int
+dn_compat_copy_queue(struct copy_args *a, void *_o)
+{
+	int have = a->end - *a->start;
+	int need = 0;
+	int fs_size = sizeof(struct dn_flow_set);
+	int queue_size = sizeof(struct dn_flow_queue8);
+
+	struct dn_fsk *fs = (struct dn_fsk *)_o;
+	int n_queue = 0; /* number of queues */
+
+	n_queue = (fs->fs.flags & DN_HAVE_MASK ? dn_ht_entries(fs->qht) :
+						(fs->qht ? 1 : 0));
+
+	need = fs_size + queue_size * n_queue;
+	if (have < need) {
+		D("have < need");
+		return 1;
+	}
+
+	/* copy flowset */
+	dn_c_copy_fs(fs, a, n_queue);
+
+	/* copy queues */
+	if (fs->fs.flags & DN_HAVE_MASK)
+		dn_ht_scan(fs->qht, dn_c_copy_q, a);
+	else if (fs->qht)
+		dn_c_copy_q(fs->qht, a);
+
+	return 0;
+}
+
+int
+copy_data_helper_compat(void *_o, void *_arg)
+{
+	struct copy_args *a = _arg;
+
+	if (a->type == DN_COMPAT_PIPE) {
+		struct dn_schk *s = _o;
+		if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) {
+			return 0;	/* not old type */
+		}
+		/* copy pipe parameters, and if an instance exists, copy
+		 * other parameters and possibly the queues.
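+		 *
+		 * The buffer layout mirrors the old binary format: one
+		 * struct dn_pipe7/8 immediately followed by n_queue
+		 * struct dn_flow_queue7/8 records (see
+		 * dn_compat_copy_pipe() above).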
+		 */
+		if (dn_compat_copy_pipe(a, _o))
+			return DNHT_SCAN_END;
+	} else if (a->type == DN_COMPAT_QUEUE) {
+		struct dn_fsk *fs = _o;
+		if (fs->fs.fs_nr >= DN_MAX_ID)
+			return 0;
+		if (dn_compat_copy_queue(a, _o))
+			return DNHT_SCAN_END;
+	}
+	return 0;
+}
+
+/* Main function to manage old requests */
+int
+ip_dummynet_compat(struct sockopt *sopt)
+{
+	int error = 0;
+	void *v = NULL;
+	struct dn_id oid;
+
+	/* Length of data, used to find the ipfw version... */
+	int len = sopt->sopt_valsize;
+
+	/* len can be 0 if command was dummynet_flush */
+	if (len == pipesize7) {
+		D("setting compatibility with FreeBSD 7.2");
+		is7 = 1;
+	}
+	else if (len == pipesize8 || len == pipesizemax8) {
+		D("setting compatibility with FreeBSD 8");
+		is7 = 0;
+	}
+
+	switch (sopt->sopt_name) {
+	default:
+		printf("dummynet: -- unknown option %d", sopt->sopt_name);
+		error = EINVAL;
+		break;
+
+	case IP_DUMMYNET_FLUSH:
+		oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
+		do_config(&oid, oid.len);
+		break;
+
+	case IP_DUMMYNET_DEL:
+		v = malloc(len, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, v, len, len);
+		if (error)
+			break;
+		error = dn_compat_del(v);
+		free(v, M_DUMMYNET);
+		break;
+
+	case IP_DUMMYNET_CONFIGURE:
+		v = malloc(len, M_TEMP, M_WAITOK);
+		error = sooptcopyin(sopt, v, len, len);
+		if (error)
+			break;
+		error = dn_compat_configure(v);
+		free(v, M_DUMMYNET);
+		break;
+
+	case IP_DUMMYNET_GET: {
+		void *buf;
+		int ret;
+		int original_size = sopt->sopt_valsize;
+		int size;
+
+		ret = dummynet_get(sopt, &buf);
+		if (ret)
+			return 0; //XXX ?
+		size = sopt->sopt_valsize;
+		sopt->sopt_valsize = original_size;
+		D("size=%d, buf=%p", size, buf);
+		ret = sooptcopyout(sopt, buf, size);
+		if (ret)
+			printf(" %s ERROR sooptcopyout\n", __FUNCTION__);
+		if (buf)
+			free(buf, M_DUMMYNET);
+	    }
+	}
+
+	return error;
+}
+
+
diff --git a/freebsd/sys/netinet/ipfw/ip_dn_io.c b/freebsd/sys/netinet/ipfw/ip_dn_io.c
new file mode 100644
index 00000000..7a2c46d4
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_dn_io.c
@@ -0,0 +1,796 @@
+#include
+
+/*-
+ * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Dummynet portions related to packet handling.
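+ *
+ * In short (overview added for clarity): ipfw hands a packet to
+ * dummynet, which tags it with a struct dn_pkt_tag (rule
+ * reference, direction, due time), runs it through the configured
+ * scheduler and delay line, and re-injects it into the stack once
+ * the emulated link has "transmitted" it.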
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include +#include /* ip_len, ip_off */ +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +#include /* various ether_* routines */ + +#include /* for ip6_input, ip6_output prototypes */ +#include + +/* + * We keep a private variable for the simulation time, but we could + * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) + * instead of dn_cfg.curr_time + */ + +struct dn_parms dn_cfg; + +static long tick_last; /* Last tick duration (usec). */ +static long tick_delta; /* Last vs standard tick diff (usec). */ +static long tick_delta_sum; /* Accumulated tick difference (usec).*/ +static long tick_adjustment; /* Tick adjustments done. */ +static long tick_lost; /* Lost(coalesced) ticks number. */ +/* Adjusted vs non-adjusted curr_time difference (ticks). */ +static long tick_diff; + +static unsigned long io_pkt; +static unsigned long io_pkt_fast; +static unsigned long io_pkt_drop; + +/* + * We use a heap to store entities for which we have pending timer events. + * The heap is checked at every tick and all entities with expired events + * are extracted. + */ + +MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); + +extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f4) + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); + +/* parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RW, &dn_cfg.hash_size, 0, "Default hash table size"); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit, + CTLFLAG_RW, &dn_cfg.slot_limit, 0, + "Upper limit in slots for pipe queue."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit, + CTLFLAG_RW, &dn_cfg.byte_limit, 0, + "Upper limit in bytes for pipe queue."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast, + CTLFLAG_RW, &dn_cfg.io_fast, 0, "Enable fast dummynet io."); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, + CTLFLAG_RW, &dn_cfg.debug, 0, "Dummynet debug level"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, + CTLFLAG_RW, &dn_cfg.expire, 0, "Expire empty queues/pipes"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle, + CTLFLAG_RD, &dn_cfg.expire_cycle, 0, "Expire cycle for queues/pipes"); + +/* RED parameters */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, + CTLFLAG_RD, &dn_cfg.red_lookup_depth, 0, "Depth of RED lookup table"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, + CTLFLAG_RD, &dn_cfg.red_avg_pkt_size, 0, "RED Medium packet size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, + CTLFLAG_RD, &dn_cfg.red_max_pkt_size, 0, "RED Max packet size"); + +/* time adjustment */ +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta, + CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum, + CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment, + CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff, + CTLFLAG_RD, &tick_diff, 0, + "Adjusted vs non-adjusted 
curr_time difference (ticks)."); +SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost, + CTLFLAG_RD, &tick_lost, 0, + "Number of ticks coalesced by dummynet taskqueue."); + +/* statistics */ +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count, + CTLFLAG_RD, &dn_cfg.schk_count, 0, "Number of schedulers"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count, + CTLFLAG_RD, &dn_cfg.si_count, 0, "Number of scheduler instances"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count, + CTLFLAG_RD, &dn_cfg.fsk_count, 0, "Number of flowsets"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count, + CTLFLAG_RD, &dn_cfg.queue_count, 0, "Number of queues"); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt, + CTLFLAG_RD, &io_pkt, 0, + "Number of packets passed to dummynet."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast, + CTLFLAG_RD, &io_pkt_fast, 0, + "Number of packets bypassed dummynet scheduler."); +SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop, + CTLFLAG_RD, &io_pkt_drop, 0, + "Number of packets dropped by dummynet."); + +SYSEND + +#endif + +static void dummynet_send(struct mbuf *); + +/* + * Packets processed by dummynet have an mbuf tag associated with + * them that carries their dummynet state. + * Outside dummynet, only the 'rule' field is relevant, and it must + * be at the beginning of the structure. + */ +struct dn_pkt_tag { + struct ipfw_rule_ref rule; /* matching rule */ + + /* second part, dummynet specific */ + int dn_dir; /* action when packet comes out.*/ + /* see ip_fw_private.h */ + uint64_t output_time; /* when the pkt is due for delivery*/ + struct ifnet *ifp; /* interface, for ip_output */ + struct _ip6dn_args ip6opt; /* XXX ipv6 options */ +}; + +/* + * Return the mbuf tag holding the dummynet state (it should + * be the first one on the list). + */ +static struct dn_pkt_tag * +dn_tag_get(struct mbuf *m) +{ + struct m_tag *mtag = m_tag_first(m); + KASSERT(mtag != NULL && + mtag->m_tag_cookie == MTAG_ABI_COMPAT && + mtag->m_tag_id == PACKET_TAG_DUMMYNET, + ("packet on dummynet queue w/o dummynet tag!")); + return (struct dn_pkt_tag *)(mtag+1); +} + +static inline void +mq_append(struct mq *q, struct mbuf *m) +{ + if (q->head == NULL) + q->head = m; + else + q->tail->m_nextpkt = m; + q->tail = m; + m->m_nextpkt = NULL; +} + +/* + * Dispose a list of packet. Use a functions so if we need to do + * more work, this is a central point to do it. + */ +void dn_free_pkts(struct mbuf *mnext) +{ + struct mbuf *m; + + while ((m = mnext) != NULL) { + mnext = m->m_nextpkt; + FREE_PKT(m); + } +} + +static int +red_drops (struct dn_queue *q, int len) +{ + /* + * RED algorithm + * + * RED calculates the average queue size (avg) using a low-pass filter + * with an exponential weighted (w_q) moving average: + * avg <- (1-w_q) * avg + w_q * q_size + * where q_size is the queue length (measured in bytes or * packets). + * + * If q_size == 0, we compute the idle time for the link, and set + * avg = (1 - w_q)^(idle/s) + * where s is the time needed for transmitting a medium-sized packet. + * + * Now, if avg < min_th the packet is enqueued. + * If avg > max_th the packet is dropped. Otherwise, the packet is + * dropped with probability P function of avg. + */ + + struct dn_fsk *fs = q->fs; + int64_t p_b = 0; + + /* Queue in bytes or packets? */ + uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ? + q->ni.len_bytes : q->ni.length; + + /* Average queue size estimation. 
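+ * The update below is the fixed point form of
+ * avg += w_q * (q_size - avg), i.e. the low-pass filter described
+ * above. As a worked example (numbers made up): with w_q = 0.002 and
+ * a steady queue of 50 slots, avg converges towards SCALE(50) with a
+ * time constant of roughly 1/w_q = 500 packet arrivals.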
*/ + if (q_size != 0) { + /* Queue is not empty, avg <- avg + (q_size - avg) * w_q */ + int diff = SCALE(q_size) - q->avg; + int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q); + + q->avg += (int)v; + } else { + /* + * Queue is empty, find for how long the queue has been + * empty and use a lookup table for computing + * (1 - * w_q)^(idle_time/s) where s is the time to send a + * (small) packet. + * XXX check wraps... + */ + if (q->avg) { + u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step); + + q->avg = (t < fs->lookup_depth) ? + SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; + } + } + + /* Should i drop? */ + if (q->avg < fs->min_th) { + q->count = -1; + return (0); /* accept packet */ + } + if (q->avg >= fs->max_th) { /* average queue >= max threshold */ + if (fs->fs.flags & DN_IS_GENTLE_RED) { + /* + * According to Gentle-RED, if avg is greater than + * max_th the packet is dropped with a probability + * p_b = c_3 * avg - c_4 + * where c_3 = (1 - max_p) / max_th + * c_4 = 1 - 2 * max_p + */ + p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - + fs->c_4; + } else { + q->count = -1; + return (1); + } + } else if (q->avg > fs->min_th) { + /* + * We compute p_b using the linear dropping function + * p_b = c_1 * avg - c_2 + * where c_1 = max_p / (max_th - min_th) + * c_2 = max_p * min_th / (max_th - min_th) + */ + p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2; + } + + if (fs->fs.flags & DN_QSIZE_BYTES) + p_b = div64((p_b * len) , fs->max_pkt_size); + if (++q->count == 0) + q->random = random() & 0xffff; + else { + /* + * q->count counts packets arrived since last drop, so a greater + * value of q->count means a greater packet drop probability. + */ + if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) { + q->count = 0; + /* After a drop we calculate a new random value. */ + q->random = random() & 0xffff; + return (1); /* drop */ + } + } + /* End of RED algorithm. */ + + return (0); /* accept */ + +} + +/* + * Enqueue a packet in q, subject to space and queue management policy + * (whose parameters are in q->fs). + * Update stats for the queue and the scheduler. + * Return 0 on success, 1 on drop. The packet is consumed anyways. + */ +int +dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop) +{ + struct dn_fs *f; + struct dn_flow *ni; /* stats for scheduler instance */ + uint64_t len; + + if (q->fs == NULL || q->_si == NULL) { + printf("%s fs %p si %p, dropping\n", + __FUNCTION__, q->fs, q->_si); + FREE_PKT(m); + return 1; + } + f = &(q->fs->fs); + ni = &q->_si->ni; + len = m->m_pkthdr.len; + /* Update statistics, then check reasons to drop pkt. */ + q->ni.tot_bytes += len; + q->ni.tot_pkts++; + ni->tot_bytes += len; + ni->tot_pkts++; + if (drop) + goto drop; + if (f->plr && random() < f->plr) + goto drop; + if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) + goto drop; + if (f->flags & DN_QSIZE_BYTES) { + if (q->ni.len_bytes > f->qsize) + goto drop; + } else if (q->ni.length >= f->qsize) { + goto drop; + } + mq_append(&q->mq, m); + q->ni.length++; + q->ni.len_bytes += len; + ni->length++; + ni->len_bytes += len; + return 0; + +drop: + io_pkt_drop++; + q->ni.drops++; + ni->drops++; + FREE_PKT(m); + return 1; +} + +/* + * Fetch packets from the delay line which are due now. If there are + * leftover packets, reinsert the delay line in the heap. + * Runs under scheduler lock. 
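+ * The scan below assumes packets in the delay line are in roughly
+ * non-decreasing output_time order; it stops at the first packet
+ * that is not yet due and reinserts the delay line in the heap
+ * keyed on that packet's output_time.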
+ */ +static void +transmit_event(struct mq *q, struct delay_line *dline, uint64_t now) +{ + struct mbuf *m; + struct dn_pkt_tag *pkt = NULL; + + dline->oid.subtype = 0; /* not in heap */ + while ((m = dline->mq.head) != NULL) { + pkt = dn_tag_get(m); + if (!DN_KEY_LEQ(pkt->output_time, now)) + break; + dline->mq.head = m->m_nextpkt; + mq_append(q, m); + } + if (m != NULL) { + dline->oid.subtype = 1; /* in heap */ + heap_insert(&dn_cfg.evheap, pkt->output_time, dline); + } +} + +/* + * Convert the additional MAC overheads/delays into an equivalent + * number of bits for the given data rate. The samples are + * in milliseconds so we need to divide by 1000. + */ +static uint64_t +extra_bits(struct mbuf *m, struct dn_schk *s) +{ + int index; + uint64_t bits; + struct dn_profile *pf = s->profile; + + if (!pf || pf->samples_no == 0) + return 0; + index = random() % pf->samples_no; + bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000); + if (index >= pf->loss_level) { + struct dn_pkt_tag *dt = dn_tag_get(m); + if (dt) + dt->dn_dir = DIR_DROP; + } + return bits; +} + +/* + * Send traffic from a scheduler instance due by 'now'. + * Return a pointer to the head of the queue. + */ +static struct mbuf * +serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) +{ + struct mq def_q; + struct dn_schk *s = si->sched; + struct mbuf *m = NULL; + int delay_line_idle = (si->dline.mq.head == NULL); + int done, bw; + + if (q == NULL) { + q = &def_q; + q->head = NULL; + } + + bw = s->link.bandwidth; + si->kflags &= ~DN_ACTIVE; + + if (bw > 0) + si->credit += (now - si->sched_time) * bw; + else + si->credit = 0; + si->sched_time = now; + done = 0; + while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { + uint64_t len_scaled; + done++; + len_scaled = (bw == 0) ? 0 : hz * + (m->m_pkthdr.len * 8 + extra_bits(m, s)); + si->credit -= len_scaled; + /* Move packet in the delay line */ + dn_tag_get(m)->output_time += s->link.delay; + mq_append(&si->dline.mq, m); + } + /* + * If credit >= 0 the instance is idle, mark time. + * Otherwise put back in the heap, and adjust the output + * time of the last inserted packet, m, which was too early. + */ + if (si->credit >= 0) { + si->idle_time = now; + } else { + uint64_t t; + KASSERT(bw > 0, ("bw=0 and credit<0 ?")); + t = div64(bw - 1 - si->credit, bw); + if (m) + dn_tag_get(m)->output_time += t; + si->kflags |= DN_ACTIVE; + heap_insert(&dn_cfg.evheap, now + t, si); + } + if (delay_line_idle && done) + transmit_event(q, &si->dline, now); + return q->head; +} + +/* + * The timer handler for dummynet. Time is computed in ticks, but + * the code is tolerant to the actual rate at which this is called. + * Once complete, the function reschedules itself for the next tick. + */ +void +dummynet_task(void *context, int pending) +{ + struct timeval t; + struct mq q = { NULL, NULL }; /* queue to accumulate results */ + + DN_BH_WLOCK(); + + /* Update number of lost (coalesced) ticks. */ + tick_lost += pending - 1; + + getmicrouptime(&t); + /* Last tick duration (usec). */ + tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + + (t.tv_usec - dn_cfg.prev_t.tv_usec); + /* Last tick vs standard tick difference (usec). */ + tick_delta = (tick_last * hz - 1000000) / hz; + /* Accumulated tick difference (usec). */ + tick_delta_sum += tick_delta; + + dn_cfg.prev_t = t; + + /* + * Adjust curr_time if the accumulated tick difference is + * greater than the 'standard' tick.
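+ * (Worked example, numbers made up: with hz = 1000 the standard tick
+ * is 1000 us; if ticks keep firing about 25 us late, tick_delta_sum
+ * grows by 25 each time and after 40 ticks curr_time receives one
+ * extra increment, keeping simulation time close to wall clock.)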
Since curr_time should + * be monotonically increasing, we do positive adjustments + * as required, and throttle curr_time in case of negative + * adjustment. + */ + dn_cfg.curr_time++; + if (tick_delta_sum - tick >= 0) { + int diff = tick_delta_sum / tick; + + dn_cfg.curr_time += diff; + tick_diff += diff; + tick_delta_sum %= tick; + tick_adjustment++; + } else if (tick_delta_sum + tick <= 0) { + dn_cfg.curr_time--; + tick_diff--; + tick_delta_sum += tick; + tick_adjustment++; + } + + /* serve pending events, accumulate in q */ + for (;;) { + struct dn_id *p; /* generic parameter to handler */ + + if (dn_cfg.evheap.elements == 0 || + DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) + break; + p = HEAP_TOP(&dn_cfg.evheap)->object; + heap_extract(&dn_cfg.evheap, NULL); + + if (p->type == DN_SCH_I) { + serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); + } else { /* extracted a delay line */ + transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); + } + } + if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { + dn_cfg.expire_cycle = 0; + dn_drain_scheduler(); + dn_drain_queue(); + } + + DN_BH_WUNLOCK(); + dn_reschedule(); + if (q.head != NULL) + dummynet_send(q.head); +} + +/* + * forward a chain of packets to the proper destination. + * This runs outside the dummynet lock. + */ +static void +dummynet_send(struct mbuf *m) +{ + struct mbuf *n; + + for (; m != NULL; m = n) { + struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ + struct m_tag *tag; + int dst; + + n = m->m_nextpkt; + m->m_nextpkt = NULL; + tag = m_tag_first(m); + if (tag == NULL) { /* should not happen */ + dst = DIR_DROP; + } else { + struct dn_pkt_tag *pkt = dn_tag_get(m); + /* extract the dummynet info, rename the tag + * to carry reinject info. + */ + dst = pkt->dn_dir; + ifp = pkt->ifp; + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + } + + switch (dst) { + case DIR_OUT: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); + break ; + + case DIR_IN : + /* put header in network format for ip_input() */ + //SET_NET_IPLEN(mtod(m, struct ip *)); + netisr_dispatch(NETISR_IP, m); + break; + +#ifdef INET6 + case DIR_IN | PROTO_IPV6: + netisr_dispatch(NETISR_IPV6, m); + break; + + case DIR_OUT | PROTO_IPV6: + SET_HOST_IPLEN(mtod(m, struct ip *)); + ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); + break; +#endif + + case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ + if (bridge_dn_p != NULL) + ((*bridge_dn_p)(m, ifp)); + else + printf("dummynet: if_bridge not loaded\n"); + + break; + + case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ + /* + * The Ethernet code assumes the Ethernet header is + * contiguous in the first mbuf header. + * Insure this is true. + */ + if (m->m_len < ETHER_HDR_LEN && + (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { + printf("dummynet/ether: pullup failed, " + "dropping packet\n"); + break; + } + ether_demux(m->m_pkthdr.rcvif, m); + break; + + case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ + ether_output_frame(ifp, m); + break; + + case DIR_DROP: + /* drop the packet after some time */ + FREE_PKT(m); + break; + + default: + printf("dummynet: bad switch %d!\n", dst); + FREE_PKT(m); + break; + } + } +} + +static inline int +tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) +{ + struct dn_pkt_tag *dt; + struct m_tag *mtag; + + mtag = m_tag_get(PACKET_TAG_DUMMYNET, + sizeof(*dt), M_NOWAIT | M_ZERO); + if (mtag == NULL) + return 1; /* Cannot allocate packet header. 
*/ + m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ + dt = (struct dn_pkt_tag *)(mtag + 1); + dt->rule = fwa->rule; + dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ + dt->dn_dir = dir; + dt->ifp = fwa->oif; + /* dt->output_time is updated as we move through */ + dt->output_time = dn_cfg.curr_time; + return 0; +} + + +/* + * dummynet hook for packets. + * We use the argument to locate the flowset fs and the sched_set sch + * associated with it. Then we apply flow_mask and sched_mask to + * determine the queue and scheduler instances. + * + * dir where shall we send the packet after dummynet. + * *m0 the mbuf with the packet + * ifp the 'ifp' parameter from the caller. + * NULL in ip_input, destination interface in ip_output, + */ +int +dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) +{ + struct mbuf *m = *m0; + struct dn_fsk *fs = NULL; + struct dn_sch_inst *si; + struct dn_queue *q = NULL; /* default */ + + int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); + DN_BH_WLOCK(); + io_pkt++; + /* we could actually tag outside the lock, but who cares... */ + if (tag_mbuf(m, dir, fwa)) + goto dropit; + if (dn_cfg.busy) { + /* if the upper half is busy doing something expensive, + * let's queue the packet and move forward + */ + mq_append(&dn_cfg.pending, m); + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + /* XXX locate_flowset could be optimised with a direct ref. */ + fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); + if (fs == NULL) + goto dropit; /* This queue/pipe does not exist! */ + if (fs->sched == NULL) /* should not happen */ + goto dropit; + /* find scheduler instance, possibly applying sched_mask */ + si = ipdn_si_find(fs->sched, &(fwa->f_id)); + if (si == NULL) + goto dropit; + /* + * If the scheduler supports multiple queues, find the right one + * (otherwise it will be ignored by enqueue). + */ + if (fs->sched->fp->flags & DN_MULTIQUEUE) { + q = ipdn_q_find(fs, si, &(fwa->f_id)); + if (q == NULL) + goto dropit; + } + if (fs->sched->fp->enqueue(si, q, m)) { + /* packet was dropped by enqueue() */ + m = *m0 = NULL; + goto dropit; + } + + if (si->kflags & DN_ACTIVE) { + m = *m0 = NULL; /* consumed */ + goto done; /* already active, nothing to do */ + } + + /* compute the initial allowance */ + if (si->idle_time < dn_cfg.curr_time) { + /* Do this only on the first packet on an idle pipe */ + struct dn_link *p = &fs->sched->link; + + si->sched_time = dn_cfg.curr_time; + si->credit = dn_cfg.io_fast ? p->bandwidth : 0; + if (p->burst) { + uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; + if (burst > p->burst) + burst = p->burst; + si->credit += burst; + } + } + /* pass through scheduler and delay line */ + m = serve_sched(NULL, si, dn_cfg.curr_time); + + /* optimization -- pass it back to ipfw for immediate send */ + /* XXX Don't call dummynet_send() if the scheduler returns the packet + * just enqueued. This avoids a lock order reversal. + * + */ + if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0) { + /* fast io, rename the tag + * to carry reinject info.
*/ + struct m_tag *tag = m_tag_first(m); + + tag->m_tag_cookie = MTAG_IPFW_RULE; + tag->m_tag_id = 0; + io_pkt_fast++; + if (m->m_nextpkt != NULL) { + printf("dummynet: fast io: pkt chain detected!\n"); + m->m_nextpkt = NULL; + } + m = NULL; + } else { + *m0 = NULL; + } +done: + DN_BH_WUNLOCK(); + if (m) + dummynet_send(m); + return 0; + +dropit: + io_pkt_drop++; + DN_BH_WUNLOCK(); + if (m) + FREE_PKT(m); + *m0 = NULL; + return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; +} diff --git a/freebsd/sys/netinet/ipfw/ip_dn_private.h b/freebsd/sys/netinet/ipfw/ip_dn_private.h new file mode 100644 index 00000000..270f1881 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_dn_private.h @@ -0,0 +1,402 @@ +/*- + * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * internal dummynet APIs. + * + * $FreeBSD$ + */ + +#ifndef _IP_DN_PRIVATE_H +#define _IP_DN_PRIVATE_H + +/* debugging support + * use ND() to remove debugging, D() to print a line, + * DX(level, ...) to print above a certain level + * If you redefine D() you are expected to redefine all. + */ +#ifndef D +#define ND(fmt, ...) do {} while (0) +#define D1(fmt, ...) do {} while (0) +#define D(fmt, ...) printf("%-10s " fmt "\n", \ + __FUNCTION__, ## __VA_ARGS__) +#define DX(lev, fmt, ...) 
do { \ + if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0) +#endif + +MALLOC_DECLARE(M_DUMMYNET); + +#ifndef FREE_PKT +#define FREE_PKT(m) m_freem(m) +#endif + +#ifndef __linux__ +#define div64(a, b) ((int64_t)(a) / (int64_t)(b)) +#endif + +#define DN_LOCK_INIT() do { \ + mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF); \ + mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF); \ + } while (0) +#define DN_LOCK_DESTROY() do { \ + mtx_destroy(&dn_cfg.uh_mtx); \ + mtx_destroy(&dn_cfg.bh_mtx); \ + } while (0) +#if 0 /* not used yet */ +#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) +#endif + +#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx) +#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx) +#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED) + +SLIST_HEAD(dn_schk_head, dn_schk); +SLIST_HEAD(dn_sch_inst_head, dn_sch_inst); +SLIST_HEAD(dn_fsk_head, dn_fsk); +SLIST_HEAD(dn_queue_head, dn_queue); +SLIST_HEAD(dn_alg_head, dn_alg); + +struct mq { /* a basic queue of packets*/ + struct mbuf *head, *tail; +}; + +static inline void +set_oid(struct dn_id *o, int type, int len) +{ + o->type = type; + o->len = len; + o->subtype = 0; +}; + +/* + * configuration and global data for a dummynet instance + * + * When a configuration is modified from userland, 'id' is incremented + * so we can use the value to check for stale pointers. + */ +struct dn_parms { + uint32_t id; /* configuration version */ + + /* defaults (sysctl-accessible) */ + int red_lookup_depth; + int red_avg_pkt_size; + int red_max_pkt_size; + int hash_size; + int max_hash_size; + long byte_limit; /* max queue sizes */ + long slot_limit; + + int io_fast; + int debug; + + /* timekeeping */ + struct timeval prev_t; /* last time dummynet_tick ran */ + struct dn_heap evheap; /* scheduled events */ + + /* counters of objects -- used for reporting space */ + int schk_count; + int si_count; + int fsk_count; + int queue_count; + + /* ticks and other stuff */ + uint64_t curr_time; + /* flowsets and schedulers are in hash tables, with 'hash_size' + * buckets. fshash is looked up at every packet arrival + * so better be generous if we expect many entries. + */ + struct dn_ht *fshash; + struct dn_ht *schedhash; + /* list of flowsets without a scheduler -- use sch_chain */ + struct dn_fsk_head fsu; /* list of unlinked flowsets */ + struct dn_alg_head schedlist; /* list of algorithms */ + + /* Store the fs/sch to scan when draining. The value is the + * bucket number of the hash table. Expire can be disabled + * with net.inet.ip.dummynet.expire=0, or it happens every + * expire ticks. + **/ + int drain_fs; + int drain_sch; + uint32_t expire; + uint32_t expire_cycle; /* tick count */ + + /* if the upper half is busy doing something long, + * can set the busy flag and we will enqueue packets in + * a queue for later processing. + */ + int busy; + struct mq pending; + +#ifdef _KERNEL + /* + * This file is normally used in the kernel, unless we do + * some userland tests, in which case we do not need a mtx. + * uh_mtx arbitrates between system calls and also + * protects fshash, schedhash and fsunlinked. + * These structures are readonly for the lower half. 
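+ * (Note that, as defined above, the DN_BH_* macros currently map to
+ * uh_mtx as well, so a configuration handler simply brackets its
+ * work as in: DN_BH_WLOCK(); ...update fshash/schedhash...;
+ * DN_BH_WUNLOCK();)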
+ * bh_mtx protects all other structures which may be + * modified upon packet arrivals + */ +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t uh_mtx; + spinlock_t bh_mtx; +#else + struct mtx uh_mtx; + struct mtx bh_mtx; +#endif + +#endif /* _KERNEL */ +}; + +/* + * Delay line, contains all packets on output from a link. + * Every scheduler instance has one. + */ +struct delay_line { + struct dn_id oid; + struct dn_sch_inst *si; + struct mq mq; +}; + +/* + * The kernel side of a flowset. It is linked in a hash table + * of flowsets, and in a list of children of their parent scheduler. + * qht is either the queue or (if HAVE_MASK) a hash table queues. + * Note that the mask to use is the (flow_mask|sched_mask), which + * changes as we attach/detach schedulers. So we store it here. + * + * XXX If we want to add scheduler-specific parameters, we need to + * put them in external storage because the scheduler may not be + * available when the fsk is created. + */ +struct dn_fsk { /* kernel side of a flowset */ + struct dn_fs fs; + SLIST_ENTRY(dn_fsk) fsk_next; /* hash chain for fshash */ + + struct ipfw_flow_id fsk_mask; + + /* qht is a hash table of queues, or just a single queue + * a bit in fs.flags tells us which one + */ + struct dn_ht *qht; + struct dn_schk *sched; /* Sched we are linked to */ + SLIST_ENTRY(dn_fsk) sch_chain; /* list of fsk attached to sched */ + + /* bucket index used by drain routine to drain queues for this + * flowset + */ + int drain_bucket; + /* Parameter realted to RED / GRED */ + /* original values are in dn_fs*/ + int w_q ; /* queue weight (scaled) */ + int max_th ; /* maximum threshold for queue (scaled) */ + int min_th ; /* minimum threshold for queue (scaled) */ + int max_p ; /* maximum value for p_b (scaled) */ + + u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ + u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ + u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ + u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ + u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ + u_int lookup_depth ; /* depth of lookup table */ + int lookup_step ; /* granularity inside the lookup table */ + int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ + int avg_pkt_size ; /* medium packet size */ + int max_pkt_size ; /* max packet size */ +}; + +/* + * A queue is created as a child of a flowset unless it belongs to + * a !MULTIQUEUE scheduler. It is normally in a hash table in the + * flowset. fs always points to the parent flowset. + * si normally points to the sch_inst, unless the flowset has been + * detached from the scheduler -- in this case si == NULL and we + * should not enqueue. + */ +struct dn_queue { + struct dn_flow ni; /* oid, flow_id, stats */ + struct mq mq; /* packets queue */ + struct dn_sch_inst *_si; /* owner scheduler instance */ + SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */ + struct dn_fsk *fs; /* parent flowset. */ + + /* RED parameters */ + int avg; /* average queue length est. (scaled) */ + int count; /* arrivals since last RED drop */ + int random; /* random value (scaled) */ + uint64_t q_time; /* start of queue idle time */ + +}; + +/* + * The kernel side of a scheduler. Contains the userland config, + * a link, pointer to extra config arguments from command line, + * kernel flags, and a pointer to the scheduler methods. + * It is stored in a hash table, and holds a list of all + * flowsets and scheduler instances. + * XXX sch must be at the beginning, see schk_hash(). 
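+ * (Because sch sits at offset 0, a struct dn_sch * passed as the
+ * lookup key and a struct dn_schk * object have the same address
+ * for sch.sched_nr, so schk_hash() and schk_match() can cast and
+ * dereference either one with the same code.)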
+ */ +struct dn_schk { + struct dn_sch sch; + struct dn_alg *fp; /* Pointer to scheduler functions */ + struct dn_link link; /* The link, embedded */ + struct dn_profile *profile; /* delay profile, if any */ + struct dn_id *cfg; /* extra config arguments */ + + SLIST_ENTRY(dn_schk) schk_next; /* hash chain for schedhash */ + + struct dn_fsk_head fsk_list; /* all fsk linked to me */ + struct dn_fsk *fs; /* Flowset for !MULTIQUEUE */ + + /* bucket index used by the drain routine to drain the scheduler + * instances for this scheduler. + */ + int drain_bucket; + + /* Hash table of all instances (through sch.sched_mask) + * or single instance if no mask. Always valid. + */ + struct dn_ht *siht; +}; + + +/* + * Scheduler instance. + * Contains variables and all queues relative to this instance. + * This struct is created at runtime. + */ +struct dn_sch_inst { + struct dn_flow ni; /* oid, flowid and stats */ + SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */ + struct delay_line dline; + struct dn_schk *sched; /* the template */ + int kflags; /* DN_ACTIVE */ + + int64_t credit; /* bits I can transmit (more or less). */ + uint64_t sched_time; /* time link was scheduled in ready_heap */ + uint64_t idle_time; /* start of scheduler instance idle time */ + + /* q_count is the number of queues that this instance is using. + * The counter is incremented or decremented when + * a reference from the queue is created or deleted. + * It is used to make sure that a scheduler instance can be safely + * deleted by the drain routine. See notes below. + */ + int q_count; + +}; + +/* + * NOTE about object drain. + * The system will automatically (XXX check when) drain queues and + * scheduler instances when they are idle. + * A queue is idle when it has no packets; an instance is idle when + * it is not in the evheap, and the corresponding delay line is empty. + * A queue can be safely deleted when it is idle because the scheduler + * function xxx_free_queue() will remove any references to it. + * An instance can only be deleted when no queues reference it. To be sure + * of that, a counter (q_count) stores the number of queues that are pointing + * to the instance. + * + * XXX + * Order of scan: + * - take all flowsets in a bucket for the flowset hash table + * - take all queues in a bucket for the flowset + * - increment the queue bucket + * - scan next flowset bucket + * Nothing is done if a bucket contains no entries. + * + * The same scheme is used for scheduler instances. + */ + + +/* kernel-side flags. Linux has DN_DELETE in fcntl.h + */ +enum { + /* 1 and 2 are reserved for the SCAN flags */ + DN_DESTROY = 0x0004, /* destroy */ + DN_DELETE_FS = 0x0008, /* destroy flowset */ + DN_DETACH = 0x0010, + DN_ACTIVE = 0x0020, /* object is in evheap */ + DN_F_DLINE = 0x0040, /* object is a delay line */ + DN_F_SCHI = 0x00C0, /* object is a sched.instance */ + DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ +}; + +extern struct dn_parms dn_cfg; + +int dummynet_io(struct mbuf **, int, struct ip_fw_args *); +void dummynet_task(void *context, int pending); +void dn_reschedule(void); + +struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, + struct ipfw_flow_id *); +struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); + +/* + * copy_range is a template for requests for ranges of pipes/queues/scheds. + * The number of ranges is variable and can be derived from o.len.
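+ * For example, a request whose o.len equals
+ * sizeof(struct dn_id) + 4 * sizeof(uint32_t) carries two inclusive
+ * [lo, hi] ranges of pipe/queue numbers.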
+ * As a default, we use a small number of entries so that the struct + * fits easily on the stack and is sufficient for most common requests. + */ +#define DEFAULT_RANGES 5 +struct copy_range { + struct dn_id o; + uint32_t r[ 2 * DEFAULT_RANGES ]; +}; + +struct copy_args { + char **start; + char *end; + int flags; + int type; + struct copy_range *extra; /* extra filtering */ +}; + +struct sockopt; +int ip_dummynet_compat(struct sockopt *sopt); +int dummynet_get(struct sockopt *sopt, void **compat); +int dn_c_copy_q (void *_ni, void *arg); +int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); +int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); +int dn_compat_copy_queue(struct copy_args *a, void *_o); +int dn_compat_copy_pipe(struct copy_args *a, void *_o); +int copy_data_helper_compat(void *_o, void *_arg); +int dn_compat_calc_size(struct dn_parms dn_cfg); +int do_config(void *p, int l); + +/* function to drain idle object */ +void dn_drain_scheduler(void); +void dn_drain_queue(void); + +#endif /* _IP_DN_PRIVATE_H */ diff --git a/freebsd/sys/netinet/ipfw/ip_dummynet.c b/freebsd/sys/netinet/ipfw/ip_dummynet.c new file mode 100644 index 00000000..dca39d06 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_dummynet.c @@ -0,0 +1,2297 @@ +#include + +/*- + * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Configuration and internal object management for dummynet. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ +#include +#include /* ip_output(), IP_FORWARDING */ +#include +#include +#include +#include +#include +#include + +/* which objects to copy */ +#define DN_C_LINK 0x01 +#define DN_C_SCH 0x02 +#define DN_C_FLOW 0x04 +#define DN_C_FS 0x08 +#define DN_C_QUEUE 0x10 + +/* we use this argument in case of a schk_new */ +struct schk_new_arg { + struct dn_alg *fp; + struct dn_sch *sch; +}; + +/*---- callout hooks. 
----*/ +static struct callout dn_timeout; +static struct task dn_task; +static struct taskqueue *dn_tq = NULL; + +static void +dummynet(void * __unused unused) +{ + + taskqueue_enqueue(dn_tq, &dn_task); +} + +void +dn_reschedule(void) +{ + callout_reset(&dn_timeout, 1, dummynet, NULL); +} +/*----- end of callout hooks -----*/ + +/* Return a scheduler descriptor given the type or name. */ +static struct dn_alg * +find_sched_type(int type, char *name) +{ + struct dn_alg *d; + + SLIST_FOREACH(d, &dn_cfg.schedlist, next) { + if (d->type == type || (name && !strcmp(d->name, name))) + return d; + } + return NULL; /* not found */ +} + +int +ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg) +{ + int oldv = *v; + const char *op = NULL; + if (oldv < lo) { + *v = dflt; + op = "Bump"; + } else if (oldv > hi) { + *v = hi; + op = "Clamp"; + } else + return *v; + if (op && msg) + printf("%s %s to %d (was %d)\n", op, msg, *v, oldv); + return *v; +} + +/*---- flow_id mask, hash and compare functions ---*/ +/* + * The flow_id includes the 5-tuple, the queue/pipe number + * which we store in the extra area in host order, + * and for ipv6 also the flow_id6. + * XXX see if we want the tos byte (can store in 'flags') + */ +static struct ipfw_flow_id * +flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id) +{ + int is_v6 = IS_IP6_FLOW_ID(id); + + id->dst_port &= mask->dst_port; + id->src_port &= mask->src_port; + id->proto &= mask->proto; + id->extra &= mask->extra; + if (is_v6) { + APPLY_MASK(&id->dst_ip6, &mask->dst_ip6); + APPLY_MASK(&id->src_ip6, &mask->src_ip6); + id->flow_id6 &= mask->flow_id6; + } else { + id->dst_ip &= mask->dst_ip; + id->src_ip &= mask->src_ip; + } + return id; +} + +/* computes an OR of two masks, result in dst and also returned */ +static struct ipfw_flow_id * +flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst) +{ + int is_v6 = IS_IP6_FLOW_ID(dst); + + dst->dst_port |= src->dst_port; + dst->src_port |= src->src_port; + dst->proto |= src->proto; + dst->extra |= src->extra; + if (is_v6) { +#define OR_MASK(_d, _s) \ + (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \ + (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \ + (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \ + (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3]; + OR_MASK(&dst->dst_ip6, &src->dst_ip6); + OR_MASK(&dst->src_ip6, &src->src_ip6); +#undef OR_MASK + dst->flow_id6 |= src->flow_id6; + } else { + dst->dst_ip |= src->dst_ip; + dst->src_ip |= src->src_ip; + } + return dst; +} + +static int +nonzero_mask(struct ipfw_flow_id *m) +{ + if (m->dst_port || m->src_port || m->proto || m->extra) + return 1; + if (IS_IP6_FLOW_ID(m)) { + return + m->dst_ip6.__u6_addr.__u6_addr32[0] || + m->dst_ip6.__u6_addr.__u6_addr32[1] || + m->dst_ip6.__u6_addr.__u6_addr32[2] || + m->dst_ip6.__u6_addr.__u6_addr32[3] || + m->src_ip6.__u6_addr.__u6_addr32[0] || + m->src_ip6.__u6_addr.__u6_addr32[1] || + m->src_ip6.__u6_addr.__u6_addr32[2] || + m->src_ip6.__u6_addr.__u6_addr32[3] || + m->flow_id6; + } else { + return m->dst_ip || m->src_ip; + } +} + +/* XXX we may want a better hash function */ +static uint32_t +flow_id_hash(struct ipfw_flow_id *id) +{ + uint32_t i; + + if (IS_IP6_FLOW_ID(id)) { + uint32_t *d = (uint32_t *)&id->dst_ip6; + uint32_t *s = (uint32_t *)&id->src_ip6; + i = (d[0] ) ^ (d[1]) ^ + (d[2] ) ^ (d[3]) ^ + (d[0] >> 15) ^ (d[1] >> 15) ^ + (d[2] >> 15) ^ (d[3] >> 15) ^ + (s[0] << 1) ^ (s[1] << 1) ^ + (s[2] << 1) ^ (s[3] << 1) ^ 
+ (s[0] << 16) ^ (s[1] << 16) ^ + (s[2] << 16) ^ (s[3] << 16) ^ + (id->dst_port << 1) ^ (id->src_port) ^ + (id->extra) ^ + (id->proto ) ^ (id->flow_id6); + } else { + i = (id->dst_ip) ^ (id->dst_ip >> 15) ^ + (id->src_ip << 1) ^ (id->src_ip >> 16) ^ + (id->extra) ^ + (id->dst_port << 1) ^ (id->src_port) ^ (id->proto); + } + return i; +} + +/* Like bcmp, returns 0 if ids match, 1 otherwise. */ +static int +flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2) +{ + int is_v6 = IS_IP6_FLOW_ID(id1); + + if (!is_v6) { + if (IS_IP6_FLOW_ID(id2)) + return 1; /* different address families */ + + return (id1->dst_ip == id2->dst_ip && + id1->src_ip == id2->src_ip && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra) ? 0 : 1; + } + /* the ipv6 case */ + return ( + !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) && + !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) && + id1->dst_port == id2->dst_port && + id1->src_port == id2->src_port && + id1->proto == id2->proto && + id1->extra == id2->extra && + id1->flow_id6 == id2->flow_id6) ? 0 : 1; +} +/*--------- end of flow-id mask, hash and compare ---------*/ + +/*--- support functions for the qht hashtable ---- + * Entries are hashed by flow-id + */ +static uint32_t +q_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_queue *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +q_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_queue *o = (struct dn_queue *)obj; + struct ipfw_flow_id *id2; + + if (flags & DNHT_KEY_IS_OBJ) { + /* compare pointers */ + id2 = &((struct dn_queue *)key)->ni.fid; + } else { + id2 = (struct ipfw_flow_id *)key; + } + return (0 == flow_id_cmp(&o->ni.fid, id2)); +} + +/* + * create a new queue instance for the given 'key'. + */ +static void * +q_new(uintptr_t key, int flags, void *arg) +{ + struct dn_queue *q, *template = arg; + struct dn_fsk *fs = template->fs; + int size = sizeof(*q) + fs->sched->fp->q_datalen; + + q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (q == NULL) { + D("no memory for new queue"); + return NULL; + } + + set_oid(&q->ni.oid, DN_QUEUE, size); + if (fs->fs.flags & DN_QHT_HASH) + q->ni.fid = *(struct ipfw_flow_id *)key; + q->fs = fs; + q->_si = template->_si; + q->_si->q_count++; + + if (fs->sched->fp->new_queue) + fs->sched->fp->new_queue(q); + dn_cfg.queue_count++; + return q; +} + +/* + * Notify schedulers that a queue is going away. + * If (flags & DN_DESTROY), also free the packets. + * The version for callbacks is called q_delete_cb(). + */ +static void +dn_delete_queue(struct dn_queue *q, int flags) +{ + struct dn_fsk *fs = q->fs; + + // D("fs %p si %p\n", fs, q->_si); + /* notify the parent scheduler that the queue is going away */ + if (fs && fs->sched->fp->free_queue) + fs->sched->fp->free_queue(q); + q->_si->q_count--; + q->_si = NULL; + if (flags & DN_DESTROY) { + if (q->mq.head) + dn_free_pkts(q->mq.head); + bzero(q, sizeof(*q)); // safety + free(q, M_DUMMYNET); + dn_cfg.queue_count--; + } +} + +static int +q_delete_cb(void *q, void *arg) +{ + int flags = (int)(uintptr_t)arg; + dn_delete_queue(q, flags); + return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; +} + +/* + * calls dn_delete_queue/q_delete_cb on all queues, + * which notifies the parent scheduler and possibly drains packets. 
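+ * It is invoked from fsk_detach() with that function's flags, so a
+ * plain detach only notifies the queues while a delete also frees
+ * them. Specifically: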
+ * flags & DN_DESTROY: drains queues and destroy qht; + */ +static void +qht_delete(struct dn_fsk *fs, int flags) +{ + ND("fs %d start flags %d qht %p", + fs->fs.fs_nr, flags, fs->qht); + if (!fs->qht) + return; + if (fs->fs.flags & DN_QHT_HASH) { + dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); + if (flags & DN_DESTROY) { + dn_ht_free(fs->qht, 0); + fs->qht = NULL; + } + } else { + dn_delete_queue((struct dn_queue *)(fs->qht), flags); + if (flags & DN_DESTROY) + fs->qht = NULL; + } +} + +/* + * Find and possibly create the queue for a MULTIQUEUE scheduler. + * We never call it for !MULTIQUEUE (the queue is in the sch_inst). + */ +struct dn_queue * +ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, + struct ipfw_flow_id *id) +{ + struct dn_queue template; + + template._si = si; + template.fs = fs; + + if (fs->fs.flags & DN_QHT_HASH) { + struct ipfw_flow_id masked_id; + if (fs->qht == NULL) { + fs->qht = dn_ht_init(NULL, fs->fs.buckets, + offsetof(struct dn_queue, q_next), + q_hash, q_match, q_new); + if (fs->qht == NULL) + return NULL; + } + masked_id = *id; + flow_id_mask(&fs->fsk_mask, &masked_id); + return dn_ht_find(fs->qht, (uintptr_t)&masked_id, + DNHT_INSERT, &template); + } else { + if (fs->qht == NULL) + fs->qht = q_new(0, 0, &template); + return (struct dn_queue *)fs->qht; + } +} +/*--- end of queue hash table ---*/ + +/*--- support functions for the sch_inst hashtable ---- + * + * These are hashed by flow-id + */ +static uint32_t +si_hash(uintptr_t key, int flags, void *arg) +{ + /* compute the hash slot from the flow id */ + struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + + return flow_id_hash(id); +} + +static int +si_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_sch_inst *o = obj; + struct ipfw_flow_id *id2; + + id2 = (flags & DNHT_KEY_IS_OBJ) ? + &((struct dn_sch_inst *)key)->ni.fid : + (struct ipfw_flow_id *)key; + return flow_id_cmp(&o->ni.fid, id2) == 0; +} + +/* + * create a new instance for the given 'key' + * Allocate memory for instance, delay line and scheduler private data. + */ +static void * +si_new(uintptr_t key, int flags, void *arg) +{ + struct dn_schk *s = arg; + struct dn_sch_inst *si; + int l = sizeof(*si) + s->fp->si_datalen; + + si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (si == NULL) + goto error; + /* Set length only for the part passed up to userland. */ + set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); + set_oid(&(si->dline.oid), DN_DELAY_LINE, + sizeof(struct delay_line)); + /* mark si and dline as outside the event queue */ + si->ni.oid.id = si->dline.oid.id = -1; + + si->sched = s; + si->dline.si = si; + + if (s->fp->new_sched && s->fp->new_sched(si)) { + D("new_sched error"); + goto error; + } + if (s->sch.flags & DN_HAVE_MASK) + si->ni.fid = *(struct ipfw_flow_id *)key; + + dn_cfg.si_count++; + return si; + +error: + if (si) { + bzero(si, sizeof(*si)); // safety + free(si, M_DUMMYNET); + } + return NULL; +} + +/* + * Callback from siht to delete all scheduler instances. Remove + * si and delay line from the system heap, destroy all queues. + * We assume that all flowset have been notified and do not + * point to us anymore. 
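+ * (The assumption holds because schk_delete_cb() below runs
+ * fsk_detach_list() on the fsk_list before scanning the instances
+ * with si_destroy().)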
+ */ +static int +si_destroy(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_schk *s = si->sched; + struct delay_line *dl = &si->dline; + + if (dl->oid.subtype) /* remove delay line from event heap */ + heap_extract(&dn_cfg.evheap, dl); + dn_free_pkts(dl->mq.head); /* drain delay line */ + if (si->kflags & DN_ACTIVE) /* remove si from event heap */ + heap_extract(&dn_cfg.evheap, si); + if (s->fp->free_sched) + s->fp->free_sched(si); + bzero(si, sizeof(*si)); /* safety */ + free(si, M_DUMMYNET); + dn_cfg.si_count--; + return DNHT_SCAN_DEL; +} + +/* + * Find the scheduler instance for this packet. If we need to apply + * a mask, do on a local copy of the flow_id to preserve the original. + * Assume siht is always initialized if we have a mask. + */ +struct dn_sch_inst * +ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) +{ + + if (s->sch.flags & DN_HAVE_MASK) { + struct ipfw_flow_id id_t = *id; + flow_id_mask(&s->sch.sched_mask, &id_t); + return dn_ht_find(s->siht, (uintptr_t)&id_t, + DNHT_INSERT, s); + } + if (!s->siht) + s->siht = si_new(0, 0, s); + return (struct dn_sch_inst *)s->siht; +} + +/* callback to flush credit for the scheduler instance */ +static int +si_reset_credit(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + struct dn_link *p = &si->sched->link; + + si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); + return 0; +} + +static void +schk_reset_credit(struct dn_schk *s) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_reset_credit, NULL); + else if (s->siht) + si_reset_credit(s->siht, NULL); +} +/*---- end of sch_inst hashtable ---------------------*/ + +/*------------------------------------------------------- + * flowset hash (fshash) support. Entries are hashed by fs_nr. + * New allocations are put in the fsunlinked list, from which + * they are removed when they point to a specific scheduler. + */ +static uint32_t +fsk_hash(uintptr_t key, int flags, void *arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return ( (i>>8)^(i>>4)^i ); +} + +static int +fsk_match(void *obj, uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs = obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_fsk *)key)->fs.fs_nr; + + return (fs->fs.fs_nr == i); +} + +static void * +fsk_new(uintptr_t key, int flags, void *arg) +{ + struct dn_fsk *fs; + + fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); + if (fs) { + set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); + dn_cfg.fsk_count++; + fs->drain_bucket = 0; + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } + return fs; +} + +/* + * detach flowset from its current scheduler. Flags as follows: + * DN_DETACH removes from the fsk_list + * DN_DESTROY deletes individual queues + * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). + */ +static void +fsk_detach(struct dn_fsk *fs, int flags) +{ + if (flags & DN_DELETE_FS) + flags |= DN_DESTROY; + ND("fs %d from sched %d flags %s %s %s", + fs->fs.fs_nr, fs->fs.sched_nr, + (flags & DN_DELETE_FS) ? "DEL_FS":"", + (flags & DN_DESTROY) ? "DEL":"", + (flags & DN_DETACH) ? "DET":""); + if (flags & DN_DETACH) { /* detach from the list */ + struct dn_fsk_head *h; + h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; + SLIST_REMOVE(h, fs, dn_fsk, sch_chain); + } + /* Free the RED parameters, they will be recomputed on + * subsequent attach if needed. 
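+ * (The lookup table is the only dynamically allocated piece; the
+ * scaled thresholds and the c_1..c_4 coefficients are rebuilt from
+ * fs->fs by config_red() when RED is configured again.)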
+ */ + if (fs->w_q_lookup) + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + qht_delete(fs, flags); + if (fs->sched && fs->sched->fp->free_fsk) + fs->sched->fp->free_fsk(fs); + fs->sched = NULL; + if (flags & DN_DELETE_FS) { + bzero(fs, sizeof(*fs)); /* safety */ + free(fs, M_DUMMYNET); + dn_cfg.fsk_count--; + } else { + SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); + } +} + +/* + * Detach or destroy all flowsets in a list. + * flags specifies what to do: + * DN_DESTROY: flush all queues + * DN_DELETE_FS: DN_DESTROY + destroy flowset + * DN_DELETE_FS implies DN_DESTROY + */ +static void +fsk_detach_list(struct dn_fsk_head *h, int flags) +{ + struct dn_fsk *fs; + int n = 0; /* only for stats */ + + ND("head %p flags %x", h, flags); + while ((fs = SLIST_FIRST(h))) { + SLIST_REMOVE_HEAD(h, sch_chain); + n++; + fsk_detach(fs, flags); + } + ND("done %d flowsets", n); +} + +/* + * called on 'queue X delete' -- removes the flowset from fshash, + * deletes all queues for the flowset, and removes the flowset. + */ +static int +delete_fs(int i, int locked) +{ + struct dn_fsk *fs; + int err = 0; + + if (!locked) + DN_BH_WLOCK(); + fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); + ND("fs %d found %p", i, fs); + if (fs) { + fsk_detach(fs, DN_DETACH | DN_DELETE_FS); + err = 0; + } else + err = EINVAL; + if (!locked) + DN_BH_WUNLOCK(); + return err; +} + +/*----- end of flowset hashtable support -------------*/ + +/*------------------------------------------------------------ + * Scheduler hash. When searching by index we pass sched_nr, + * otherwise we pass struct dn_sch * which is the first field in + * struct dn_schk so we can cast between the two. We use this trick + * because in the create phase we only have a struct dn_sch to use + * as the key (but it should be fixed). + */ +static uint32_t +schk_hash(uintptr_t key, int flags, void *_arg) +{ + uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return ( (i>>8)^(i>>4)^i ); +} + +static int +schk_match(void *obj, uintptr_t key, int flags, void *_arg) +{ + struct dn_schk *s = (struct dn_schk *)obj; + int i = !(flags & DNHT_KEY_IS_OBJ) ? key : + ((struct dn_schk *)key)->sch.sched_nr; + return (s->sch.sched_nr == i); +} + +/* + * Create the entry and initialize it with the sched hash if needed. + * Leave s->fp unset so we can tell whether a dn_ht_find() returns + * a new object or a previously existing one. + */ +static void * +schk_new(uintptr_t key, int flags, void *arg) +{ + struct schk_new_arg *a = arg; + struct dn_schk *s; + int l = sizeof(*s) + a->fp->schk_datalen; + + s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s == NULL) + return NULL; + set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); + s->sch = *a->sch; // copy initial values + s->link.link_nr = s->sch.sched_nr; + SLIST_INIT(&s->fsk_list); + /* initialize the hash table or create the single instance */ + s->fp = a->fp; /* si_new needs this */ + s->drain_bucket = 0; + if (s->sch.flags & DN_HAVE_MASK) { + s->siht = dn_ht_init(NULL, s->sch.buckets, + offsetof(struct dn_sch_inst, si_next), + si_hash, si_match, si_new); + if (s->siht == NULL) { + free(s, M_DUMMYNET); + return NULL; + } + } + s->fp = NULL; /* mark as a new scheduler */ + dn_cfg.schk_count++; + return s; +} + +/* + * Callback for sched delete. Notify all attached flowsets to + * detach from the scheduler, destroy the internal flowset, and + * all instances. The scheduler goes away too.
+ * arg is 0 (only detach flowsets and destroy instances) + * DN_DESTROY (detach & delete queues, delete schk) + * or DN_DELETE_FS (delete queues and flowsets, delete schk) + */ +static int +schk_delete_cb(void *obj, void *arg) +{ + struct dn_schk *s = obj; +#if 0 + int a = (int)arg; + ND("sched %d arg %s%s", + s->sch.sched_nr, + a&DN_DESTROY ? "DEL ":"", + a&DN_DELETE_FS ? "DEL_FS":""); +#endif + fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); + /* no more flowset pointing to us now */ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, si_destroy, NULL); + else if (s->siht) + si_destroy(s->siht, NULL); + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + s->siht = NULL; + if (s->fp->destroy) + s->fp->destroy(s); + bzero(s, sizeof(*s)); // safety + free(obj, M_DUMMYNET); + dn_cfg.schk_count--; + return DNHT_SCAN_DEL; +} + +/* + * called on a 'sched X delete' command. Deletes a single scheduler. + * This is done by removing from the schedhash, unlinking all + * flowsets and deleting their traffic. + */ +static int +delete_schk(int i) +{ + struct dn_schk *s; + + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + ND("%d %p", i, s); + if (!s) + return EINVAL; + delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ + /* then detach flowsets, delete traffic */ + schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); + return 0; +} +/*--- end of schk hashtable support ---*/ + +static int +copy_obj(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + + if (have < o->len || o->len == 0 || o->type == 0) { + D("(WARN) type %d %s %d have %d need %d", + o->type, msg, i, have, o->len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, o->len); + bcopy(_o, *start, o->len); + if (o->type == DN_LINK) { + /* Adjust burst parameter for link */ + struct dn_link *l = (struct dn_link *)*start; + l->burst = div64(l->burst, 8 * hz); + } else if (o->type == DN_SCH) { + /* Set id->id to the number of instances */ + struct dn_schk *s = _o; + struct dn_id *id = (struct dn_id *)(*start); + id->id = (s->sch.flags & DN_HAVE_MASK) ? + dn_ht_entries(s->siht) : (s->siht ? 1 : 0); + } + *start += o->len; + return 0; +} + +/* Specific function to copy a queue. + * Copies only the user-visible part of a queue (which is in + * a struct dn_flow), and sets len accordingly. + */ +static int +copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) +{ + struct dn_id *o = _o; + int have = end - *start; + int len = sizeof(struct dn_flow); /* see above comment */ + + if (have < len || o->len == 0 || o->type != DN_QUEUE) { + D("ERROR type %d %s %d have %d need %d", + o->type, msg, i, have, len); + return 1; + } + ND("type %d %s %d len %d", o->type, msg, i, len); + bcopy(_o, *start, len); + ((struct dn_id*)(*start))->len = len; + *start += len; + return 0; +} + +static int +copy_q_cb(void *obj, void *arg) +{ + struct dn_queue *q = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ + ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); + return 0; +} + +static int +copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + if (!fs->qht) + return 0; + if (fs->fs.flags & DN_QHT_HASH) + dn_ht_scan(fs->qht, copy_q_cb, a); + else + copy_q_cb(fs->qht, a); + return 0; +} + +/* + * This routine only copies the initial part of a profile ? 
XXX + */ +static int +copy_profile(struct copy_args *a, struct dn_profile *p) +{ + int have = a->end - *a->start; + /* XXX here we check for max length */ + int profile_len = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); + + if (p == NULL) + return 0; + if (have < profile_len) { + D("error have %d need %d", have, profile_len); + return 1; + } + bcopy(p, *a->start, profile_len); + ((struct dn_id *)(*a->start))->len = profile_len; + *a->start += profile_len; + return 0; +} + +static int +copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) +{ + struct dn_fs *ufs = (struct dn_fs *)(*a->start); + if (!fs) + return 0; + ND("flowset %d", fs->fs.fs_nr); + if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) + return DNHT_SCAN_END; + ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? + dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0); + if (flags) { /* copy queues */ + copy_q(a, fs, 0); + } + return 0; +} + +static int +copy_si_cb(void *obj, void *arg) +{ + struct dn_sch_inst *si = obj; + struct copy_args *a = arg; + struct dn_flow *ni = (struct dn_flow *)(*a->start); + if (copy_obj(a->start, a->end, &si->ni, "inst", + si->sched->sch.sched_nr)) + return DNHT_SCAN_END; + ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ + ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); + return 0; +} + +static int +copy_si(struct copy_args *a, struct dn_schk *s, int flags) +{ + if (s->sch.flags & DN_HAVE_MASK) + dn_ht_scan(s->siht, copy_si_cb, a); + else if (s->siht) + copy_si_cb(s->siht, a); + return 0; +} + +/* + * compute a list of children of a scheduler and copy up + */ +static int +copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) +{ + struct dn_fsk *fs; + struct dn_id *o; + uint32_t *p; + + int n = 0, space = sizeof(*o); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs->fs.fs_nr < DN_MAX_ID) + n++; + } + space += n * sizeof(uint32_t); + DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); + if (a->end - *(a->start) < space) + return DNHT_SCAN_END; + o = (struct dn_id *)(*(a->start)); + o->len = space; + *a->start += o->len; + o->type = DN_TEXT; + p = (uint32_t *)(o+1); + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) + if (fs->fs.fs_nr < DN_MAX_ID) + *p++ = fs->fs.fs_nr; + return 0; +} + +static int +copy_data_helper(void *_o, void *_arg) +{ + struct copy_args *a = _arg; + uint32_t *r = a->extra->r; /* start of first range */ + uint32_t *lim; /* first invalid pointer */ + int n; + + lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); + + if (a->type == DN_LINK || a->type == DN_SCH) { + /* pipe|sched show, we receive a dn_schk */ + struct dn_schk *s = _o; + + n = s->sch.sched_nr; + if (a->type == DN_SCH && n >= DN_MAX_ID) + return 0; /* not a scheduler */ + if (a->type == DN_LINK && n <= DN_MAX_ID) + return 0; /* not a pipe */ + + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + /* Found a valid entry, copy and we are done */ + if (a->flags & DN_C_LINK) { + if (copy_obj(a->start, a->end, + &s->link, "link", n)) + return DNHT_SCAN_END; + if (copy_profile(a, s->profile)) + return DNHT_SCAN_END; + if (copy_flowset(a, s->fs, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_SCH) { + if (copy_obj(a->start, a->end, + &s->sch, "sched", n)) + return DNHT_SCAN_END; + /* list all attached flowsets */ + if (copy_fsk_list(a, s, 0)) + return DNHT_SCAN_END; + } + if (a->flags & DN_C_FLOW) + copy_si(a, s, 0); + break; + } + } else if (a->type == DN_FS) { + /* queue show, skip 
internal flowsets */ + struct dn_fsk *fs = _o; + + n = fs->fs.fs_nr; + if (n >= DN_MAX_ID) + return 0; + /* see if the object is within one of our ranges */ + for (;r < lim; r += 2) { + if (n < r[0] || n > r[1]) + continue; + if (copy_flowset(a, fs, 0)) + return DNHT_SCAN_END; + copy_q(a, fs, 0); + break; /* we are done */ + } + } + return 0; +} + +static inline struct dn_schk * +locate_scheduler(int i) +{ + return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); +} + +/* + * red parameters are in fixed point arithmetic. + */ +static int +config_red(struct dn_fsk *fs) +{ + int64_t s, idle, weight, w0; + int t, i; + + fs->w_q = fs->fs.w_q; + fs->max_p = fs->fs.max_p; + D("called"); + /* Doing stuff that was in userland */ + i = fs->sched->link.bandwidth; + s = (i <= 0) ? 0 : + hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; + + idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ + fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); + /* fs->lookup_step not scaled, */ + if (!fs->lookup_step) + fs->lookup_step = 1; + w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled + + for (t = fs->lookup_step; t > 1; --t) + weight = SCALE_MUL(weight, w0); + fs->lookup_weight = (int)(weight); // scaled + + /* Now doing stuff that was in kerneland */ + fs->min_th = SCALE(fs->fs.min_th); + fs->max_th = SCALE(fs->fs.max_th); + + fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); + fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); + + if (fs->fs.flags & DN_IS_GENTLE_RED) { + fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; + fs->c_4 = SCALE(1) - 2 * fs->max_p; + } + + /* If the lookup table already exist, free and create it again. */ + if (fs->w_q_lookup) { + free(fs->w_q_lookup, M_DUMMYNET); + fs->w_q_lookup = NULL; + } + if (dn_cfg.red_lookup_depth == 0) { + printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" + "must be > 0\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return (EINVAL); + } + fs->lookup_depth = dn_cfg.red_lookup_depth; + fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), + M_DUMMYNET, M_NOWAIT); + if (fs->w_q_lookup == NULL) { + printf("dummynet: sorry, cannot allocate red lookup table\n"); + fs->fs.flags &= ~DN_IS_RED; + fs->fs.flags &= ~DN_IS_GENTLE_RED; + return(ENOSPC); + } + + /* Fill the lookup table with (1 - w_q)^x */ + fs->w_q_lookup[0] = SCALE(1) - fs->w_q; + + for (i = 1; i < fs->lookup_depth; i++) + fs->w_q_lookup[i] = + SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); + + if (dn_cfg.red_avg_pkt_size < 1) + dn_cfg.red_avg_pkt_size = 512; + fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; + if (dn_cfg.red_max_pkt_size < 1) + dn_cfg.red_max_pkt_size = 1500; + fs->max_pkt_size = dn_cfg.red_max_pkt_size; + D("exit"); + return 0; +} + +/* Scan all flowset attached to this scheduler and update red */ +static void +update_red(struct dn_schk *s) +{ + struct dn_fsk *fs; + SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { + if (fs && (fs->fs.flags & DN_IS_RED)) + config_red(fs); + } +} + +/* attach flowset to scheduler s, possibly requeue */ +static void +fsk_attach(struct dn_fsk *fs, struct dn_schk *s) +{ + ND("remove fs %d from fsunlinked, link to sched %d", + fs->fs.fs_nr, s->sch.sched_nr); + SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); + fs->sched = s; + SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); + if (s->fp->new_fsk) + s->fp->new_fsk(fs); + /* XXX compute fsk_mask */ + fs->fsk_mask = fs->fs.flow_mask; + if (fs->sched->sch.flags & DN_HAVE_MASK) + flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); + if 
(fs->qht) { + /* + * we must drain qht according to the old + * type, and reinsert according to the new one. + * The requeue is complex -- in general we need to + * reclassify every single packet. + * For the time being, let's hope qht is never set + * when we reach this point. + */ + D("XXX TODO requeue from fs %d to sch %d", + fs->fs.fs_nr, s->sch.sched_nr); + fs->qht = NULL; + } + /* set the new type for qht */ + if (nonzero_mask(&fs->fsk_mask)) + fs->fs.flags |= DN_QHT_HASH; + else + fs->fs.flags &= ~DN_QHT_HASH; + + /* XXX config_red() can fail... */ + if (fs->fs.flags & DN_IS_RED) + config_red(fs); +} + +/* update all flowsets which may refer to this scheduler */ +static void +update_fs(struct dn_schk *s) +{ + struct dn_fsk *fs, *tmp; + + SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { + if (s->sch.sched_nr != fs->fs.sched_nr) { + D("fs %d for sch %d not %d still unlinked", + fs->fs.fs_nr, fs->fs.sched_nr, + s->sch.sched_nr); + continue; + } + fsk_attach(fs, s); + } +} + +/* + * Configuration -- to preserve backward compatibility we use + * the following scheme (N is 65536) + * NUMBER SCHED LINK FLOWSET + * 1 .. N-1 (1)WFQ (2)WFQ (3)queue + * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 + * 2N+1 .. 3N-1 -- -- (7)FIFO for sched N+1..2N-1 + * + * "pipe i config" configures #1, #2 and #3 + * "sched i config" configures #1 and possibly #6 + * "queue i config" configures #3 + * #1 is configured with 'pipe i config' or 'sched i config' + * #2 is configured with 'pipe i config', and created if not + * existing with 'sched i config' + * #3 is configured with 'queue i config' + * #4 is automatically configured after #1, can only be FIFO + * #5 is automatically configured after #2 + * #6 is automatically created when #1 is !MULTIQUEUE, + * and can be updated. + * #7 is automatically configured after #2 + */ + +/* + * configure a link (and its FIFO instance) + */ +static int +config_link(struct dn_link *p, struct dn_id *arg) +{ + int i; + + if (p->oid.len != sizeof(*p)) { + D("invalid pipe len %d", p->oid.len); + return EINVAL; + } + i = p->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* + * The config program passes parameters as follows: + * bw = bits/second (0 means no limits), + * delay = ms, must be translated into ticks. + * qsize = slots/bytes + * burst ??? + */ + p->delay = (p->delay * hz) / 1000; + /* Scale burst size: bytes -> bits * hz */ + p->burst *= 8 * hz; + + DN_BH_WLOCK(); + /* do it twice, base link and FIFO link */ + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + struct dn_schk *s = locate_scheduler(i); + if (s == NULL) { + DN_BH_WUNLOCK(); + D("sched %d not found", i); + return EINVAL; + } + /* remove profile if exists */ + if (s->profile) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + /* copy all parameters */ + s->link.oid = p->oid; + s->link.link_nr = i; + s->link.delay = p->delay; + if (s->link.bandwidth != p->bandwidth) { + /* XXX bandwidth changes, need to update red params */ + s->link.bandwidth = p->bandwidth; + update_red(s); + } + s->link.burst = p->burst; + schk_reset_credit(s); + } + dn_cfg.id++; + DN_BH_WUNLOCK(); + return 0; +} + +/* + * configure a flowset. 
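+ */
+
+/*
+ * [Editor's note] A worked example of the unit conversions done in
+ * config_link() above, assuming hz = 1000; the numbers are
+ * illustrative. burst is presumably kept pre-scaled in bits * hz so
+ * the per-tick credit arithmetic needs no division; copy_obj()
+ * divides it back by 8 * hz before exporting to userland.
+ */
+#if 0	/* illustrative only, not compiled */
+	/* user asks for delay = 50 (ms) and burst = 10000 (bytes) */
+	p->delay = (50 * 1000) / 1000;		/* 50 ticks at hz = 1000 */
+	p->burst = 10000LL * 8 * 1000;		/* 80000000, in bits * hz */
+#endif
+
+/*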
Can be called from inside with locked=1, + */ +static struct dn_fsk * +config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) +{ + int i; + struct dn_fsk *fs; + + if (nfs->oid.len != sizeof(*nfs)) { + D("invalid flowset len %d", nfs->oid.len); + return NULL; + } + i = nfs->fs_nr; + if (i <= 0 || i >= 3*DN_MAX_ID) + return NULL; + ND("flowset %d", i); + /* XXX other sanity checks */ + if (nfs->flags & DN_QSIZE_BYTES) { + ipdn_bound_var(&nfs->qsize, 16384, + 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); + } else { + ipdn_bound_var(&nfs->qsize, 50, + 1, dn_cfg.slot_limit, NULL); // "queue slot size"); + } + if (nfs->flags & DN_HAVE_MASK) { + /* make sure we have some buckets */ + ipdn_bound_var(&nfs->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "flowset buckets"); + } else { + nfs->buckets = 1; /* we only need 1 */ + } + if (!locked) + DN_BH_WLOCK(); + do { /* exit with break when done */ + struct dn_schk *s; + int flags = nfs->sched_nr ? DNHT_INSERT : 0; + int j; + int oldc = dn_cfg.fsk_count; + fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); + if (fs == NULL) { + D("missing sched for flowset %d", i); + break; + } + /* grab some defaults from the existing one */ + if (nfs->sched_nr == 0) /* reuse */ + nfs->sched_nr = fs->fs.sched_nr; + for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { + if (nfs->par[j] == -1) /* reuse */ + nfs->par[j] = fs->fs.par[j]; + } + if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { + ND("flowset %d unchanged", i); + break; /* no change, nothing to do */ + } + if (oldc != dn_cfg.fsk_count) /* new item */ + dn_cfg.id++; + s = locate_scheduler(nfs->sched_nr); + /* detach from old scheduler if needed, preserving + * queues if we need to reattach. Then update the + * configuration, and possibly attach to the new sched. + */ + DX(2, "fs %d changed sched %d@%p to %d@%p", + fs->fs.fs_nr, + fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); + if (fs->sched) { + int flags = s ? DN_DETACH : (DN_DETACH | DN_DESTROY); + flags |= DN_DESTROY; /* XXX temporary */ + fsk_detach(fs, flags); + } + fs->fs = *nfs; /* copy configuration */ + if (s != NULL) + fsk_attach(fs, s); + } while (0); + if (!locked) + DN_BH_WUNLOCK(); + return fs; +} + +/* + * config/reconfig a scheduler and its FIFO variant. + * For !MULTIQUEUE schedulers, also set up the flowset. + * + * On reconfigurations (detected because s->fp is set), + * detach existing flowsets preserving traffic, preserve link, + * and delete the old scheduler creating a new one. + */ +static int +config_sched(struct dn_sch *_nsch, struct dn_id *arg) +{ + struct dn_schk *s; + struct schk_new_arg a; /* argument for schk_new */ + int i; + struct dn_link p; /* copy of oldlink */ + struct dn_profile *pf = NULL; /* copy of old link profile */ + /* Used to preserv mask parameter */ + struct ipfw_flow_id new_mask; + int new_buckets = 0; + int new_flags = 0; + int pipe_cmd; + int err = ENOMEM; + + a.sch = _nsch; + if (a.sch->oid.len != sizeof(*a.sch)) { + D("bad sched len %d", a.sch->oid.len); + return EINVAL; + } + i = a.sch->sched_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* make sure we have some buckets */ + if (a.sch->flags & DN_HAVE_MASK) + ipdn_bound_var(&a.sch->buckets, dn_cfg.hash_size, + 1, dn_cfg.max_hash_size, "sched buckets"); + /* XXX other sanity checks */ + bzero(&p, sizeof(p)); + + pipe_cmd = a.sch->flags & DN_PIPE_CMD; + a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? 
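+	/*
+	 * [Editor's note] Condensed sketch of the control flow that
+	 * follows: the "again" loop below runs at most twice, once for
+	 * the scheduler itself and once for its FIFO twin at
+	 * i + DN_MAX_ID, matching the numbering scheme described
+	 * before config_link(). configure() here is a hypothetical
+	 * stand-in for the body of the loop.
+	 */
+#if 0	/* illustrative only, not compiled */
+	i = a.sch->sched_nr;
+	configure(i);			/* pass 1: the WF2Q+/user scheduler */
+	i += DN_MAX_ID;
+	configure(i);			/* pass 2: the FIFO instance */
+#endif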
+ if (pipe_cmd) { + /* Copy mask parameter */ + new_mask = a.sch->sched_mask; + new_buckets = a.sch->buckets; + new_flags = a.sch->flags; + } + DN_BH_WLOCK(); +again: /* run twice, for wfq and fifo */ + /* + * lookup the type. If not supplied, use the previous one + * or default to WF2Q+. Otherwise, return an error. + */ + dn_cfg.id++; + a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); + if (a.fp != NULL) { + /* found. Lookup or create entry */ + s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); + } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { + /* No type. search existing s* or retry with WF2Q+ */ + s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); + if (s != NULL) { + a.fp = s->fp; + /* Scheduler exists, skip to FIFO scheduler + * if command was pipe config... + */ + if (pipe_cmd) + goto next; + } else { + /* New scheduler, create a wf2q+ with no mask + * if command was pipe config... + */ + if (pipe_cmd) { + /* clear mask parameter */ + bzero(&a.sch->sched_mask, sizeof(new_mask)); + a.sch->buckets = 0; + a.sch->flags &= ~DN_HAVE_MASK; + } + a.sch->oid.subtype = DN_SCHED_WF2QP; + goto again; + } + } else { + D("invalid scheduler type %d %s", + a.sch->oid.subtype, a.sch->name); + err = EINVAL; + goto error; + } + /* normalize name and subtype */ + a.sch->oid.subtype = a.fp->type; + bzero(a.sch->name, sizeof(a.sch->name)); + strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); + if (s == NULL) { + D("cannot allocate scheduler %d", i); + goto error; + } + /* restore existing link if any */ + if (p.link_nr) { + s->link = p; + if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ + s->profile = NULL; /* XXX maybe not needed */ + } else { + s->profile = malloc(sizeof(struct dn_profile), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("cannot allocate profile"); + goto error; //XXX + } + bcopy(pf, s->profile, sizeof(*pf)); + } + } + p.link_nr = 0; + if (s->fp == NULL) { + DX(2, "sched %d new type %s", i, a.fp->name); + } else if (s->fp != a.fp || + bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { + /* already existing. */ + DX(2, "sched %d type changed from %s to %s", + i, s->fp->name, a.fp->name); + DX(4, " type/sub %d/%d -> %d/%d", + s->sch.oid.type, s->sch.oid.subtype, + a.sch->oid.type, a.sch->oid.subtype); + if (s->link.link_nr == 0) + D("XXX WARNING link 0 for sched %d", i); + p = s->link; /* preserve link */ + if (s->profile) {/* preserve profile */ + if (!pf) + pf = malloc(sizeof(*pf), + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (pf) /* XXX should issue a warning otherwise */ + bcopy(s->profile, pf, sizeof(*pf)); + } + /* remove from the hash */ + dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); + /* Detach flowsets, preserve queues. 
*/ + // schk_delete_cb(s, NULL); + // XXX temporarily, kill queues + schk_delete_cb(s, (void *)DN_DESTROY); + goto again; + } else { + DX(4, "sched %d unchanged type %s", i, a.fp->name); + } + /* complete initialization */ + s->sch = *a.sch; + s->fp = a.fp; + s->cfg = arg; + // XXX schk_reset_credit(s); + /* create the internal flowset if needed, + * trying to reuse existing ones if available + */ + if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { + s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); + if (!s->fs) { + struct dn_fs fs; + bzero(&fs, sizeof(fs)); + set_oid(&fs.oid, DN_FS, sizeof(fs)); + fs.fs_nr = i + DN_MAX_ID; + fs.sched_nr = i; + s->fs = config_fs(&fs, NULL, 1 /* locked */); + } + if (!s->fs) { + schk_delete_cb(s, (void *)DN_DESTROY); + D("error creating internal fs for %d", i); + goto error; + } + } + /* call init function after the flowset is created */ + if (s->fp->config) + s->fp->config(s); + update_fs(s); +next: + if (i < DN_MAX_ID) { /* now configure the FIFO instance */ + i += DN_MAX_ID; + if (pipe_cmd) { + /* Restore mask parameter for FIFO */ + a.sch->sched_mask = new_mask; + a.sch->buckets = new_buckets; + a.sch->flags = new_flags; + } else { + /* sched config shouldn't modify the FIFO scheduler */ + if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { + /* FIFO already exist, don't touch it */ + err = 0; /* and this is not an error */ + goto error; + } + } + a.sch->sched_nr = i; + a.sch->oid.subtype = DN_SCHED_FIFO; + bzero(a.sch->name, sizeof(a.sch->name)); + goto again; + } + err = 0; +error: + DN_BH_WUNLOCK(); + if (pf) + free(pf, M_DUMMYNET); + return err; +} + +/* + * attach a profile to a link + */ +static int +config_profile(struct dn_profile *pf, struct dn_id *arg) +{ + struct dn_schk *s; + int i, olen, err = 0; + + if (pf->oid.len < sizeof(*pf)) { + D("short profile len %d", pf->oid.len); + return EINVAL; + } + i = pf->link_nr; + if (i <= 0 || i >= DN_MAX_ID) + return EINVAL; + /* XXX other sanity checks */ + DN_BH_WLOCK(); + for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { + s = locate_scheduler(i); + + if (s == NULL) { + err = EINVAL; + break; + } + dn_cfg.id++; + /* + * If we had a profile and the new one does not fit, + * or it is deleted, then we need to free memory. + */ + if (s->profile && (pf->samples_no == 0 || + s->profile->oid.len < pf->oid.len)) { + free(s->profile, M_DUMMYNET); + s->profile = NULL; + } + if (pf->samples_no == 0) + continue; + /* + * new profile, possibly allocate memory + * and copy data. + */ + if (s->profile == NULL) + s->profile = malloc(pf->oid.len, + M_DUMMYNET, M_NOWAIT | M_ZERO); + if (s->profile == NULL) { + D("no memory for profile %d", i); + err = ENOMEM; + break; + } + /* preserve larger length XXX double check */ + olen = s->profile->oid.len; + if (olen < pf->oid.len) + olen = pf->oid.len; + bcopy(pf, s->profile, pf->oid.len); + s->profile->oid.len = olen; + } + DN_BH_WUNLOCK(); + return err; +} + +/* + * Delete all objects: + */ +static void +dummynet_flush(void) +{ + + /* delete all schedulers and related links/queues/flowsets */ + dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, + (void *)(uintptr_t)DN_DELETE_FS); + /* delete all remaining (unlinked) flowsets */ + DX(4, "still %d unlinked fs", dn_cfg.fsk_count); + dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); + fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); + /* Reinitialize system heap... */ + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); +} + +/* + * Main handler for configuration. We are guaranteed to be called + * with an oid which is at least a dn_id. 
+ * - the first object is the command (config, delete, flush, ...) + * - config_link must be issued after the corresponding config_sched + * - parameters (DN_TXT) for an object must preceed the object + * processed on a config_sched. + */ +int +do_config(void *p, int l) +{ + struct dn_id *next, *o; + int err = 0, err2 = 0; + struct dn_id *arg = NULL; + uintptr_t *a; + + o = p; + if (o->id != DN_API_VERSION) { + D("invalid api version got %d need %d", + o->id, DN_API_VERSION); + return EINVAL; + } + for (; l >= sizeof(*o); o = next) { + struct dn_id *prev = arg; + if (o->len < sizeof(*o) || l < o->len) { + D("bad len o->len %d len %d", o->len, l); + err = EINVAL; + break; + } + l -= o->len; + next = (struct dn_id *)((char *)o + o->len); + err = 0; + switch (o->type) { + default: + D("cmd %d not implemented", o->type); + break; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. + * if we recognize the command, jump to the correct + * handler and return + */ + case DN_SYSCTL_SET: + err = kesysctl_emu_set(p, l); + return err; +#endif + case DN_CMD_CONFIG: /* simply a header */ + break; + + case DN_CMD_DELETE: + /* the argument is in the first uintptr_t after o */ + a = (uintptr_t *)(o+1); + if (o->len < sizeof(*o) + sizeof(*a)) { + err = EINVAL; + break; + } + switch (o->subtype) { + case DN_LINK: + /* delete base and derived schedulers */ + DN_BH_WLOCK(); + err = delete_schk(*a); + err2 = delete_schk(*a + DN_MAX_ID); + DN_BH_WUNLOCK(); + if (!err) + err = err2; + break; + + default: + D("invalid delete type %d", + o->subtype); + err = EINVAL; + break; + + case DN_FS: + err = (*a <1 || *a >= DN_MAX_ID) ? + EINVAL : delete_fs(*a, 0) ; + break; + } + break; + + case DN_CMD_FLUSH: + DN_BH_WLOCK(); + dummynet_flush(); + DN_BH_WUNLOCK(); + break; + case DN_TEXT: /* store argument the next block */ + prev = NULL; + arg = o; + break; + case DN_LINK: + err = config_link((struct dn_link *)o, arg); + break; + case DN_PROFILE: + err = config_profile((struct dn_profile *)o, arg); + break; + case DN_SCH: + err = config_sched((struct dn_sch *)o, arg); + break; + case DN_FS: + err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); + break; + } + if (prev) + arg = NULL; + if (err != 0) + break; + } + return err; +} + +static int +compute_space(struct dn_id *cmd, struct copy_args *a) +{ + int x = 0, need = 0; + int profile_size = sizeof(struct dn_profile) - + ED_MAX_SAMPLES_NO*sizeof(int); + + /* NOTE about compute space: + * NP = dn_cfg.schk_count + * NSI = dn_cfg.si_count + * NF = dn_cfg.fsk_count + * NQ = dn_cfg.queue_count + * - ipfw pipe show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI)*(dn_flow) all scheduler instance (includes + * the queue instance) + * - ipfw sched show + * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler + * link, scheduler template, flowset + * integrated in scheduler and header + * for flowset list + * (NSI * dn_flow) all scheduler instances + * (NF * sizeof(uint_32)) space for flowset list linked to scheduler + * (NQ * dn_queue) all queue [XXXfor now not listed] + * - ipfw queue show + * (NF * dn_fs) all flowset + * (NQ * dn_queue) all queues + */ + switch (cmd->subtype) { + default: + return -1; + /* XXX where do LINK and SCH differ ? */ + /* 'ipfw sched show' could list all queues associated to + * a scheduler. 
This feature for now is disabled + */ + case DN_LINK: /* pipe show */ + x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + break; + case DN_SCH: /* sched show */ + need += dn_cfg.schk_count * + (sizeof(struct dn_fs) + profile_size) / 2; + need += dn_cfg.fsk_count * sizeof(uint32_t); + x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; + break; + case DN_FS: /* queue show */ + x = DN_C_FS | DN_C_QUEUE; + break; + case DN_GET_COMPAT: /* compatibility mode */ + need = dn_compat_calc_size(dn_cfg); + break; + } + a->flags = x; + if (x & DN_C_SCH) { + need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; + /* NOT also, each fs might be attached to a sched */ + need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; + } + if (x & DN_C_FS) + need += dn_cfg.fsk_count * sizeof(struct dn_fs); + if (x & DN_C_LINK) { + need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; + } + /* + * When exporting a queue to userland, only pass up the + * struct dn_flow, which is the only visible part. + */ + + if (x & DN_C_QUEUE) + need += dn_cfg.queue_count * sizeof(struct dn_flow); + if (x & DN_C_FLOW) + need += dn_cfg.si_count * (sizeof(struct dn_flow)); + return need; +} + +/* + * If compat != NULL dummynet_get is called in compatibility mode. + * *compat will be the pointer to the buffer to pass to ipfw + */ +int +dummynet_get(struct sockopt *sopt, void **compat) +{ + int have, i, need, error; + char *start = NULL, *buf; + size_t sopt_valsize; + struct dn_id *cmd; + struct copy_args a; + struct copy_range r; + int l = sizeof(struct dn_id); + + bzero(&a, sizeof(a)); + bzero(&r, sizeof(r)); + + /* save and restore original sopt_valsize around copyin */ + sopt_valsize = sopt->sopt_valsize; + + cmd = &r.o; + + if (!compat) { + /* copy at least an oid, and possibly a full object */ + error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + l = cmd->len; +#ifdef EMULATE_SYSCTL + /* sysctl emulation. */ + if (cmd->type == DN_SYSCTL_GET) + return kesysctl_emu_get(sopt); +#endif + if (l > sizeof(r)) { + /* request larger than default, allocate buffer */ + cmd = malloc(l, M_DUMMYNET, M_WAIT); + if (cmd == NULL) + return ENOMEM; //XXX + error = sooptcopyin(sopt, cmd, l, l); + sopt->sopt_valsize = sopt_valsize; + if (error) + goto done; + } + } else { /* compatibility */ + error = 0; + cmd->type = DN_CMD_GET; + cmd->len = sizeof(struct dn_id); + cmd->subtype = DN_GET_COMPAT; + // cmd->id = sopt_valsize; + D("compatibility mode"); + } + a.extra = (struct copy_range *)cmd; + if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ + uint32_t *rp = (uint32_t *)(cmd + 1); + cmd->len += 2* sizeof(uint32_t); + rp[0] = 1; + rp[1] = DN_MAX_ID - 1; + if (cmd->subtype == DN_LINK) { + rp[0] += DN_MAX_ID; + rp[1] += DN_MAX_ID; + } + } + /* Count space (under lock) and allocate (outside lock). + * Exit with lock held if we manage to get enough buffer. + * Try a few times then give up. 
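+ */
+
+/*
+ * [Editor's note] The shape of that count/allocate/retry pattern,
+ * condensed; the lock(), unlock() and compute_space() names and the
+ * simplified error handling are assumptions for illustration.
+ */
+#if 0	/* illustrative only, not compiled */
+	for (have = 0, i = 0; i < 10; i++) {
+		lock();
+		need = compute_space();	/* counts are stable under lock */
+		if (have >= need)
+			break;		/* success: proceed with lock held */
+		unlock();
+		free(buf);
+		have = need;
+		buf = malloc(have);	/* may sleep; counts may change */
+	}
+#endif
+
+/*
+ * The loop below implements this.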
+ */ + for (have = 0, i = 0; i < 10; i++) { + DN_BH_WLOCK(); + need = compute_space(cmd, &a); + + /* if there is a range, ignore value from compute_space() */ + if (l > sizeof(*cmd)) + need = sopt_valsize - sizeof(*cmd); + + if (need < 0) { + DN_BH_WUNLOCK(); + error = EINVAL; + goto done; + } + need += sizeof(*cmd); + cmd->id = need; + if (have >= need) + break; + + DN_BH_WUNLOCK(); + if (start) + free(start, M_DUMMYNET); + start = NULL; + if (need > sopt_valsize) + break; + + have = need; + start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); + if (start == NULL) { + error = ENOMEM; + goto done; + } + } + + if (start == NULL) { + if (compat) { + *compat = NULL; + error = 1; // XXX + } else { + error = sooptcopyout(sopt, cmd, sizeof(*cmd)); + } + goto done; + } + ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " + "%d:%d si %d, %d:%d queues %d", + dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, + dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, + dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, + dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, + dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); + sopt->sopt_valsize = sopt_valsize; + a.type = cmd->subtype; + + if (compat == NULL) { + bcopy(cmd, start, sizeof(*cmd)); + ((struct dn_id*)(start))->len = sizeof(struct dn_id); + buf = start + sizeof(*cmd); + } else + buf = start; + a.start = &buf; + a.end = start + have; + /* start copying other objects */ + if (compat) { + a.type = DN_COMPAT_PIPE; + dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); + a.type = DN_COMPAT_QUEUE; + dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); + } else if (a.type == DN_FS) { + dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); + } else { + dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); + } + DN_BH_WUNLOCK(); + + if (compat) { + *compat = start; + sopt->sopt_valsize = buf - start; + /* free() is done by ip_dummynet_compat() */ + start = NULL; //XXX hack + } else { + error = sooptcopyout(sopt, start, buf - start); + } +done: + if (cmd && cmd != &r.o) + free(cmd, M_DUMMYNET); + if (start) + free(start, M_DUMMYNET); + return error; +} + +/* Callback called on scheduler instance to delete it if idle */ +static int +drain_scheduler_cb(void *_si, void *arg) +{ + struct dn_sch_inst *si = _si; + + if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) + return 0; + + if (si->sched->fp->flags & DN_MULTIQUEUE) { + if (si->q_count == 0) + return si_destroy(si, NULL); + else + return 0; + } else { /* !DN_MULTIQUEUE */ + if ((si+1)->ni.length == 0) + return si_destroy(si, NULL); + else + return 0; + } + return 0; /* unreachable */ +} + +/* Callback called on scheduler to check if it has instances */ +static int +drain_scheduler_sch_cb(void *_s, void *arg) +{ + struct dn_schk *s = _s; + + if (s->sch.flags & DN_HAVE_MASK) { + dn_ht_scan_bucket(s->siht, &s->drain_bucket, + drain_scheduler_cb, NULL); + s->drain_bucket++; + } else { + if (s->siht) { + if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) + s->siht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of scheduler */ +void +dn_drain_scheduler(void) +{ + dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, + drain_scheduler_sch_cb, NULL); + dn_cfg.drain_sch++; +} + +/* Callback called on queue to delete if it is idle */ +static int +drain_queue_cb(void *_q, void *arg) +{ + struct dn_queue *q = _q; + + if (q->ni.length == 0) { + dn_delete_queue(q, DN_DESTROY); + return DNHT_SCAN_DEL; /* queue is deleted */ + } + + return 0; /* queue isn't 
deleted */ +} + +/* Callback called on flowset used to check if it has queues */ +static int +drain_queue_fs_cb(void *_fs, void *arg) +{ + struct dn_fsk *fs = _fs; + + if (fs->fs.flags & DN_QHT_HASH) { + /* Flowset has a hash table for queues */ + dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, + drain_queue_cb, NULL); + fs->drain_bucket++; + } else { + /* No hash table for this flowset, null the pointer + * if the queue is deleted + */ + if (fs->qht) { + if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) + fs->qht = NULL; + } + } + return 0; +} + +/* Called every tick, try to delete a 'bucket' of queue */ +void +dn_drain_queue(void) +{ + /* scan a bucket of flowset */ + dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, + drain_queue_fs_cb, NULL); + dn_cfg.drain_fs++; +} + +/* + * Handler for the various dummynet socket options + */ +static int +ip_dn_ctl(struct sockopt *sopt) +{ + void *p = NULL; + int error, l; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); + if (error) + return (error); + + /* Disallow sets in really-really secure mode. */ + if (sopt->sopt_dir == SOPT_SET) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + switch (sopt->sopt_name) { + default : + D("dummynet: unknown option %d", sopt->sopt_name); + error = EINVAL; + break; + + case IP_DUMMYNET_FLUSH: + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: /* remove a pipe or queue */ + case IP_DUMMYNET_GET: + D("dummynet: compat option %d", sopt->sopt_name); + error = ip_dummynet_compat(sopt); + break; + + case IP_DUMMYNET3 : + if (sopt->sopt_dir == SOPT_GET) { + error = dummynet_get(sopt, NULL); + break; + } + l = sopt->sopt_valsize; + if (l < sizeof(struct dn_id) || l > 12000) { + D("argument len %d invalid", l); + break; + } + p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? + error = sooptcopyin(sopt, p, l, l); + if (error) + break ; + error = do_config(p, l); + break; + } + + if (p != NULL) + free(p, M_TEMP); + + return error ; +} + + +static void +ip_dn_init(void) +{ + static int init_done = 0; + + if (init_done) + return; + init_done = 1; + if (bootverbose) + printf("DUMMYNET with IPv6 initialized (100131)\n"); + + /* Set defaults here. MSVC does not accept initializers, + * and this is also useful for vimages + */ + /* queue limits */ + dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ + dn_cfg.byte_limit = 1024 * 1024; + dn_cfg.expire = 1; + + /* RED parameters */ + dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ + dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ + dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ + + /* hash tables */ + dn_cfg.max_hash_size = 1024; /* max in the hash tables */ + dn_cfg.hash_size = 64; /* default hash size */ + + /* create hash tables for schedulers and flowsets. + * In both we search by key and by pointer. 
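+ */
+
+/*
+ * [Editor's note] What "by key and by pointer" means in practice,
+ * sketched from schk_hash()/schk_match() earlier in this file.
+ */
+#if 0	/* illustrative only, not compiled */
+	/* lookup by number, e.g. while handling a user command: */
+	s = dn_ht_find(dn_cfg.schedhash, sched_nr, 0, NULL);
+	/* lookup by object: DNHT_KEY_IS_OBJ tells the hash and match
+	 * callbacks to read sched_nr out of the object itself. */
+	s = dn_ht_find(dn_cfg.schedhash, (uintptr_t)s, DNHT_KEY_IS_OBJ, NULL);
+#endif
+
+/*
+ * The two tables are created below.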
+ */ + dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_schk, schk_next), + schk_hash, schk_match, schk_new); + dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, + offsetof(struct dn_fsk, fsk_next), + fsk_hash, fsk_match, fsk_new); + + /* bucket index to drain object */ + dn_cfg.drain_fs = 0; + dn_cfg.drain_sch = 0; + + heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); + SLIST_INIT(&dn_cfg.fsu); + SLIST_INIT(&dn_cfg.schedlist); + + DN_LOCK_INIT(); + ip_dn_ctl_ptr = ip_dn_ctl; + ip_dn_io_ptr = dummynet_io; + + TASK_INIT(&dn_task, 0, dummynet_task, NULL); + dn_tq = taskqueue_create_fast("dummynet", M_NOWAIT, + taskqueue_thread_enqueue, &dn_tq); + taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); + + callout_init(&dn_timeout, CALLOUT_MPSAFE); + callout_reset(&dn_timeout, 1, dummynet, NULL); + + /* Initialize curr_time adjustment mechanics. */ + getmicrouptime(&dn_cfg.prev_t); +} + +#ifdef KLD_MODULE +static void +ip_dn_destroy(void) +{ + callout_drain(&dn_timeout); + + DN_BH_WLOCK(); + ip_dn_ctl_ptr = NULL; + ip_dn_io_ptr = NULL; + + dummynet_flush(); + DN_BH_WUNLOCK(); + taskqueue_drain(dn_tq, &dn_task); + taskqueue_free(dn_tq); + + dn_ht_free(dn_cfg.schedhash, 0); + dn_ht_free(dn_cfg.fshash, 0); + heap_free(&dn_cfg.evheap); + + DN_LOCK_DESTROY(); +} +#endif /* KLD_MODULE */ + +static int +dummynet_modevent(module_t mod, int type, void *data) +{ + + if (type == MOD_LOAD) { + if (ip_dn_io_ptr) { + printf("DUMMYNET already loaded\n"); + return EEXIST ; + } + ip_dn_init(); + return 0; + } else if (type == MOD_UNLOAD) { +#if !defined(KLD_MODULE) + printf("dummynet statically compiled, cannot unload\n"); + return EINVAL ; +#else + ip_dn_destroy(); + return 0; +#endif + } else + return EOPNOTSUPP; +} + +/* modevent helpers for the modules */ +static int +load_dn_sched(struct dn_alg *d) +{ + struct dn_alg *s; + + if (d == NULL) + return 1; /* error */ + ip_dn_init(); /* just in case, we need the lock */ + + /* Check that mandatory funcs exists */ + if (d->enqueue == NULL || d->dequeue == NULL) { + D("missing enqueue or dequeue for %s", d->name); + return 1; + } + + /* Search if scheduler already exists */ + DN_BH_WLOCK(); + SLIST_FOREACH(s, &dn_cfg.schedlist, next) { + if (strcmp(s->name, d->name) == 0) { + D("%s already loaded", d->name); + break; /* scheduler already exists */ + } + } + if (s == NULL) + SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); + DN_BH_WUNLOCK(); + D("dn_sched %s %sloaded", d->name, s ? "not ":""); + return s ? 1 : 0; +} + +static int +unload_dn_sched(struct dn_alg *s) +{ + struct dn_alg *tmp, *r; + int err = EINVAL; + + D("called for %s", s->name); + + DN_BH_WLOCK(); + SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { + if (strcmp(s->name, r->name) != 0) + continue; + D("ref_count = %d", r->ref_count); + err = (r->ref_count != 0) ? EBUSY : 0; + if (err == 0) + SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); + break; + } + DN_BH_WUNLOCK(); + D("dn_sched %s %sunloaded", s->name, err ? 
"not ":""); + return err; +} + +int +dn_sched_modevent(module_t mod, int cmd, void *arg) +{ + struct dn_alg *sch = arg; + + if (cmd == MOD_LOAD) + return load_dn_sched(sch); + else if (cmd == MOD_UNLOAD) + return unload_dn_sched(sch); + else + return EINVAL; +} + +static moduledata_t dummynet_mod = { + "dummynet", dummynet_modevent, NULL +}; + +DECLARE_MODULE(dummynet, dummynet_mod, + SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY-1); +MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); +MODULE_VERSION(dummynet, 1); +/* end of file */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw2.c b/freebsd/sys/netinet/ipfw/ip_fw2.c new file mode 100644 index 00000000..682cced1 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw2.c @@ -0,0 +1,2495 @@ +#include + +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * The FreeBSD IP packet firewall, main file + */ + +#if !defined(KLD_MODULE) +#include +#include +#include +#include +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include +#include +#endif + +#include /* XXX for in_cksum */ + +#ifdef MAC +#include +#endif + +/* + * static variables followed by global ones. + * All ipfw global variables are here. + */ + +/* ipfw_vnet_ready controls when we are open for business */ +static VNET_DEFINE(int, ipfw_vnet_ready) = 0; +#define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) + +static VNET_DEFINE(int, fw_deny_unknown_exthdrs); +#define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) + +#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT +static int default_to_accept = 1; +#else +static int default_to_accept; +#endif + +VNET_DEFINE(int, autoinc_step); + +/* + * Each rule belongs to one of 32 different sets (0..31). + * The variable set_disable contains one bit per set. 
+ * If the bit is set, all rules in the corresponding set + * are disabled. Set RESVD_SET(31) is reserved for the default rule + * and rules that are not deleted by the flush command, + * and CANNOT be disabled. + * Rules in set RESVD_SET can only be deleted individually. + */ +VNET_DEFINE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DEFINE(int, fw_verbose); +/* counter for ipfw_log(NULL...) */ +VNET_DEFINE(u_int64_t, norule_counter); +VNET_DEFINE(int, verbose_limit); + +/* layer3_chain contains the list of rules for layer 3 */ +VNET_DEFINE(struct ip_fw_chain, layer3_chain); + +ipfw_nat_t *ipfw_nat_ptr = NULL; +struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); +ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_del_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#ifdef SYSCTL_NODE +uint32_t dummy_def = IPFW_DEFAULT_RULE; +uint32_t dummy_tables_max = IPFW_TABLES_MAX; + +SYSBEGIN(f3) + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, + "Only do a single pass through ipfw when using dummynet(4)"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, + CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, + "Rule number auto-increment step"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, + CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, + "Log matches to ipfw rules"); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, + CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, + "Set upper limit of matches of ipfw rules logged"); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, + &dummy_def, 0, + "The default/max possible rule number."); +SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_RD, + &dummy_tables_max, 0, + "The maximum number of tables."); +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, + &default_to_accept, 0, + "Make the default rule accept all packets."); +TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); +SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, + CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, + "Number of static rules"); + +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6); +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); +SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, + CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, + "Deny packets with unknown IPv6 Extension Headers"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + + +/* + * Some macros used in the various matching options. + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +static __inline int +icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) +{ + int type = icmp->icmp_type; + + return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; + + return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. + * + * We scan options and store the bits we find set. 
We succeed if + * + * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear + * + * The code is sometimes optimized not to store additional variables. + */ + +static int +flags_match(ipfw_insn *cmd, u_int8_t bits) +{ + u_char want_clear; + bits = ~bits; + + if ( ((cmd->arg1 & 0xff) & bits) != 0) + return 0; /* some bits we want set were clear */ + want_clear = (cmd->arg1 >> 8) & 0xff; + if ( (want_clear & bits) != want_clear) + return 0; /* some bits we want clear were set */ + return 1; +} + +static int +ipopts_match(struct ip *ip, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(ip + 1); + int x = (ip->ip_hl << 2) - sizeof (struct ip); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[IPOPT_OPTVAL]; + + if (opt == IPOPT_EOL) + break; + if (opt == IPOPT_NOP) + optlen = 1; + else { + optlen = cp[IPOPT_OLEN]; + if (optlen <= 0 || optlen > x) + return 0; /* invalid or truncated */ + } + switch (opt) { + + default: + break; + + case IPOPT_LSRR: + bits |= IP_FW_IPOPT_LSRR; + break; + + case IPOPT_SSRR: + bits |= IP_FW_IPOPT_SSRR; + break; + + case IPOPT_RR: + bits |= IP_FW_IPOPT_RR; + break; + + case IPOPT_TS: + bits |= IP_FW_IPOPT_TS; + break; + } + } + return (flags_match(cmd, bits)); +} + +static int +tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) +{ + int optlen, bits = 0; + u_char *cp = (u_char *)(tcp + 1); + int x = (tcp->th_off << 2) - sizeof(struct tcphdr); + + for (; x > 0; x -= optlen, cp += optlen) { + int opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + optlen = cp[1]; + if (optlen <= 0) + break; + } + + switch (opt) { + + default: + break; + + case TCPOPT_MAXSEG: + bits |= IP_FW_TCPOPT_MSS; + break; + + case TCPOPT_WINDOW: + bits |= IP_FW_TCPOPT_WINDOW; + break; + + case TCPOPT_SACK_PERMITTED: + case TCPOPT_SACK: + bits |= IP_FW_TCPOPT_SACK; + break; + + case TCPOPT_TIMESTAMP: + bits |= IP_FW_TCPOPT_TS; + break; + + } + } + return (flags_match(cmd, bits)); +} + +static int +iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) +{ + if (ifp == NULL) /* no iface with this packet, match fails */ + return 0; + /* Check by name or by IP address */ + if (cmd->name[0] != '\0') { /* match by name */ + /* Check name */ + if (cmd->p.glob) { + if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) + return(1); + } else { + if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) + return(1); + } + } else { +#ifdef __FreeBSD__ /* and OSX too ? */ + struct ifaddr *ia; + + if_addr_rlock(ifp); + TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { + if (ia->ifa_addr->sa_family != AF_INET) + continue; + if (cmd->p.ip.s_addr == ((struct sockaddr_in *) + (ia->ifa_addr))->sin_addr.s_addr) { + if_addr_runlock(ifp); + return(1); /* match */ + } + } + if_addr_runlock(ifp); +#endif /* __FreeBSD__ */ + } + return(0); /* no match, fail ... */ +} + +/* + * The verify_path function checks if a route to the src exists and + * if it is reachable via ifp (when provided). + * + * The 'verrevpath' option checks that the interface that an IP packet + * arrives on is the same interface that traffic destined for the + * packet's source address would be routed out of. + * The 'versrcreach' option just checks that the source address is + * reachable via any route (except default) in the routing table. + * These two are a measure to block forged packets. This is also + * commonly known as "anti-spoofing" or Unicast Reverse Path + * Forwarding (Unicast RFP) in Cisco-ese. 
The name of the knobs + * is purposely reminiscent of the Cisco IOS command, + * + * ip verify unicast reverse-path + * ip verify unicast source reachable-via any + * + * which implements the same functionality. But note that the syntax + * is misleading, and the check may be performed on all IP packets + * whether unicast, multicast, or broadcast. + */ +static int +verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) +{ +#ifndef __FreeBSD__ + return 0; +#else + struct route ro; + struct sockaddr_in *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in *)&(ro.ro_dst); + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = src; + in_rtalloc_ign(&ro, 0, fib); + + if (ro.ro_rt == NULL) + return 0; + + /* + * If ifp is provided, check for equality with rtentry. + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * in order to pass packets injected back by if_simloop(): + * if useloopback == 1 routing entry (via lo0) for our own address + * may exist, so we need to handle routing assymetry. + */ + if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { + RTFREE(ro.ro_rt); + return 0; + } + + /* if no ifp provided, check if rtentry is not default route */ + if (ifp == NULL && + satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { + RTFREE(ro.ro_rt); + return 0; + } + + /* or if this is a blackhole/reject route */ + if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + RTFREE(ro.ro_rt); + return 0; + } + + /* found valid route */ + RTFREE(ro.ro_rt); + return 1; +#endif /* __FreeBSD__ */ +} + +#ifdef INET6 +/* + * ipv6 specific rules here... + */ +static __inline int +icmp6type_match (int type, ipfw_insn_u32 *cmd) +{ + return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); +} + +static int +flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) +{ + int i; + for (i=0; i <= cmd->o.arg1; ++i ) + if (curr_flow == cmd->d[i] ) + return 1; + return 0; +} + +/* support for IP6_*_ME opcodes */ +static int +search_ip6_addr_net (struct in6_addr * ip6_addr) +{ + struct ifnet *mdc; + struct ifaddr *mdc2; + struct in6_ifaddr *fdm; + struct in6_addr copia; + + TAILQ_FOREACH(mdc, &V_ifnet, if_link) { + if_addr_rlock(mdc); + TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { + if (mdc2->ifa_addr->sa_family == AF_INET6) { + fdm = (struct in6_ifaddr *)mdc2; + copia = fdm->ia_addr.sin6_addr; + /* need for leaving scope_id in the sock_addr */ + in6_clearscope(&copia); + if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { + if_addr_runlock(mdc); + return 1; + } + } + } + if_addr_runlock(mdc); + } + return 0; +} + +static int +verify_path6(struct in6_addr *src, struct ifnet *ifp) +{ + struct route_in6 ro; + struct sockaddr_in6 *dst; + + bzero(&ro, sizeof(ro)); + + dst = (struct sockaddr_in6 * )&(ro.ro_dst); + dst->sin6_family = AF_INET6; + dst->sin6_len = sizeof(*dst); + dst->sin6_addr = *src; + /* XXX MRT 0 for ipv6 at this time */ + rtalloc_ign((struct route *)&ro, 0); + + if (ro.ro_rt == NULL) + return 0; + + /* + * if ifp is provided, check for equality with rtentry + * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, + * to support the case of sending packets to an address of our own. 
+ * (where the former interface is the first argument of if_simloop()
+ * (=ifp), the latter is lo0)
+ */
+	if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) {
+		RTFREE(ro.ro_rt);
+		return 0;
+	}
+
+	/* if no ifp provided, check if rtentry is not default route */
+	if (ifp == NULL &&
+	    IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) {
+		RTFREE(ro.ro_rt);
+		return 0;
+	}
+
+	/* or if this is a blackhole/reject route */
+	if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+		RTFREE(ro.ro_rt);
+		return 0;
+	}
+
+	/* found valid route */
+	RTFREE(ro.ro_rt);
+	return 1;
+
+}
+
+static int
+is_icmp6_query(int icmp6_type)
+{
+	if ((icmp6_type <= ICMP6_MAXTYPE) &&
+	    (icmp6_type == ICMP6_ECHO_REQUEST ||
+	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
+	    icmp6_type == ICMP6_WRUREQUEST ||
+	    icmp6_type == ICMP6_FQDN_QUERY ||
+	    icmp6_type == ICMP6_NI_QUERY))
+		return (1);
+
+	return (0);
+}
+
+static void
+send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
+{
+	struct mbuf *m;
+
+	m = args->m;
+	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
+		struct tcphdr *tcp;
+		tcp = (struct tcphdr *)((char *)ip6 + hlen);
+
+		if ((tcp->th_flags & TH_RST) == 0) {
+			struct mbuf *m0;
+			m0 = ipfw_send_pkt(args->m, &(args->f_id),
+			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+			    tcp->th_flags | TH_RST);
+			if (m0 != NULL)
+				ip6_output(m0, NULL, NULL, 0, NULL, NULL,
+				    NULL);
+		}
+		FREE_PKT(m);
+	} else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */
+#if 0
+		/*
+		 * Unlike above, the mbufs need to line up with the ip6 hdr,
+		 * as the contents are read. We need to m_adj() the
+		 * needed amount.
+		 * The mbuf will however be thrown away so we can adjust it.
+		 * Remember we did an m_pullup on it already so we
+		 * can make some assumptions about contiguousness.
+		 */
+		if (args->L3offset)
+			m_adj(m, args->L3offset);
+#endif
+		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
+	} else
+		FREE_PKT(m);
+
+	args->m = NULL;
+}
+
+#endif /* INET6 */
+
+
+/*
+ * Sends a reject message, consuming the mbuf passed as an argument.
+ */
+static void
+send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
+{
+
+#if 0
+	/*
+	 * XXX When ip is not guaranteed to be at mtod() we will
+	 * need to account for this.
+	 * The mbuf will however be thrown away so we can adjust it.
+	 * Remember we did an m_pullup on it already so we
+	 * can make some assumptions about contiguousness.
+	 */
+	if (args->L3offset)
+		m_adj(m, args->L3offset);
+#endif
+	if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */
+		/* We need the IP header in host order for icmp_error(). */
+		SET_HOST_IPLEN(ip);
+		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
+	} else if (args->f_id.proto == IPPROTO_TCP) {
+		struct tcphdr *const tcp =
+		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
+		if ((tcp->th_flags & TH_RST) == 0) {
+			struct mbuf *m;
+			m = ipfw_send_pkt(args->m, &(args->f_id),
+			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
+			    tcp->th_flags | TH_RST);
+			if (m != NULL)
+				ip_output(m, NULL, NULL, 0, NULL, NULL);
+		}
+		FREE_PKT(args->m);
+	} else
+		FREE_PKT(args->m);
+	args->m = NULL;
+}
+
+/*
+ * Support for uid/gid/jail lookup. These tests are expensive
+ * (because we may need to look into the list of active sockets)
+ * so we cache the results. ugid_lookupp is 0 if we have not
+ * yet done a lookup, 1 if we succeeded, and -1 if we tried
+ * and failed. The function always returns the match value.
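+ */
+
+/*
+ * [Editor's note] The caching protocol in miniature;
+ * expensive_pcb_lookup() is a hypothetical stand-in for the
+ * in_pcblookup_hash() path in the function below.
+ */
+#if 0	/* illustrative only, not compiled */
+	if (*ugid_lookupp == -1)	/* an earlier attempt failed */
+		return (0);		/* no credentials, no match */
+	if (*ugid_lookupp == 0) {	/* first test on this packet */
+		*uc = expensive_pcb_lookup();
+		*ugid_lookupp = (*uc != NULL) ? 1 : -1;
+	}
+	/* *ugid_lookupp == 1: *uc is valid and can be reused */
+#endif
+
+/*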
+ * We could actually spare the variable and use *uc, setting + * it to '(void *)check_uidgid if we have no info, NULL if + * we tried and failed, or any other value if successful. + */ +static int +check_uidgid(ipfw_insn_u32 *insn, int proto, struct ifnet *oif, + struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, + u_int16_t src_port, int *ugid_lookupp, + struct ucred **uc, struct inpcb *inp) +{ +#ifndef __FreeBSD__ + return cred_check(insn, proto, oif, + dst_ip, dst_port, src_ip, src_port, + (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); +#else /* FreeBSD */ + struct inpcbinfo *pi; + int wildcard; + struct inpcb *pcb; + int match; + + /* + * Check to see if the UDP or TCP stack supplied us with + * the PCB. If so, rather then holding a lock and looking + * up the PCB, we can use the one that was supplied. + */ + if (inp && *ugid_lookupp == 0) { + INP_LOCK_ASSERT(inp); + if (inp->inp_socket != NULL) { + *uc = crhold(inp->inp_cred); + *ugid_lookupp = 1; + } else + *ugid_lookupp = -1; + } + /* + * If we have already been here and the packet has no + * PCB entry associated with it, then we can safely + * assume that this is a no match. + */ + if (*ugid_lookupp == -1) + return (0); + if (proto == IPPROTO_TCP) { + wildcard = 0; + pi = &V_tcbinfo; + } else if (proto == IPPROTO_UDP) { + wildcard = INPLOOKUP_WILDCARD; + pi = &V_udbinfo; + } else + return 0; + match = 0; + if (*ugid_lookupp == 0) { + INP_INFO_RLOCK(pi); + pcb = (oif) ? + in_pcblookup_hash(pi, + dst_ip, htons(dst_port), + src_ip, htons(src_port), + wildcard, oif) : + in_pcblookup_hash(pi, + src_ip, htons(src_port), + dst_ip, htons(dst_port), + wildcard, NULL); + if (pcb != NULL) { + *uc = crhold(pcb->inp_cred); + *ugid_lookupp = 1; + } + INP_INFO_RUNLOCK(pi); + if (*ugid_lookupp == 0) { + /* + * We tried and failed, set the variable to -1 + * so we will not try again on this packet. + */ + *ugid_lookupp = -1; + return (0); + } + } + if (insn->o.opcode == O_UID) + match = ((*uc)->cr_uid == (uid_t)insn->d[0]); + else if (insn->o.opcode == O_GID) + match = groupmember((gid_t)insn->d[0], *uc); + else if (insn->o.opcode == O_JAIL) + match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); + return match; +#endif /* __FreeBSD__ */ +} + +/* + * Helper function to set args with info on the rule after the matching + * one. slot is precise, whereas we guess rule_id as they are + * assigned sequentially. + */ +static inline void +set_match(struct ip_fw_args *args, int slot, + struct ip_fw_chain *chain) +{ + args->rule.chain_id = chain->id; + args->rule.slot = slot + 1; /* we use 0 as a marker */ + args->rule.rule_id = 1 + chain->map[slot]->id; + args->rule.rulenum = chain->map[slot]->rulenum; +} + +/* + * The main check routine for the firewall. + * + * All arguments are in args so we can modify them and return them + * back to the caller. + * + * Parameters: + * + * args->m (in/out) The packet; we set to NULL when/if we nuke it. + * Starts with the IP header. + * args->eh (in) Mac header if present, NULL for layer3 packet. + * args->L3offset Number of bytes bypassed if we came from L2. + * e.g. often sizeof(eh) ** NOTYET ** + * args->oif Outgoing interface, NULL if packet is incoming. + * The incoming interface is in the mbuf. (in) + * args->divert_rule (in/out) + * Skip up to the first rule past this rule number; + * upon return, non-zero port number for divert or tee. + * + * args->rule Pointer to the last matching rule (in/out) + * args->next_hop Socket we are forwarding to (out). 
+ * args->f_id Addresses grabbed from the packet (out) + * args->rule.info a cookie depending on rule action + * + * Return value: + * + * IP_FW_PASS the packet must be accepted + * IP_FW_DENY the packet must be dropped + * IP_FW_DIVERT divert packet, port in m_tag + * IP_FW_TEE tee packet, port in m_tag + * IP_FW_DUMMYNET to dummynet, pipe in args->cookie + * IP_FW_NETGRAPH into netgraph, cookie args->cookie + * args->rule contains the matching rule, + * args->rule.info has additional information. + * + */ +int +ipfw_chk(struct ip_fw_args *args) +{ + + /* + * Local variables holding state while processing a packet: + * + * IMPORTANT NOTE: to speed up the processing of rules, there + * are some assumption on the values of the variables, which + * are documented here. Should you change them, please check + * the implementation of the various instructions to make sure + * that they still work. + * + * args->eh The MAC header. It is non-null for a layer2 + * packet, it is NULL for a layer-3 packet. + * **notyet** + * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. + * + * m | args->m Pointer to the mbuf, as received from the caller. + * It may change if ipfw_chk() does an m_pullup, or if it + * consumes the packet because it calls send_reject(). + * XXX This has to change, so that ipfw_chk() never modifies + * or consumes the buffer. + * ip is the beginning of the ip(4 or 6) header. + * Calculated by adding the L3offset to the start of data. + * (Until we start using L3offset, the packet is + * supposed to start with the ip header). + */ + struct mbuf *m = args->m; + struct ip *ip = mtod(m, struct ip *); + + /* + * For rules which contain uid/gid or jail constraints, cache + * a copy of the users credentials after the pcb lookup has been + * executed. This will speed up the processing of rules with + * these types of constraints, as well as decrease contention + * on pcb related locks. + */ +#ifndef __FreeBSD__ + struct bsd_ucred ucred_cache; +#else + struct ucred *ucred_cache = NULL; +#endif + int ucred_lookup = 0; + + /* + * oif | args->oif If NULL, ipfw_chk has been called on the + * inbound path (ether_input, ip_input). + * If non-NULL, ipfw_chk has been called on the outbound path + * (ether_output, ip_output). + */ + struct ifnet *oif = args->oif; + + int f_pos = 0; /* index of current rule in the array */ + int retval = 0; + + /* + * hlen The length of the IP header. + */ + u_int hlen = 0; /* hlen >0 means we have an IP pkt */ + + /* + * offset The offset of a fragment. offset != 0 means that + * we have a fragment at this offset of an IPv4 packet. + * offset == 0 means that (if this is an IPv4 packet) + * this is the first or only fragment. + * For IPv6 offset == 0 means there is no Fragment Header. + * If offset != 0 for IPv6 always use correct mask to + * get the correct offset because we add IP6F_MORE_FRAG + * to be able to dectect the first fragment which would + * otherwise have offset = 0. + */ + u_short offset = 0; + + /* + * Local copies of addresses. They are only valid if we have + * an IP packet. + * + * proto The protocol. Set to 0 for non-ip packets, + * or to the protocol read from the packet otherwise. + * proto != 0 means that we have an IPv4 packet. + * + * src_port, dst_port port numbers, in HOST format. Only + * valid for TCP and UDP packets. + * + * src_ip, dst_ip ip addresses, in NETWORK format. + * Only valid for IPv4 packets. 
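+ */
+
+/*
+ * [Editor's note] The IP6F_MORE_FRAG trick mentioned above, as it
+ * appears in the parsing loop further down; fh stands for a
+ * pulled-up struct ip6_frag pointer.
+ */
+#if 0	/* illustrative only, not compiled */
+	offset = fh->ip6f_offlg & IP6F_OFF_MASK;
+	offset |= fh->ip6f_offlg & IP6F_MORE_FRAG;
+	/* a first fragment has offset 0 but the M bit set, so offset
+	 * becomes nonzero; unfragmented packets keep offset == 0 */
+#endif
+
+/*
+ * End of the notes on local variables.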
+ */ + uint8_t proto; + uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ + struct in_addr src_ip, dst_ip; /* NOTE: network format */ + uint16_t iplen=0; + int pktlen; + uint16_t etype = 0; /* Host order stored ether type */ + + /* + * dyn_dir = MATCH_UNKNOWN when rules unchecked, + * MATCH_NONE when checked and not matched (q = NULL), + * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) + */ + int dyn_dir = MATCH_UNKNOWN; + ipfw_dyn_rule *q = NULL; + struct ip_fw_chain *chain = &V_layer3_chain; + + /* + * We store in ulp a pointer to the upper layer protocol header. + * In the ipv4 case this is easy to determine from the header, + * but for ipv6 we might have some additional headers in the middle. + * ulp is NULL if not found. + */ + void *ulp = NULL; /* upper layer protocol pointer. */ + + /* XXX ipv6 variables */ + int is_ipv6 = 0; + uint8_t icmp6_type = 0; + uint16_t ext_hd = 0; /* bits vector for extension header filtering */ + /* end of ipv6 variables */ + + int is_ipv4 = 0; + + int done = 0; /* flag to exit the outer loop */ + + if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) + return (IP_FW_PASS); /* accept */ + + dst_ip.s_addr = 0; /* make sure it is initialized */ + src_ip.s_addr = 0; /* make sure it is initialized */ + pktlen = m->m_pkthdr.len; + args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ + proto = args->f_id.proto = 0; /* mark f_id invalid */ + /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ + +/* + * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, + * then it sets p to point at the offset "len" in the mbuf. WARNING: the + * pointer might become stale after other pullups (but we never use it + * this way). + */ +#define PULLUP_TO(_len, p, T) \ +do { \ + int x = (_len) + sizeof(T); \ + if ((m)->m_len < x) { \ + args->m = m = m_pullup(m, x); \ + if (m == NULL) \ + goto pullup_failed; \ + } \ + p = (mtod(m, char *) + (_len)); \ +} while (0) + + /* + * if we have an ether header, + */ + if (args->eh) + etype = ntohs(args->eh->ether_type); + + /* Identify IP packets and fill up variables. 
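+ *
+ * A usage sketch for PULLUP_TO() (illustrative): the branches
+ * below issue e.g.
+ *	PULLUP_TO(hlen, ulp, struct tcphdr);
+ * which guarantees that hlen + sizeof(struct tcphdr) bytes are
+ * contiguous in the mbuf and leaves ulp pointing at the TCP
+ * header; if the pullup fails we jump to pullup_failed and the
+ * packet is denied.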
*/ + if (pktlen >= sizeof(struct ip6_hdr) && + (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { + struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; + is_ipv6 = 1; + args->f_id.addr_type = 6; + hlen = sizeof(struct ip6_hdr); + proto = ip6->ip6_nxt; + + /* Search extension headers to find upper layer protocols */ + while (ulp == NULL) { + switch (proto) { + case IPPROTO_ICMPV6: + PULLUP_TO(hlen, ulp, struct icmp6_hdr); + icmp6_type = ICMP6(ulp)->icmp6_type; + break; + + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_SCTP: + PULLUP_TO(hlen, ulp, struct sctphdr); + src_port = SCTP(ulp)->src_port; + dst_port = SCTP(ulp)->dest_port; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_HOPOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_HOPOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_ROUTING: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_rthdr); + switch (((struct ip6_rthdr *)ulp)->ip6r_type) { + case 0: + ext_hd |= EXT_RTHDR0; + break; + case 2: + ext_hd |= EXT_RTHDR2; + break; + default: + printf("IPFW2: IPV6 - Unknown Routing " + "Header type(%d)\n", + ((struct ip6_rthdr *)ulp)->ip6r_type); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + ext_hd |= EXT_ROUTING; + hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; + proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; + ulp = NULL; + break; + + case IPPROTO_FRAGMENT: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_frag); + ext_hd |= EXT_FRAGMENT; + hlen += sizeof (struct ip6_frag); + proto = ((struct ip6_frag *)ulp)->ip6f_nxt; + offset = ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_OFF_MASK; + /* Add IP6F_MORE_FRAG for offset of first + * fragment to be != 0. */ + offset |= ((struct ip6_frag *)ulp)->ip6f_offlg & + IP6F_MORE_FRAG; + if (offset == 0) { + printf("IPFW2: IPV6 - Invalid Fragment " + "Header\n"); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + break; + } + args->f_id.extra = + ntohl(((struct ip6_frag *)ulp)->ip6f_ident); + ulp = NULL; + break; + + case IPPROTO_DSTOPTS: /* RFC 2460 */ + PULLUP_TO(hlen, ulp, struct ip6_hbh); + ext_hd |= EXT_DSTOPTS; + hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; + proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; + ulp = NULL; + break; + + case IPPROTO_AH: /* RFC 2402 */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + ext_hd |= EXT_AH; + hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; + proto = ((struct ip6_ext *)ulp)->ip6e_nxt; + ulp = NULL; + break; + + case IPPROTO_ESP: /* RFC 2406 */ + PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ + /* Anything past Seq# is variable length and + * data past this ext. header is encrypted. */ + ext_hd |= EXT_ESP; + break; + + case IPPROTO_NONE: /* RFC 2460 */ + /* + * Packet ends here, and IPv6 header has + * already been pulled up. If ip6e_len!=0 + * then octets must be ignored. + */ + ulp = ip; /* non-NULL to get out of loop. */ + break; + + case IPPROTO_OSPFIGP: + /* XXX OSPF header check? */ + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + + case IPPROTO_PIM: + /* XXX PIM header check? 
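+ * (A minimal check, if one were ever added, might compare the
+ * version bits of the PIM header against PIM_VERSION before
+ * accepting it; a sketch only, not existing code.)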
*/ + PULLUP_TO(hlen, ulp, struct pim); + break; + + case IPPROTO_CARP: + PULLUP_TO(hlen, ulp, struct carp_header); + if (((struct carp_header *)ulp)->carp_version != + CARP_VERSION) + return (IP_FW_DENY); + if (((struct carp_header *)ulp)->carp_type != + CARP_ADVERTISEMENT) + return (IP_FW_DENY); + break; + + case IPPROTO_IPV6: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip6_hdr); + break; + + case IPPROTO_IPV4: /* RFC 2893 */ + PULLUP_TO(hlen, ulp, struct ip); + break; + + default: + printf("IPFW2: IPV6 - Unknown Extension " + "Header(%d), ext_hd=%x\n", proto, ext_hd); + if (V_fw_deny_unknown_exthdrs) + return (IP_FW_DENY); + PULLUP_TO(hlen, ulp, struct ip6_ext); + break; + } /*switch */ + } + ip = mtod(m, struct ip *); + ip6 = (struct ip6_hdr *)ip; + args->f_id.src_ip6 = ip6->ip6_src; + args->f_id.dst_ip6 = ip6->ip6_dst; + args->f_id.src_ip = 0; + args->f_id.dst_ip = 0; + args->f_id.flow_id6 = ntohl(ip6->ip6_flow); + } else if (pktlen >= sizeof(struct ip) && + (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { + is_ipv4 = 1; + hlen = ip->ip_hl << 2; + args->f_id.addr_type = 4; + + /* + * Collect parameters into local variables for faster matching. + */ + proto = ip->ip_p; + src_ip = ip->ip_src; + dst_ip = ip->ip_dst; + offset = ntohs(ip->ip_off) & IP_OFFMASK; + iplen = ntohs(ip->ip_len); + pktlen = iplen < pktlen ? iplen : pktlen; + + if (offset == 0) { + switch (proto) { + case IPPROTO_TCP: + PULLUP_TO(hlen, ulp, struct tcphdr); + dst_port = TCP(ulp)->th_dport; + src_port = TCP(ulp)->th_sport; + /* save flags for dynamic rules */ + args->f_id._flags = TCP(ulp)->th_flags; + break; + + case IPPROTO_UDP: + PULLUP_TO(hlen, ulp, struct udphdr); + dst_port = UDP(ulp)->uh_dport; + src_port = UDP(ulp)->uh_sport; + break; + + case IPPROTO_ICMP: + PULLUP_TO(hlen, ulp, struct icmphdr); + //args->f_id.flags = ICMP(ulp)->icmp_type; + break; + + default: + break; + } + } + + ip = mtod(m, struct ip *); + args->f_id.src_ip = ntohl(src_ip.s_addr); + args->f_id.dst_ip = ntohl(dst_ip.s_addr); + } +#undef PULLUP_TO + if (proto) { /* we may have port numbers, store them */ + args->f_id.proto = proto; + args->f_id.src_port = src_port = ntohs(src_port); + args->f_id.dst_port = dst_port = ntohs(dst_port); + } + + IPFW_RLOCK(chain); + if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ + IPFW_RUNLOCK(chain); + return (IP_FW_PASS); /* accept */ + } + if (args->rule.slot) { + /* + * Packet has already been tagged as a result of a previous + * match on rule args->rule aka args->rule_id (PIPE, QUEUE, + * REASS, NETGRAPH, DIVERT/TEE...) + * Validate the slot and continue from the next one + * if still present, otherwise do a lookup. + */ + f_pos = (args->rule.chain_id == chain->id) ? + args->rule.slot : + ipfw_find_rule(chain, args->rule.rulenum, + args->rule.rule_id); + } else { + f_pos = 0; + } + + /* + * Now scan the rules, and parse microinstructions for each rule. + * We have two nested loops and an inner switch. Sometimes we + * need to break out of one or both loops, or re-enter one of + * the loops with updated variables. Loop variables are: + * + * f_pos (outer loop) points to the current rule. + * On output it points to the matching rule. + * done (outer loop) is used as a flag to break the loop. + * l (inner loop) residual length of current rule. + * cmd points to the current microinstruction. + * + * We break the inner loop by setting l=0 and possibly + * cmdlen=0 if we don't want to advance cmd. 
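+ * (E.g., an illustrative trace: O_ACCEPT sets retval=0, l=0,
+ * done=1 so both loops unwind; O_SKIPTO instead reloads f_pos,
+ * f and cmd, sets l>0 with cmdlen=0, and restarts the inner
+ * loop at the target rule.)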
+ * We break the outer loop by setting done=1 + * We can restart the inner loop by setting l>0 and f_pos, f, cmd + * as needed. + */ + for (; f_pos < chain->n_rules; f_pos++) { + ipfw_insn *cmd; + uint32_t tablearg = 0; + int l, cmdlen, skip_or; /* skip rest of OR block */ + struct ip_fw *f; + + f = chain->map[f_pos]; + if (V_set_disable & (1 << f->set) ) + continue; + + skip_or = 0; + for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; + l -= cmdlen, cmd += cmdlen) { + int match; + + /* + * check_body is a jump target used when we find a + * CHECK_STATE, and need to jump to the body of + * the target rule. + */ + +/* check_body: */ + cmdlen = F_LEN(cmd); + /* + * An OR block (insn_1 || .. || insn_n) has the + * F_OR bit set in all but the last instruction. + * The first match will set "skip_or", and cause + * the following instructions to be skipped until + * past the one with the F_OR bit clear. + */ + if (skip_or) { /* skip this instruction */ + if ((cmd->len & F_OR) == 0) + skip_or = 0; /* next one is good */ + continue; + } + match = 0; /* set to 1 if we succeed */ + + switch (cmd->opcode) { + /* + * The first set of opcodes compares the packet's + * fields with some pattern, setting 'match' if a + * match is found. At the end of the loop there is + * logic to deal with F_NOT and F_OR flags associated + * with the opcode. + */ + case O_NOP: + match = 1; + break; + + case O_FORWARD_MAC: + printf("ipfw: opcode %d unimplemented\n", + cmd->opcode); + break; + + case O_GID: + case O_UID: + case O_JAIL: + /* + * We only check offset == 0 && proto != 0, + * as this ensures that we have a + * packet with the ports info. + */ + if (offset!=0) + break; + if (is_ipv6) /* XXX to be fixed later */ + break; + if (proto == IPPROTO_TCP || + proto == IPPROTO_UDP) + match = check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); +#else + (void *)&ucred_cache, + (struct inpcb *)args->m); +#endif + break; + + case O_RECV: + match = iface_match(m->m_pkthdr.rcvif, + (ipfw_insn_if *)cmd); + break; + + case O_XMIT: + match = iface_match(oif, (ipfw_insn_if *)cmd); + break; + + case O_VIA: + match = iface_match(oif ? oif : + m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd); + break; + + case O_MACADDR2: + if (args->eh != NULL) { /* have MAC header */ + u_int32_t *want = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->addr; + u_int32_t *mask = (u_int32_t *) + ((ipfw_insn_mac *)cmd)->mask; + u_int32_t *hdr = (u_int32_t *)args->eh; + + match = + ( want[0] == (hdr[0] & mask[0]) && + want[1] == (hdr[1] & mask[1]) && + want[2] == (hdr[2] & mask[2]) ); + } + break; + + case O_MAC_TYPE: + if (args->eh != NULL) { + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (etype >= p[0] && + etype <= p[1]); + } + break; + + case O_FRAG: + match = (offset != 0); + break; + + case O_IN: /* "out" is "not in" */ + match = (oif == NULL); + break; + + case O_LAYER2: + match = (args->eh != NULL); + break; + + case O_DIVERTED: + { + /* For diverted packets, args->rule.info + * contains the divert port (in host format) + * reason and direction. + */ + uint32_t i = args->rule.info; + match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && + cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); + } + break; + + case O_PROTO: + /* + * We do not allow an arg of 0 so the + * check of "proto" only suffices. 
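+ * (E.g. a rule matching TCP carries arg1 == IPPROTO_TCP (6);
+ * since proto stays 0 for non-IP packets, those can never
+ * match here.)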
+ */ + match = (proto == cmd->arg1); + break; + + case O_IP_SRC: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + src_ip.s_addr); + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (is_ipv4) { + uint32_t key = + (cmd->opcode == O_IP_DST_LOOKUP) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t v = 0; + + if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { + /* generic lookup. The key must be + * in 32bit big-endian format. + */ + v = ((ipfw_insn_u32 *)cmd)->d[1]; + if (v == 0) + key = dst_ip.s_addr; + else if (v == 1) + key = src_ip.s_addr; + else if (v == 6) /* dscp */ + key = (ip->ip_tos >> 2) & 0x3f; + else if (offset != 0) + break; + else if (proto != IPPROTO_TCP && + proto != IPPROTO_UDP) + break; + else if (v == 2) + key = htonl(dst_port); + else if (v == 3) + key = htonl(src_port); + else if (v == 4 || v == 5) { + check_uidgid( + (ipfw_insn_u32 *)cmd, + proto, oif, + dst_ip, dst_port, + src_ip, src_port, &ucred_lookup, +#ifdef __FreeBSD__ + &ucred_cache, args->inp); + if (v == 4 /* O_UID */) + key = ucred_cache->cr_uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache->cr_prison->pr_id; +#else /* !__FreeBSD__ */ + (void *)&ucred_cache, + (struct inpcb *)args->m); + if (v ==4 /* O_UID */) + key = ucred_cache.uid; + else if (v == 5 /* O_JAIL */) + key = ucred_cache.xid; +#endif /* !__FreeBSD__ */ + key = htonl(key); + } else + break; + } + match = ipfw_lookup_table(chain, + cmd->arg1, key, &v); + if (!match) + break; + if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) + match = + ((ipfw_insn_u32 *)cmd)->d[0] == v; + else + tablearg = v; + } + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + if (is_ipv4) { + uint32_t a = + (cmd->opcode == O_IP_DST_MASK) ? + dst_ip.s_addr : src_ip.s_addr; + uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; + int i = cmdlen-1; + + for (; !match && i>0; i-= 2, p+= 2) + match = (p[0] == (a & p[1])); + } + break; + + case O_IP_SRC_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(src_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_SRC_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); +#endif + break; + + case O_IP_DST_SET: + case O_IP_SRC_SET: + if (is_ipv4) { + u_int32_t *d = (u_int32_t *)(cmd+1); + u_int32_t addr = + cmd->opcode == O_IP_DST_SET ? + args->f_id.dst_ip : + args->f_id.src_ip; + + if (addr < d[0]) + break; + addr -= d[0]; /* subtract base */ + match = (addr < cmd->arg1) && + ( d[ 1 + (addr>>5)] & + (1<<(addr & 0x1f)) ); + } + break; + + case O_IP_DST: + match = is_ipv4 && + (((ipfw_insn_ip *)cmd)->addr.s_addr == + dst_ip.s_addr); + break; + + case O_IP_DST_ME: + if (is_ipv4) { + struct ifnet *tif; + + INADDR_TO_IFP(dst_ip, tif); + match = (tif != NULL); + break; + } +#ifdef INET6 + /* FALLTHROUGH */ + case O_IP6_DST_ME: + match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); +#endif + break; + + + case O_IP_SRCPORT: + case O_IP_DSTPORT: + /* + * offset == 0 && proto != 0 is enough + * to guarantee that we have a + * packet with port info. + */ + if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) + && offset == 0) { + u_int16_t x = + (cmd->opcode == O_IP_SRCPORT) ? 
+ src_port : dst_port ; + u_int16_t *p = + ((ipfw_insn_u16 *)cmd)->ports; + int i; + + for (i = cmdlen - 1; !match && i>0; + i--, p += 2) + match = (x>=p[0] && x<=p[1]); + } + break; + + case O_ICMPTYPE: + match = (offset == 0 && proto==IPPROTO_ICMP && + icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); + break; + +#ifdef INET6 + case O_ICMP6TYPE: + match = is_ipv6 && offset == 0 && + proto==IPPROTO_ICMPV6 && + icmp6type_match( + ICMP6(ulp)->icmp6_type, + (ipfw_insn_u32 *)cmd); + break; +#endif /* INET6 */ + + case O_IPOPT: + match = (is_ipv4 && + ipopts_match(ip, cmd) ); + break; + + case O_IPVER: + match = (is_ipv4 && + cmd->arg1 == ip->ip_v); + break; + + case O_IPID: + case O_IPLEN: + case O_IPTTL: + if (is_ipv4) { /* only for IP packets */ + uint16_t x; + uint16_t *p; + int i; + + if (cmd->opcode == O_IPLEN) + x = iplen; + else if (cmd->opcode == O_IPTTL) + x = ip->ip_ttl; + else /* must be IPID */ + x = ntohs(ip->ip_id); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_IPPRECEDENCE: + match = (is_ipv4 && + (cmd->arg1 == (ip->ip_tos & 0xe0)) ); + break; + + case O_IPTOS: + match = (is_ipv4 && + flags_match(cmd, ip->ip_tos)); + break; + + case O_TCPDATALEN: + if (proto == IPPROTO_TCP && offset == 0) { + struct tcphdr *tcp; + uint16_t x; + uint16_t *p; + int i; + + tcp = TCP(ulp); + x = iplen - + ((ip->ip_hl + tcp->th_off) << 2); + if (cmdlen == 1) { + match = (cmd->arg1 == x); + break; + } + /* otherwise we have ranges */ + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for (; !match && i>0; i--, p += 2) + match = (x >= p[0] && x <= p[1]); + } + break; + + case O_TCPFLAGS: + match = (proto == IPPROTO_TCP && offset == 0 && + flags_match(cmd, TCP(ulp)->th_flags)); + break; + + case O_TCPOPTS: + match = (proto == IPPROTO_TCP && offset == 0 && + tcpopts_match(TCP(ulp), cmd)); + break; + + case O_TCPSEQ: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_seq); + break; + + case O_TCPACK: + match = (proto == IPPROTO_TCP && offset == 0 && + ((ipfw_insn_u32 *)cmd)->d[0] == + TCP(ulp)->th_ack); + break; + + case O_TCPWIN: + match = (proto == IPPROTO_TCP && offset == 0 && + cmd->arg1 == TCP(ulp)->th_win); + break; + + case O_ESTAB: + /* reject packets which have SYN only */ + /* XXX should i also check for TH_ACK ? */ + match = (proto == IPPROTO_TCP && offset == 0 && + (TCP(ulp)->th_flags & + (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); + break; + + case O_ALTQ: { + struct pf_mtag *at; + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + match = 1; + at = pf_find_mtag(m); + if (at != NULL && at->qid != 0) + break; + at = pf_get_mtag(m); + if (at == NULL) { + /* + * Let the packet fall back to the + * default ALTQ. + */ + break; + } + at->qid = altq->qid; + if (is_ipv4) + at->af = AF_INET; + else + at->af = AF_LINK; + at->hdr = ip; + break; + } + + case O_LOG: + ipfw_log(f, hlen, args, m, + oif, offset, tablearg, ip); + match = 1; + break; + + case O_PROB: + match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); + break; + + case O_VERREVPATH: + /* Outgoing packets automatically pass/match */ + match = ((oif != NULL) || + (m->m_pkthdr.rcvif == NULL) || + ( +#ifdef INET6 + is_ipv6 ? 
+ verify_path6(&(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, m->m_pkthdr.rcvif, + args->f_id.fib))); + break; + + case O_VERSRCREACH: + /* Outgoing packets automatically pass/match */ + match = (hlen > 0 && ((oif != NULL) || +#ifdef INET6 + is_ipv6 ? + verify_path6(&(args->f_id.src_ip6), + NULL) : +#endif + verify_path(src_ip, NULL, args->f_id.fib))); + break; + + case O_ANTISPOOF: + /* Outgoing packets automatically pass/match */ + if (oif == NULL && hlen > 0 && + ( (is_ipv4 && in_localaddr(src_ip)) +#ifdef INET6 + || (is_ipv6 && + in6_localaddr(&(args->f_id.src_ip6))) +#endif + )) + match = +#ifdef INET6 + is_ipv6 ? verify_path6( + &(args->f_id.src_ip6), + m->m_pkthdr.rcvif) : +#endif + verify_path(src_ip, + m->m_pkthdr.rcvif, + args->f_id.fib); + else + match = 1; + break; + + case O_IPSEC: +#ifdef IPSEC + match = (m_tag_find(m, + PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); +#endif + /* otherwise no match */ + break; + +#ifdef INET6 + case O_IP6_SRC: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + + case O_IP6_DST: + match = is_ipv6 && + IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, + &((ipfw_insn_ip6 *)cmd)->addr6); + break; + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if (is_ipv6) { + int i = cmdlen - 1; + struct in6_addr p; + struct in6_addr *d = + &((ipfw_insn_ip6 *)cmd)->addr6; + + for (; !match && i > 0; d += 2, + i -= F_INSN_SIZE(struct in6_addr) + * 2) { + p = (cmd->opcode == + O_IP6_SRC_MASK) ? + args->f_id.src_ip6: + args->f_id.dst_ip6; + APPLY_MASK(&p, &d[1]); + match = + IN6_ARE_ADDR_EQUAL(&d[0], + &p); + } + } + break; + + case O_FLOW6ID: + match = is_ipv6 && + flow6id_match(args->f_id.flow_id6, + (ipfw_insn_u32 *) cmd); + break; + + case O_EXT_HDR: + match = is_ipv6 && + (ext_hd & ((ipfw_insn *) cmd)->arg1); + break; + + case O_IP6: + match = is_ipv6; + break; +#endif + + case O_IP4: + match = is_ipv4; + break; + + case O_TAG: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + /* Packet is already tagged with this tag? */ + mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); + + /* We have `untag' action when F_NOT flag is + * present. And we must remove this mtag from + * mbuf and reset `match' to zero (`match' will + * be inversed later). + * Otherwise we should allocate new mtag and + * push it into mbuf. + */ + if (cmd->len & F_NOT) { /* `untag' action */ + if (mtag != NULL) + m_tag_delete(m, mtag); + match = 0; + } else if (mtag == NULL) { + if ((mtag = m_tag_alloc(MTAG_IPFW, + tag, 0, M_NOWAIT)) != NULL) + m_tag_prepend(m, mtag); + match = 1; + } + break; + } + + case O_FIB: /* try match the specified fib */ + if (args->f_id.fib == cmd->arg1) + match = 1; + break; + + case O_TAGGED: { + struct m_tag *mtag; + uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + + if (cmdlen == 1) { + match = m_tag_locate(m, MTAG_IPFW, + tag, NULL) != NULL; + break; + } + + /* we have ranges */ + for (mtag = m_tag_first(m); + mtag != NULL && !match; + mtag = m_tag_next(m, mtag)) { + uint16_t *p; + int i; + + if (mtag->m_tag_cookie != MTAG_IPFW) + continue; + + p = ((ipfw_insn_u16 *)cmd)->ports; + i = cmdlen - 1; + for(; !match && i > 0; i--, p += 2) + match = + mtag->m_tag_id >= p[0] && + mtag->m_tag_id <= p[1]; + } + break; + } + + /* + * The second set of opcodes represents 'actions', + * i.e. the terminal part of a rule once the packet + * matches all previous patterns. 
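+ * (Illustrative layout, reconstructed for clarity: a rule such
+ * as "allow tcp from any to any 80" is a single struct ip_fw
+ * whose cmd[] array holds the match microinstructions
+ * (O_PROTO, O_IP_DSTPORT, ...) followed, at act_ofs, by the
+ * one action opcode, here O_ACCEPT.)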
+ * Typically there is only one action for each rule, + * and the opcode is stored at the end of the rule + * (but there are exceptions -- see below). + * + * In general, here we set retval and terminate the + * outer loop (would be a 'break 3' in some language, + * but we need to set l=0, done=1) + * + * Exceptions: + * O_COUNT and O_SKIPTO actions: + * instead of terminating, we jump to the next rule + * (setting l=0), or to the SKIPTO target (setting + * f/f_len, cmd and l as needed), respectively. + * + * O_TAG, O_LOG and O_ALTQ action parameters: + * perform some action and set match = 1; + * + * O_LIMIT and O_KEEP_STATE: these opcodes are + * not real 'actions', and are stored right + * before the 'action' part of the rule. + * These opcodes try to install an entry in the + * state tables; if successful, we continue with + * the next opcode (match=1; break;), otherwise + * the packet must be dropped (set retval, + * break loops with l=0, done=1) + * + * O_PROBE_STATE and O_CHECK_STATE: these opcodes + * cause a lookup of the state table, and a jump + * to the 'action' part of the parent rule + * if an entry is found, or + * (CHECK_STATE only) a jump to the next rule if + * the entry is not found. + * The result of the lookup is cached so that + * further instances of these opcodes become NOPs. + * The jump to the next rule is done by setting + * l=0, cmdlen=0. + */ + case O_LIMIT: + case O_KEEP_STATE: + if (ipfw_install_state(f, + (ipfw_insn_limit *)cmd, args, tablearg)) { + /* error or limit violation */ + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + } + match = 1; + break; + + case O_PROBE_STATE: + case O_CHECK_STATE: + /* + * dynamic rules are checked at the first + * keep-state or check-state occurrence, + * with the result being stored in dyn_dir. + * The compiler introduces a PROBE_STATE + * instruction for us when we have a + * KEEP_STATE (because PROBE_STATE needs + * to be run first). + */ + if (dyn_dir == MATCH_UNKNOWN && + (q = ipfw_lookup_dyn_rule(&args->f_id, + &dyn_dir, proto == IPPROTO_TCP ? + TCP(ulp) : NULL)) + != NULL) { + /* + * Found dynamic entry, update stats + * and jump to the 'action' part of + * the parent rule by setting + * f, cmd, l and clearing cmdlen. + */ + q->pcnt++; + q->bcnt += pktlen; + /* XXX we would like to have f_pos + * readily accessible in the dynamic + * rule, instead of having to + * lookup q->rule. + */ + f = q->rule; + f_pos = ipfw_find_rule(chain, + f->rulenum, f->id); + cmd = ACTION_PTR(f); + l = f->cmd_len - f->act_ofs; + ipfw_dyn_unlock(); + cmdlen = 0; + match = 1; + break; + } + /* + * Dynamic entry not found. If CHECK_STATE, + * skip to next rule, if PROBE_STATE just + * ignore and continue with next opcode. + */ + if (cmd->opcode == O_CHECK_STATE) + l = 0; /* exit inner loop */ + match = 1; + break; + + case O_ACCEPT: + retval = 0; /* accept */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_PIPE: + case O_QUEUE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (cmd->opcode == O_PIPE) + args->rule.info |= IPFW_IS_PIPE; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = IP_FW_DUMMYNET; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_DIVERT: + case O_TEE: + if (args->eh) /* not on layer 2 */ + break; + /* otherwise this is terminal */ + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + retval = (cmd->opcode == O_DIVERT) ? 
+ IP_FW_DIVERT : IP_FW_TEE; + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + break; + + case O_COUNT: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + l = 0; /* exit inner loop */ + break; + + case O_SKIPTO: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + /* If possible use cached f_pos (in f->next_rule), + * whose version is written in f->next_rule + * (horrible hacks to avoid changing the ABI). + */ + if (cmd->arg1 != IP_FW_TABLEARG && + (uintptr_t)f->x_next == chain->id) { + f_pos = (uintptr_t)f->next_rule; + } else { + int i = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + /* make sure we do not jump backward */ + if (i <= f->rulenum) + i = f->rulenum + 1; + f_pos = ipfw_find_rule(chain, i, 0); + /* update the cache */ + if (cmd->arg1 != IP_FW_TABLEARG) { + f->next_rule = + (void *)(uintptr_t)f_pos; + f->x_next = + (void *)(uintptr_t)chain->id; + } + } + /* + * Skip disabled rules, and re-enter + * the inner loop with the correct + * f_pos, f, l and cmd. + * Also clear cmdlen and skip_or + */ + for (; f_pos < chain->n_rules - 1 && + (V_set_disable & + (1 << chain->map[f_pos]->set)); + f_pos++) + ; + /* Re-enter the inner loop at the skipto rule. */ + f = chain->map[f_pos]; + l = f->cmd_len; + cmd = f->cmd; + match = 1; + cmdlen = 0; + skip_or = 0; + continue; + break; /* not reached */ + + case O_REJECT: + /* + * Drop the packet and send a reject notice + * if the packet is not ICMP (or is an ICMP + * query), and it is not multicast/broadcast. + */ + if (hlen > 0 && is_ipv4 && offset == 0 && + (proto != IPPROTO_ICMP || + is_icmp_query(ICMP(ulp))) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN_MULTICAST(ntohl(dst_ip.s_addr))) { + send_reject(args, cmd->arg1, iplen, ip); + m = args->m; + } + /* FALLTHROUGH */ +#ifdef INET6 + case O_UNREACH6: + if (hlen > 0 && is_ipv6 && + ((offset & IP6F_OFF_MASK) == 0) && + (proto != IPPROTO_ICMPV6 || + (is_icmp6_query(icmp6_type) == 1)) && + !(m->m_flags & (M_BCAST|M_MCAST)) && + !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { + send_reject6( + args, cmd->arg1, hlen, + (struct ip6_hdr *)ip); + m = args->m; + } + /* FALLTHROUGH */ +#endif + case O_DENY: + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_FORWARD_IP: + if (args->eh) /* not valid on layer2 pkts */ + break; + if (!q || dyn_dir == MATCH_FORWARD) { + struct sockaddr_in *sa; + sa = &(((ipfw_insn_sa *)cmd)->sa); + if (sa->sin_addr.s_addr == INADDR_ANY) { + bcopy(sa, &args->hopstore, + sizeof(*sa)); + args->hopstore.sin_addr.s_addr = + htonl(tablearg); + args->next_hop = &args->hopstore; + } else { + args->next_hop = sa; + } + } + retval = IP_FW_PASS; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_NETGRAPH: + case O_NGTEE: + set_match(args, f_pos, chain); + args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + if (V_fw_one_pass) + args->rule.info |= IPFW_ONEPASS; + retval = (cmd->opcode == O_NETGRAPH) ? 
+ IP_FW_NETGRAPH : IP_FW_NGTEE; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_SETFIB: + f->pcnt++; /* update stats */ + f->bcnt += pktlen; + f->timestamp = time_uptime; + M_SETFIB(m, cmd->arg1); + args->f_id.fib = cmd->arg1; + l = 0; /* exit inner loop */ + break; + + case O_NAT: + if (!IPFW_NAT_LOADED) { + retval = IP_FW_DENY; + } else { + struct cfg_nat *t; + int nat_id; + + set_match(args, f_pos, chain); + t = ((ipfw_insn_nat *)cmd)->nat; + if (t == NULL) { + nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? + tablearg : cmd->arg1; + t = (*lookup_nat_ptr)(&chain->nat, nat_id); + + if (t == NULL) { + retval = IP_FW_DENY; + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + } + if (cmd->arg1 != IP_FW_TABLEARG) + ((ipfw_insn_nat *)cmd)->nat = t; + } + retval = ipfw_nat_ptr(args, t, m); + } + l = 0; /* exit inner loop */ + done = 1; /* exit outer loop */ + break; + + case O_REASS: { + int ip_off; + + f->pcnt++; + f->bcnt += pktlen; + l = 0; /* in any case exit inner loop */ + ip_off = ntohs(ip->ip_off); + + /* if not fragmented, go to next rule */ + if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) + break; + /* + * ip_reass() expects len & off in host + * byte order. + */ + SET_HOST_IPLEN(ip); + + args->m = m = ip_reass(m); + + /* + * do IP header checksum fixup. + */ + if (m == NULL) { /* fragment got swallowed */ + retval = IP_FW_DENY; + } else { /* good, packet complete */ + int hlen; + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(m, hlen); + retval = IP_FW_REASS; + set_match(args, f_pos, chain); + } + done = 1; /* exit outer loop */ + break; + } + + default: + panic("-- unknown opcode %d\n", cmd->opcode); + } /* end of switch() on opcodes */ + /* + * if we get here with l=0, then match is irrelevant. + */ + + if (cmd->len & F_NOT) + match = !match; + + if (match) { + if (cmd->len & F_OR) + skip_or = 1; + } else { + if (!(cmd->len & F_OR)) /* not an OR block, */ + break; /* try next rule */ + } + + } /* end of inner loop, scan opcodes */ + + if (done) + break; + +/* next_rule:; */ /* try next rule */ + + } /* end of outer for, scan rules */ + + if (done) { + struct ip_fw *rule = chain->map[f_pos]; + /* Update statistics */ + rule->pcnt++; + rule->bcnt += pktlen; + rule->timestamp = time_uptime; + } else { + retval = IP_FW_DENY; + printf("ipfw: ouch!, skip past end of rules, denying packet\n"); + } + IPFW_RUNLOCK(chain); +#ifdef __FreeBSD__ + if (ucred_cache != NULL) + crfree(ucred_cache); +#endif + return (retval); + +pullup_failed: + if (V_fw_verbose) + printf("ipfw: pullup failed\n"); + return (IP_FW_DENY); +} + +/* + * Module and VNET glue + */ + +/* + * Stuff that must be initialised only on boot or module load + */ +static int +ipfw_init(void) +{ + int error = 0; + + ipfw_dyn_attach(); + /* + * Only print out this stuff the first time around, + * when called from the sysinit code. + */ + printf("ipfw2 " +#ifdef INET6 + "(+ipv6) " +#endif + "initialized, divert %s, nat %s, " + "rule-based forwarding " +#ifdef IPFIREWALL_FORWARD + "enabled, " +#else + "disabled, " +#endif + "default to %s, logging ", +#ifdef IPDIVERT + "enabled", +#else + "loadable", +#endif +#ifdef IPFIREWALL_NAT + "enabled", +#else + "loadable", +#endif + default_to_accept ? 
"accept" : "deny"); + + /* + * Note: V_xxx variables can be accessed here but the vnet specific + * initializer may not have been called yet for the VIMAGE case. + * Tuneables will have been processed. We will print out values for + * the default vnet. + * XXX This should all be rationalized AFTER 8.0 + */ + if (V_fw_verbose == 0) + printf("disabled\n"); + else if (V_verbose_limit == 0) + printf("unlimited\n"); + else + printf("limited to %d packets/entry by default\n", + V_verbose_limit); + + ipfw_log_bpf(1); /* init */ + return (error); +} + +/* + * Called for the removal of the last instance only on module unload. + */ +static void +ipfw_destroy(void) +{ + + ipfw_log_bpf(0); /* uninit */ + ipfw_dyn_detach(); + printf("IP firewall unloaded\n"); +} + +/* + * Stuff that must be initialized for every instance + * (including the first of course). + */ +static int +vnet_ipfw_init(const void *unused) +{ + int error; + struct ip_fw *rule = NULL; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + /* First set up some values that are compile time options */ + V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ + V_fw_deny_unknown_exthdrs = 1; +#ifdef IPFIREWALL_VERBOSE + V_fw_verbose = 1; +#endif +#ifdef IPFIREWALL_VERBOSE_LIMIT + V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; +#endif +#ifdef IPFIREWALL_NAT + LIST_INIT(&chain->nat); +#endif + + /* insert the default rule and create the initial map */ + chain->n_rules = 1; + chain->static_len = sizeof(struct ip_fw); + chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_NOWAIT | M_ZERO); + if (chain->map) + rule = malloc(chain->static_len, M_IPFW, M_NOWAIT | M_ZERO); + if (rule == NULL) { + if (chain->map) + free(chain->map, M_IPFW); + printf("ipfw2: ENOSPC initializing default rule " + "(support disabled)\n"); + return (ENOSPC); + } + error = ipfw_init_tables(chain); + if (error) { + panic("init_tables"); /* XXX Marko fix this ! */ + } + + /* fill and insert the default rule */ + rule->act_ofs = 0; + rule->rulenum = IPFW_DEFAULT_RULE; + rule->cmd_len = 1; + rule->set = RESVD_SET; + rule->cmd[0].len = 1; + rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; + chain->rules = chain->default_rule = chain->map[0] = rule; + chain->id = rule->id = 1; + + IPFW_LOCK_INIT(chain); + ipfw_dyn_init(); + + /* First set up some values that are compile time options */ + V_ipfw_vnet_ready = 1; /* Open for business */ + + /* + * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) + * and pfil hooks for ipv4 and ipv6. Even if the latter two fail + * we still keep the module alive because the sockopt and + * layer2 paths are still useful. + * ipfw[6]_hook return 0 on success, ENOENT on failure, + * so we can ignore the exact return value and just set a flag. + * + * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so + * changes in the underlying (per-vnet) variables trigger + * immediate hook()/unhook() calls. + * In layer2 we have the same behaviour, except that V_ether_ipfw + * is checked on each packet because there are no pfil hooks. + */ + V_ip_fw_ctl_ptr = ipfw_ctl; + V_ip_fw_chk_ptr = ipfw_chk; + error = ipfw_attach_hooks(1); + return (error); +} + +/* + * Called for the removal of each instance. + */ +static int +vnet_ipfw_uninit(const void *unused) +{ + struct ip_fw *reap, *rule; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; + + V_ipfw_vnet_ready = 0; /* tell new callers to go away */ + /* + * disconnect from ipv4, ipv6, layer2 and sockopt. 
+ * Then grab, release and grab again the WLOCK so we make + * sure the update is propagated and nobody will be in. + */ + (void)ipfw_attach_hooks(0 /* detach */); + V_ip_fw_chk_ptr = NULL; + V_ip_fw_ctl_ptr = NULL; + IPFW_UH_WLOCK(chain); + IPFW_UH_WUNLOCK(chain); + IPFW_UH_WLOCK(chain); + + IPFW_WLOCK(chain); + IPFW_WUNLOCK(chain); + IPFW_WLOCK(chain); + + ipfw_dyn_uninit(0); /* run the callout_drain */ + ipfw_destroy_tables(chain); + reap = NULL; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + rule->x_next = reap; + reap = rule; + } + if (chain->map) + free(chain->map, M_IPFW); + IPFW_WUNLOCK(chain); + IPFW_UH_WUNLOCK(chain); + if (reap != NULL) + ipfw_reap_rules(reap); + IPFW_LOCK_DESTROY(chain); + ipfw_dyn_uninit(1); /* free the remaining parts */ + return 0; +} + +/* + * Module event handler. + * In general we have the choice of handling most of these events by the + * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to + * use the SYSINIT handlers as they are more capable of expressing the + * flow of control during module and vnet operations, so this is just + * a skeleton. Note there is no SYSINIT equivalent of the module + * SHUTDOWN handler, but we don't have anything to do in that case anyhow. + */ +static int +ipfw_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + /* Called once at module load or + * system boot if compiled in. */ + break; + case MOD_QUIESCE: + /* Called before unload. May veto unloading. */ + break; + case MOD_UNLOAD: + /* Called during unload. */ + break; + case MOD_SHUTDOWN: + /* Called during system shutdown. */ + break; + default: + err = EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfwmod = { + "ipfw", + ipfw_modevent, + 0 +}; + +/* Define startup order. */ +#define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN +#define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ +#define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ +#define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ + +DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); +MODULE_VERSION(ipfw, 2); +/* should declare some dependencies here */ + +/* + * Starting up. Done in order after ipfwmod() has been called. + * VNET_SYSINIT is also called for each existing vnet and each new vnet. + */ +SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_init, NULL); +VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_init, NULL); + +/* + * Closing up shop. These are done in REVERSE ORDER, but still + * after ipfwmod() has been called. Not called on reboot. + * VNET_SYSUNINIT is also called for each exiting vnet as it exits. + * or when the module is unloaded. + */ +SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, + ipfw_destroy, NULL); +VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, + vnet_ipfw_uninit, NULL); +/* end of file */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw_log.c b/freebsd/sys/netinet/ipfw/ip_fw_log.c new file mode 100644 index 00000000..0a5cd94c --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw_log.c @@ -0,0 +1,451 @@ +#include + +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Logging support for ipfw + */ + +#if !defined(KLD_MODULE) +#include +#include +#include +#include +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include /* for ETHERTYPE_IP */ +#include +#include +#include /* for IFT_ETHER */ +#include /* for BPF */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#ifdef INET6 +#include /* ip6_sprintf() */ +#endif + +#ifdef MAC +#include +#endif + +/* + * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T + * Other macros just cast void * into the appropriate type + */ +#define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) +#define TCP(p) ((struct tcphdr *)(p)) +#define SCTP(p) ((struct sctphdr *)(p)) +#define UDP(p) ((struct udphdr *)(p)) +#define ICMP(p) ((struct icmphdr *)(p)) +#define ICMP6(p) ((struct icmp6_hdr *)(p)) + +#define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 +#define SNP(buf) buf, sizeof(buf) + +#ifdef WITHOUT_BPF +void +ipfw_log_bpf(int onoff) +{ +} +#else /* !WITHOUT_BPF */ +static struct ifnet *log_if; /* hook to attach to bpf */ + +/* we use this dummy function for all ifnet callbacks */ +static int +log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) +{ + return EINVAL; +} + +static int +ipfw_log_output(struct ifnet *ifp, struct mbuf *m, + struct sockaddr *dst, struct route *ro) +{ + if (m != NULL) + m_freem(m); + return EINVAL; +} + +static void +ipfw_log_start(struct ifnet* ifp) +{ + panic("ipfw_log_start() must not be called"); +} + +static const u_char ipfwbroadcastaddr[6] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +void +ipfw_log_bpf(int onoff) +{ + struct ifnet *ifp; + + if (onoff) { + if (log_if) + return; + ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) + return; + if_initname(ifp, "ipfw", 0); + ifp->if_mtu = 65536; + ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = (void *)log_dummy; + ifp->if_ioctl = log_dummy; + ifp->if_start = ipfw_log_start; + ifp->if_output = ipfw_log_output; + ifp->if_addrlen = 6; + ifp->if_hdrlen = 14; + if_attach(ifp); + ifp->if_broadcastaddr = ipfwbroadcastaddr; + ifp->if_baudrate = IF_Mbps(10); + bpfattach(ifp, DLT_EN10MB, 14); + log_if = ifp; + } else { + if (log_if) { + ether_ifdetach(log_if); + if_free(log_if); + } + log_if = NULL; + } +} +#endif /* !WITHOUT_BPF */ + +/* + * We enter here when we have a rule with O_LOG. + * XXX this function alone takes about 2Kbytes of code! + */ +void +ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip) +{ + char *action; + int limit_reached = 0; + char action2[40], proto[128], fragment[32]; + + if (V_fw_verbose == 0) { +#ifndef WITHOUT_BPF + + if (log_if == NULL || log_if->if_bpf == NULL) + return; + + if (args->eh) /* layer2, use orig hdr */ + BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); + else + /* Add fake header. Later we will store + * more info in the header. 
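+ * The fake header is a 14-byte mock Ethernet frame:
+ * "DDDDDDSSSSSS" stands in for the destination and source
+ * MAC addresses and "\x08\x00" is ETHERTYPE_IP, so BPF
+ * listeners see a well-formed DLT_EN10MB capture.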
+ */ + BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); +#endif /* !WITHOUT_BPF */ + return; + } + /* the old 'log' function */ + fragment[0] = '\0'; + proto[0] = '\0'; + + if (f == NULL) { /* bogus pkt */ + if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) + return; + V_norule_counter++; + if (V_norule_counter == V_verbose_limit) + limit_reached = V_verbose_limit; + action = "Refuse"; + } else { /* O_LOG is the first action, find the real one */ + ipfw_insn *cmd = ACTION_PTR(f); + ipfw_insn_log *l = (ipfw_insn_log *)cmd; + + if (l->max_log != 0 && l->log_left == 0) + return; + l->log_left--; + if (l->log_left == 0) + limit_reached = l->max_log; + cmd += F_LEN(cmd); /* point to first action */ + if (cmd->opcode == O_ALTQ) { + ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; + + snprintf(SNPARGS(action2, 0), "Altq %d", + altq->qid); + cmd += F_LEN(cmd); + } + if (cmd->opcode == O_PROB) + cmd += F_LEN(cmd); + + if (cmd->opcode == O_TAG) + cmd += F_LEN(cmd); + + action = action2; + switch (cmd->opcode) { + case O_DENY: + action = "Deny"; + break; + + case O_REJECT: + if (cmd->arg1==ICMP_REJECT_RST) + action = "Reset"; + else if (cmd->arg1==ICMP_UNREACH_HOST) + action = "Reject"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_UNREACH6: + if (cmd->arg1==ICMP6_UNREACH_RST) + action = "Reset"; + else + snprintf(SNPARGS(action2, 0), "Unreach %d", + cmd->arg1); + break; + + case O_ACCEPT: + action = "Accept"; + break; + case O_COUNT: + action = "Count"; + break; + case O_DIVERT: + snprintf(SNPARGS(action2, 0), "Divert %d", + cmd->arg1); + break; + case O_TEE: + snprintf(SNPARGS(action2, 0), "Tee %d", + cmd->arg1); + break; + case O_SETFIB: + snprintf(SNPARGS(action2, 0), "SetFib %d", + cmd->arg1); + break; + case O_SKIPTO: + snprintf(SNPARGS(action2, 0), "SkipTo %d", + cmd->arg1); + break; + case O_PIPE: + snprintf(SNPARGS(action2, 0), "Pipe %d", + cmd->arg1); + break; + case O_QUEUE: + snprintf(SNPARGS(action2, 0), "Queue %d", + cmd->arg1); + break; + case O_FORWARD_IP: { + ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; + int len; + struct in_addr dummyaddr; + if (sa->sa.sin_addr.s_addr == INADDR_ANY) + dummyaddr.s_addr = htonl(tablearg); + else + dummyaddr.s_addr = sa->sa.sin_addr.s_addr; + + len = snprintf(SNPARGS(action2, 0), "Forward to %s", + inet_ntoa(dummyaddr)); + + if (sa->sa.sin_port) + snprintf(SNPARGS(action2, len), ":%d", + sa->sa.sin_port); + } + break; + case O_NETGRAPH: + snprintf(SNPARGS(action2, 0), "Netgraph %d", + cmd->arg1); + break; + case O_NGTEE: + snprintf(SNPARGS(action2, 0), "Ngtee %d", + cmd->arg1); + break; + case O_NAT: + action = "Nat"; + break; + case O_REASS: + action = "Reass"; + break; + default: + action = "UNKNOWN"; + break; + } + } + + if (hlen == 0) { /* non-ip */ + snprintf(SNPARGS(proto, 0), "MAC"); + + } else { + int len; +#ifdef INET6 + char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; +#else + char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; +#endif + struct icmphdr *icmp; + struct tcphdr *tcp; + struct udphdr *udp; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + struct icmp6_hdr *icmp6; +#endif + src[0] = '\0'; + dst[0] = '\0'; +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + char ip6buf[INET6_ADDRSTRLEN]; + snprintf(src, sizeof(src), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.src_ip6)); + snprintf(dst, sizeof(dst), "[%s]", + ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); + + ip6 = (struct ip6_hdr *)ip; + tcp = (struct tcphdr *)(((char *)ip) + hlen); + udp = (struct udphdr *)(((char *)ip) 
+ hlen); + } else +#endif + { + tcp = L3HDR(struct tcphdr, ip); + udp = L3HDR(struct udphdr, ip); + + inet_ntoa_r(ip->ip_src, src); + inet_ntoa_r(ip->ip_dst, dst); + } + + switch (args->f_id.proto) { + case IPPROTO_TCP: + len = snprintf(SNPARGS(proto, 0), "TCP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(tcp->th_sport), + dst, + ntohs(tcp->th_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_UDP: + len = snprintf(SNPARGS(proto, 0), "UDP %s", src); + if (offset == 0) + snprintf(SNPARGS(proto, len), ":%d %s:%d", + ntohs(udp->uh_sport), + dst, + ntohs(udp->uh_dport)); + else + snprintf(SNPARGS(proto, len), " %s", dst); + break; + + case IPPROTO_ICMP: + icmp = L3HDR(struct icmphdr, ip); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMP:%u.%u ", + icmp->icmp_type, icmp->icmp_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMP "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#ifdef INET6 + case IPPROTO_ICMPV6: + icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); + if (offset == 0) + len = snprintf(SNPARGS(proto, 0), + "ICMPv6:%u.%u ", + icmp6->icmp6_type, icmp6->icmp6_code); + else + len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); + len += snprintf(SNPARGS(proto, len), "%s", src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; +#endif + default: + len = snprintf(SNPARGS(proto, 0), "P:%d %s", + args->f_id.proto, src); + snprintf(SNPARGS(proto, len), " %s", dst); + break; + } + +#ifdef INET6 + if (IS_IP6_FLOW_ID(&(args->f_id))) { + if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) + snprintf(SNPARGS(fragment, 0), + " (frag %08x:%d@%d%s)", + args->f_id.extra, + ntohs(ip6->ip6_plen) - hlen, + ntohs(offset & IP6F_OFF_MASK) << 3, + (offset & IP6F_MORE_FRAG) ? "+" : ""); + } else +#endif + { + int ipoff, iplen; + ipoff = ntohs(ip->ip_off); + iplen = ntohs(ip->ip_len); + if (ipoff & (IP_MF | IP_OFFMASK)) + snprintf(SNPARGS(fragment, 0), + " (frag %d:%d@%d%s)", + ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), + offset << 3, + (ipoff & IP_MF) ? "+" : ""); + } + } +#ifdef __FreeBSD__ + if (oif || m->m_pkthdr.rcvif) + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s %s via %s%s\n", + f ? f->rulenum : -1, + action, proto, oif ? "out" : "in", + oif ? oif->if_xname : m->m_pkthdr.rcvif->if_xname, + fragment); + else +#endif + log(LOG_SECURITY | LOG_INFO, + "ipfw: %d %s %s [no if info]%s\n", + f ? f->rulenum : -1, + action, proto, fragment); + if (limit_reached) + log(LOG_SECURITY | LOG_NOTICE, + "ipfw: limit %d reached on entry %d\n", + limit_reached, f ? f->rulenum : -1); +} +/* end of file */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw_nat.c b/freebsd/sys/netinet/ipfw/ip_fw_nat.c new file mode 100644 index 00000000..e6c8bcec --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw_nat.c @@ -0,0 +1,606 @@ +#include + +/*- + * Copyright (c) 2008 Paolo Pisati + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPFW_INTERNAL /* Access to protected data structures in ip_fw.h. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* XXX for in_cksum */ + +static VNET_DEFINE(eventhandler_tag, ifaddr_event_tag); +#define V_ifaddr_event_tag VNET(ifaddr_event_tag) + +static void +ifaddr_change(void *arg __unused, struct ifnet *ifp) +{ + struct cfg_nat *ptr; + struct ifaddr *ifa; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + /* Check every nat entry... */ + LIST_FOREACH(ptr, &chain->nat, _next) { + /* ...using nic 'ifp->if_xname' as dynamic alias address. */ + if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) + continue; + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr == NULL) + continue; + if (ifa->ifa_addr->sa_family != AF_INET) + continue; + ptr->ip = ((struct sockaddr_in *) + (ifa->ifa_addr))->sin_addr; + LibAliasSetAddress(ptr->lib, ptr->ip); + } + if_addr_runlock(ifp); + } + IPFW_WUNLOCK(chain); +} + +/* + * delete the pointers for nat entry ix, or all of them if ix < 0 + */ +static void +flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) +{ + int i; + ipfw_insn_nat *cmd; + + IPFW_WLOCK_ASSERT(chain); + for (i = 0; i < chain->n_rules; i++) { + cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); + /* XXX skip log and the like ? */ + if (cmd->o.opcode == O_NAT && cmd->nat != NULL && + (ix < 0 || cmd->nat->id == ix)) + cmd->nat = NULL; + } +} + +static void +del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) +{ + struct cfg_redir *r, *tmp_r; + struct cfg_spool *s, *tmp_s; + int i, num; + + LIST_FOREACH_SAFE(r, head, _next, tmp_r) { + num = 1; /* Number of alias_link to delete. */ + switch (r->mode) { + case REDIR_PORT: + num = r->pport_cnt; + /* FALLTHROUGH */ + case REDIR_ADDR: + case REDIR_PROTO: + /* Delete all libalias redirect entry. */ + for (i = 0; i < num; i++) + LibAliasRedirectDelete(n->lib, r->alink[i]); + /* Del spool cfg if any. */ + LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { + LIST_REMOVE(s, _next); + free(s, M_IPFW); + } + free(r->alink, M_IPFW); + LIST_REMOVE(r, _next); + free(r, M_IPFW); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + /* XXX - panic?!?!? 
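+ * (probably not: r->mode comes straight from a userland
+ * sockopt, so logging and skipping the entry seems safer
+ * than taking the kernel down)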
*/ + break; + } + } +} + +static int +add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) +{ + struct cfg_redir *r, *ser_r; + struct cfg_spool *s, *ser_s; + int cnt, off, i; + + for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { + ser_r = (struct cfg_redir *)&buf[off]; + r = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(r, ser_r, SOF_REDIR); + LIST_INIT(&r->spool_chain); + off += SOF_REDIR; + r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, + M_IPFW, M_WAITOK | M_ZERO); + switch (r->mode) { + case REDIR_ADDR: + r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, + r->paddr); + break; + case REDIR_PORT: + for (i = 0 ; i < r->pport_cnt; i++) { + /* If remotePort is all ports, set it to 0. */ + u_short remotePortCopy = r->rport + i; + if (r->rport_cnt == 1 && r->rport == 0) + remotePortCopy = 0; + r->alink[i] = LibAliasRedirectPort(ptr->lib, + r->laddr, htons(r->lport + i), r->raddr, + htons(remotePortCopy), r->paddr, + htons(r->pport + i), r->proto); + if (r->alink[i] == NULL) { + r->alink[0] = NULL; + break; + } + } + break; + case REDIR_PROTO: + r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, + r->raddr, r->paddr, r->proto); + break; + default: + printf("unknown redirect mode: %u\n", r->mode); + break; + } + /* XXX perhaps return an error instead of panic ? */ + if (r->alink[0] == NULL) + panic("LibAliasRedirect* returned NULL"); + /* LSNAT handling. */ + for (i = 0; i < r->spool_cnt; i++) { + ser_s = (struct cfg_spool *)&buf[off]; + s = malloc(SOF_REDIR, M_IPFW, M_WAITOK | M_ZERO); + memcpy(s, ser_s, SOF_SPOOL); + LibAliasAddServer(ptr->lib, r->alink[0], + s->addr, htons(s->port)); + off += SOF_SPOOL; + /* Hook spool entry. */ + LIST_INSERT_HEAD(&r->spool_chain, s, _next); + } + /* And finally hook this redir entry. */ + LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); + } + return (1); +} + +static int +ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) +{ + struct mbuf *mcl; + struct ip *ip; + /* XXX - libalias duct tape */ + int ldt, retval; + char *c; + + ldt = 0; + retval = 0; + mcl = m_megapullup(m, m->m_pkthdr.len); + if (mcl == NULL) { + args->m = NULL; + return (IP_FW_DENY); + } + ip = mtod(mcl, struct ip *); + + /* + * XXX - Libalias checksum offload 'duct tape': + * + * locally generated packets have only pseudo-header checksum + * calculated and libalias will break it[1], so mark them for + * later fix. Moreover there are cases when libalias modifies + * tcp packet data[2], mark them for later fix too. + * + * [1] libalias was never meant to run in kernel, so it does + * not have any knowledge about checksum offloading, and + * expects a packet with a full internet checksum. + * Unfortunately, packets generated locally will have just the + * pseudo header calculated, and when libalias tries to adjust + * the checksum it will actually compute a wrong value. + * + * [2] when libalias modifies tcp's data content, full TCP + * checksum has to be recomputed: the problem is that + * libalias does not have any idea about checksum offloading. + * To work around this, we do not do checksumming in LibAlias, + * but only mark the packets in th_x2 field. If we receive a + * marked packet, we calculate correct checksum for it + * aware of offloading. Why such a terrible hack instead of + * recalculating checksum for each packet? + * Because the previous checksum was not checked! + * Recalculating checksums for EVERY packet will hide ALL + * transmission errors. Yes, marked packets still suffer from + * this problem. 
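+ * (that is, a damaged payload receives a freshly computed,
+ * now-valid checksum and the corruption goes undetected).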
But, sigh, natd(8) has this problem, too. + * + * TODO: -make libalias mbuf aware (so + * it can handle delayed checksum and tso) + */ + + if (mcl->m_pkthdr.rcvif == NULL && + mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + ldt = 1; + + c = mtod(mcl, char *); + if (args->oif == NULL) + retval = LibAliasIn(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + else + retval = LibAliasOut(t->lib, c, + mcl->m_len + M_TRAILINGSPACE(mcl)); + if (retval == PKT_ALIAS_RESPOND) { + m->m_flags |= M_SKIP_FIREWALL; + retval = PKT_ALIAS_OK; + } + if (retval != PKT_ALIAS_OK && + retval != PKT_ALIAS_FOUND_HEADER_FRAGMENT) { + /* XXX - should i add some logging? */ + m_free(mcl); + args->m = NULL; + return (IP_FW_DENY); + } + mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); + + /* + * XXX - libalias checksum offload + * 'duct tape' (see above) + */ + + if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && + ip->ip_p == IPPROTO_TCP) { + struct tcphdr *th; + + th = (struct tcphdr *)(ip + 1); + if (th->th_x2) + ldt = 1; + } + + if (ldt) { + struct tcphdr *th; + struct udphdr *uh; + u_short cksum; + + ip->ip_len = ntohs(ip->ip_len); + cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(ip->ip_p + ip->ip_len - (ip->ip_hl << 2))); + + switch (ip->ip_p) { + case IPPROTO_TCP: + th = (struct tcphdr *)(ip + 1); + /* + * Maybe it was set in + * libalias... + */ + th->th_x2 = 0; + th->th_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct tcphdr, th_sum); + break; + case IPPROTO_UDP: + uh = (struct udphdr *)(ip + 1); + uh->uh_sum = cksum; + mcl->m_pkthdr.csum_data = + offsetof(struct udphdr, uh_sum); + break; + } + /* No hw checksum offloading: do it ourselves */ + if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { + in_delayed_cksum(mcl); + mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } + ip->ip_len = htons(ip->ip_len); + } + args->m = mcl; + return (IP_FW_NAT); +} + +static struct cfg_nat * +lookup_nat(struct nat_list *l, int nat_id) +{ + struct cfg_nat *res; + + LIST_FOREACH(res, l, _next) { + if (res->id == nat_id) + break; + } + return res; +} + +static int +ipfw_nat_cfg(struct sockopt *sopt) +{ + struct cfg_nat *ptr, *ser_n; + char *buf; + struct ip_fw_chain *chain = &V_layer3_chain; + + buf = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + sooptcopyin(sopt, buf, NAT_BUF_LEN, sizeof(struct cfg_nat)); + ser_n = (struct cfg_nat *)buf; + + /* check valid parameter ser_n->id > 0 ? */ + /* + * Find/create nat rule. + */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, ser_n->id); + if (ptr == NULL) { + /* New rule: allocate and init new instance. */ + ptr = malloc(sizeof(struct cfg_nat), + M_IPFW, M_NOWAIT | M_ZERO); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + free(buf, M_IPFW); + return (ENOSPC); + } + ptr->lib = LibAliasInit(NULL); + if (ptr->lib == NULL) { + IPFW_WUNLOCK(chain); + free(ptr, M_IPFW); + free(buf, M_IPFW); + return (EINVAL); + } + LIST_INIT(&ptr->redir_chain); + } else { + /* Entry already present: temporarly unhook it. */ + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, ser_n->id); + } + IPFW_WUNLOCK(chain); + + /* + * Basic nat configuration. + */ + ptr->id = ser_n->id; + /* + * XXX - what if this rule doesn't nat any ip and just + * redirect? + * do we set aliasaddress to 0.0.0.0? + */ + ptr->ip = ser_n->ip; + ptr->redir_cnt = ser_n->redir_cnt; + ptr->mode = ser_n->mode; + LibAliasSetMode(ptr->lib, ser_n->mode, ser_n->mode); + LibAliasSetAddress(ptr->lib, ptr->ip); + memcpy(ptr->if_name, ser_n->if_name, IF_NAMESIZE); + + /* + * Redir and LSNAT configuration. 
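+	 * The sockopt buffer handed to add_redir_spool_cfg() below is
+	 * a flat serialization (sketch; sizes are the SOF_* constants):
+	 *
+	 *	[ struct cfg_nat ]
+	 *	[ cfg_redir #0 ][ cfg_spool x spool_cnt ]
+	 *	[ cfg_redir #1 ][ cfg_spool ... ]
+	 *	... repeated redir_cnt times
+	 *
+	 * which is why that parser simply advances an offset by
+	 * SOF_REDIR and SOF_SPOOL as it walks the buffer.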
+ */ + /* Delete old cfgs. */ + del_redir_spool_cfg(ptr, &ptr->redir_chain); + /* Add new entries. */ + add_redir_spool_cfg(&buf[(sizeof(struct cfg_nat))], ptr); + free(buf, M_IPFW); + IPFW_WLOCK(chain); + LIST_INSERT_HEAD(&chain->nat, ptr, _next); + IPFW_WUNLOCK(chain); + return (0); +} + +static int +ipfw_nat_del(struct sockopt *sopt) +{ + struct cfg_nat *ptr; + struct ip_fw_chain *chain = &V_layer3_chain; + int i; + + sooptcopyin(sopt, &i, sizeof i, sizeof i); + /* XXX validate i */ + IPFW_WLOCK(chain); + ptr = lookup_nat(&chain->nat, i); + if (ptr == NULL) { + IPFW_WUNLOCK(chain); + return (EINVAL); + } + LIST_REMOVE(ptr, _next); + flush_nat_ptrs(chain, i); + IPFW_WUNLOCK(chain); + del_redir_spool_cfg(ptr, &ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + return (0); +} + +static int +ipfw_nat_get_cfg(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *n; + struct cfg_redir *r; + struct cfg_spool *s; + int nat_cnt, off; + struct ip_fw_chain *chain; + int err = ENOSPC; + + chain = &V_layer3_chain; + nat_cnt = 0; + off = sizeof(nat_cnt); + + data = malloc(NAT_BUF_LEN, M_IPFW, M_WAITOK | M_ZERO); + IPFW_RLOCK(chain); + /* Serialize all the data. */ + LIST_FOREACH(n, &chain->nat, _next) { + nat_cnt++; + if (off + SOF_NAT >= NAT_BUF_LEN) + goto nospace; + bcopy(n, &data[off], SOF_NAT); + off += SOF_NAT; + LIST_FOREACH(r, &n->redir_chain, _next) { + if (off + SOF_REDIR >= NAT_BUF_LEN) + goto nospace; + bcopy(r, &data[off], SOF_REDIR); + off += SOF_REDIR; + LIST_FOREACH(s, &r->spool_chain, _next) { + if (off + SOF_SPOOL >= NAT_BUF_LEN) + goto nospace; + bcopy(s, &data[off], SOF_SPOOL); + off += SOF_SPOOL; + } + } + } + err = 0; /* all good */ +nospace: + IPFW_RUNLOCK(chain); + if (err == 0) { + bcopy(&nat_cnt, data, sizeof(nat_cnt)); + sooptcopyout(sopt, data, NAT_BUF_LEN); + } else { + printf("serialized data buffer not big enough:" + "please increase NAT_BUF_LEN\n"); + } + free(data, M_IPFW); + return (err); +} + +static int +ipfw_nat_get_log(struct sockopt *sopt) +{ + uint8_t *data; + struct cfg_nat *ptr; + int i, size; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + + IPFW_RLOCK(chain); + /* one pass to count, one to copy the data */ + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + i++; + } + size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); + data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); + if (data == NULL) { + IPFW_RUNLOCK(chain); + return (ENOSPC); + } + i = 0; + LIST_FOREACH(ptr, &chain->nat, _next) { + if (ptr->lib->logDesc == NULL) + continue; + bcopy(&ptr->id, &data[i], sizeof(int)); + i += sizeof(int); + bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); + i += LIBALIAS_BUF_SIZE; + } + IPFW_RUNLOCK(chain); + sooptcopyout(sopt, data, size); + free(data, M_IPFW); + return(0); +} + +static void +ipfw_nat_init(void) +{ + + IPFW_WLOCK(&V_layer3_chain); + /* init ipfw hooks */ + ipfw_nat_ptr = ipfw_nat; + lookup_nat_ptr = lookup_nat; + ipfw_nat_cfg_ptr = ipfw_nat_cfg; + ipfw_nat_del_ptr = ipfw_nat_del; + ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; + ipfw_nat_get_log_ptr = ipfw_nat_get_log; + IPFW_WUNLOCK(&V_layer3_chain); + V_ifaddr_event_tag = EVENTHANDLER_REGISTER( + ifaddr_event, ifaddr_change, + NULL, EVENTHANDLER_PRI_ANY); +} + +static void +ipfw_nat_destroy(void) +{ + struct cfg_nat *ptr, *ptr_temp; + struct ip_fw_chain *chain; + + chain = &V_layer3_chain; + IPFW_WLOCK(chain); + LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { + LIST_REMOVE(ptr, _next); + del_redir_spool_cfg(ptr, 
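+		/*
+		 * The entry was just unlinked above; free its libalias
+		 * redirect/spool state first, then the instance itself.
+		 * flush_nat_ptrs() below clears any cmd->nat pointers
+		 * cached in rules, so no rule is left referencing a
+		 * freed instance.
+		 */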
&ptr->redir_chain); + LibAliasUninit(ptr->lib); + free(ptr, M_IPFW); + } + EVENTHANDLER_DEREGISTER(ifaddr_event, V_ifaddr_event_tag); + flush_nat_ptrs(chain, -1 /* flush all */); + /* deregister ipfw_nat */ + ipfw_nat_ptr = NULL; + lookup_nat_ptr = NULL; + ipfw_nat_cfg_ptr = NULL; + ipfw_nat_del_ptr = NULL; + ipfw_nat_get_cfg_ptr = NULL; + ipfw_nat_get_log_ptr = NULL; + IPFW_WUNLOCK(chain); +} + +static int +ipfw_nat_modevent(module_t mod, int type, void *unused) +{ + int err = 0; + + switch (type) { + case MOD_LOAD: + ipfw_nat_init(); + break; + + case MOD_UNLOAD: + ipfw_nat_destroy(); + break; + + default: + return EOPNOTSUPP; + break; + } + return err; +} + +static moduledata_t ipfw_nat_mod = { + "ipfw_nat", + ipfw_nat_modevent, + 0 +}; + +DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); +MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); +MODULE_DEPEND(ipfw_nat, ipfw, 2, 2, 2); +MODULE_VERSION(ipfw_nat, 1); +/* end of file */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw_pfil.c b/freebsd/sys/netinet/ipfw/ip_fw_pfil.c new file mode 100644 index 00000000..8759f409 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw_pfil.c @@ -0,0 +1,417 @@ +#include + +/*- + * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#if !defined(KLD_MODULE) +#include +#include +#include +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif /* KLD_MODULE */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static VNET_DEFINE(int, fw_enable) = 1; +#define V_fw_enable VNET(fw_enable) + +#ifdef INET6 +static VNET_DEFINE(int, fw6_enable) = 1; +#define V_fw6_enable VNET(fw6_enable) +#endif + +int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); + +/* Forward declarations. 
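+ * ipfw_chg_hook() uses the stock sysctl handler signature; on this
+ * vintage of FreeBSD, SYSCTL_HANDLER_ARGS should expand to roughly
+ *
+ *	struct sysctl_oid *oidp, void *arg1, intptr_t arg2,
+ *	    struct sysctl_req *req
+ *
+ * (sketch from sys/sysctl.h), and arg1 is what lets the handler
+ * tell the INET and INET6 knobs apart below.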
*/ +static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); + +#ifdef SYSCTL_NODE + +SYSBEGIN(f1) + +SYSCTL_DECL(_net_inet_ip_fw); +SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw"); +#ifdef INET6 +SYSCTL_DECL(_net_inet6_ip6_fw); +SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, + ipfw_chg_hook, "I", "Enable ipfw+6"); +#endif /* INET6 */ + +SYSEND + +#endif /* SYSCTL_NODE */ + +/* + * The pfilter hook to pass packets to ipfw_chk and then to + * dummynet, divert, netgraph or other modules. + * The packet may be consumed. + */ +int +ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp) +{ + struct ip_fw_args args; + struct m_tag *tag; + int ipfw; + int ret; + + /* all the processing now uses ip_len in net format */ + if (mtod(*m0, struct ip *)->ip_v == 4) + SET_NET_IPLEN(mtod(*m0, struct ip *)); + + /* convert dir to IPFW values */ + dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; + bzero(&args, sizeof(args)); + +again: + /* + * extract and remove the tag if present. If we are left + * with onepass, optimize the outgoing path. + */ + tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); + if (tag != NULL) { + args.rule = *((struct ipfw_rule_ref *)(tag+1)); + m_tag_delete(*m0, tag); + if (args.rule.info & IPFW_ONEPASS) { + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return 0; + } + } + + args.m = *m0; + args.oif = dir == DIR_OUT ? ifp : NULL; + args.inp = inp; + + ipfw = ipfw_chk(&args); + *m0 = args.m; + + KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", + __func__)); + + /* breaking out of the switch means drop */ + ret = 0; /* default return value for pass */ + switch (ipfw) { + case IP_FW_PASS: + /* next_hop may be set by ipfw_chk */ + if (args.next_hop == NULL) + break; /* pass */ +#ifndef IPFIREWALL_FORWARD + ret = EACCES; +#else + { + struct m_tag *fwd_tag; + + /* Incoming packets should not be tagged so we do not + * m_tag_find. Outgoing packets may be tagged, so we + * reuse the tag if present. + */ + fwd_tag = (dir == DIR_IN) ? NULL : + m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + m_tag_unlink(*m0, fwd_tag); + } else { + fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, + sizeof(struct sockaddr_in), M_NOWAIT); + if (fwd_tag == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + } + bcopy(args.next_hop, (fwd_tag+1), sizeof(struct sockaddr_in)); + m_tag_prepend(*m0, fwd_tag); + + if (in_localip(args.next_hop->sin_addr)) + (*m0)->m_flags |= M_FASTFWD_OURS; + } +#endif + break; + + case IP_FW_DENY: + ret = EACCES; + break; /* i.e. drop */ + + case IP_FW_DUMMYNET: + ret = EACCES; + if (ip_dn_io_ptr == NULL) + break; /* i.e. drop */ + if (mtod(*m0, struct ip *)->ip_v == 4) + ret = ip_dn_io_ptr(m0, dir, &args); + else if (mtod(*m0, struct ip *)->ip_v == 6) + ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); + else + break; /* drop it */ + /* + * XXX should read the return value. + * dummynet normally eats the packet and sets *m0=NULL + * unless the packet can be sent immediately. In this + * case args is updated and we should re-run the + * check without clearing args. + */ + if (*m0 != NULL) + goto again; + break; + + case IP_FW_TEE: + case IP_FW_DIVERT: + if (ip_divert_ptr == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ipfw_divert(m0, dir, &args.rule, + (ipfw == IP_FW_TEE) ? 
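+		    /*
+		     * tee != 0 asks ipfw_divert() to duplicate the
+		     * packet and leave the original in *m0, so the
+		     * 'goto again' below re-runs the firewall on it;
+		     * the restart point travels in the MTAG_IPFW_RULE
+		     * tag that ipfw_chk() attached.
+		     */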
1 : 0); + /* continue processing for the original packet (tee). */ + if (*m0) + goto again; + break; + + case IP_FW_NGTEE: + case IP_FW_NETGRAPH: + if (ng_ipfw_input_p == NULL) { + ret = EACCES; + break; /* i.e. drop */ + } + ret = ng_ipfw_input_p(m0, dir, &args, + (ipfw == IP_FW_NGTEE) ? 1 : 0); + if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ + goto again; /* continue with packet */ + break; + + case IP_FW_NAT: + /* honor one-pass in case of successful nat */ + if (V_fw_one_pass) + break; /* ret is already 0 */ + goto again; + + case IP_FW_REASS: + goto again; /* continue with packet */ + + default: + KASSERT(0, ("%s: unknown retval", __func__)); + } + + if (ret != 0) { + if (*m0) + FREE_PKT(*m0); + *m0 = NULL; + } + if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) + SET_HOST_IPLEN(mtod(*m0, struct ip *)); + return ret; +} + +/* do the divert, return 1 on error 0 on success */ +static int +ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, + int tee) +{ + /* + * ipfw_chk() has already tagged the packet with the divert tag. + * If tee is set, copy packet and return original. + * If not tee, consume packet and send it to divert socket. + */ + struct mbuf *clone; + struct ip *ip; + struct m_tag *tag; + + /* Cloning needed for tee? */ + if (tee == 0) { + clone = *m0; /* use the original mbuf */ + *m0 = NULL; + } else { + clone = m_dup(*m0, M_DONTWAIT); + /* If we cannot duplicate the mbuf, we sacrifice the divert + * chain and continue with the tee-ed packet. + */ + if (clone == NULL) + return 1; + } + + /* + * Divert listeners can normally handle non-fragmented packets, + * but we can only reass in the non-tee case. + * This means that listeners on a tee rule may get fragments, + * and have to live with that. + * Note that we now have the 'reass' ipfw option so if we care + * we can do it before a 'tee'. + */ + ip = mtod(clone, struct ip *); + if (!tee && ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { + int hlen; + struct mbuf *reass; + + SET_HOST_IPLEN(ip); /* ip_reass wants host order */ + reass = ip_reass(clone); /* Reassemble packet. */ + if (reass == NULL) + return 0; /* not an error */ + /* if reass = NULL then it was consumed by ip_reass */ + /* + * IP header checksum fixup after reassembly and leave header + * in network byte order. + */ + ip = mtod(reass, struct ip *); + hlen = ip->ip_hl << 2; + SET_NET_IPLEN(ip); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(reass, hlen); + clone = reass; + } + /* attach a tag to the packet with the reinject info */ + tag = m_tag_alloc(MTAG_IPFW_RULE, 0, + sizeof(struct ipfw_rule_ref), M_NOWAIT); + if (tag == NULL) { + FREE_PKT(clone); + return 1; + } + *((struct ipfw_rule_ref *)(tag+1)) = *rule; + m_tag_prepend(clone, tag); + + /* Do the dirty job... */ + ip_divert_ptr(clone, incoming); + return 0; +} + +/* + * attach or detach hooks for a given protocol family + */ +static int +ipfw_hook(int onoff, int pf) +{ + struct pfil_head *pfh; + + pfh = pfil_head_get(PFIL_TYPE_AF, pf); + if (pfh == NULL) + return ENOENT; + + (void) (onoff ? 
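+	/*
+	 * pfil_add_hook() and pfil_remove_hook() share a signature,
+	 * so just select the function pointer; the long form would be
+	 *
+	 *	if (onoff)
+	 *		pfil_add_hook(ipfw_check_hook, NULL,
+	 *		    PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh);
+	 *	else
+	 *		pfil_remove_hook(... same arguments ...);
+	 */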
pfil_add_hook : pfil_remove_hook) + (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); + + return 0; +} + +int +ipfw_attach_hooks(int arg) +{ + int error = 0; + + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET); + else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { + error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ + printf("ipfw_hook() error\n"); + } +#ifdef INET6 + if (arg == 0) /* detach */ + ipfw_hook(0, AF_INET6); + else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { + error = ENOENT; + printf("ipfw6_hook() error\n"); + } +#endif + return error; +} + +int +ipfw_chg_hook(SYSCTL_HANDLER_ARGS) +{ + int enable; + int oldenable; + int error; + int af; + + if (arg1 == &VNET_NAME(fw_enable)) { + enable = V_fw_enable; + af = AF_INET; + } +#ifdef INET6 + else if (arg1 == &VNET_NAME(fw6_enable)) { + enable = V_fw6_enable; + af = AF_INET6; + } +#endif + else + return (EINVAL); + + oldenable = enable; + + error = sysctl_handle_int(oidp, &enable, 0, req); + + if (error) + return (error); + + enable = (enable) ? 1 : 0; + + if (enable == oldenable) + return (0); + + error = ipfw_hook(enable, af); + if (error) + return (error); + if (af == AF_INET) + V_fw_enable = enable; +#ifdef INET6 + else if (af == AF_INET6) + V_fw6_enable = enable; +#endif + + return (0); +} +/* end of file */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw_private.h b/freebsd/sys/netinet/ipfw/ip_fw_private.h new file mode 100644 index 00000000..c29ae0ad --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw_private.h @@ -0,0 +1,301 @@ +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IPFW2_PRIVATE_H +#define _IPFW2_PRIVATE_H + +/* + * Internal constants and data structures used by ipfw components + * and not meant to be exported outside the kernel. + */ + +#ifdef _KERNEL + +/* + * For platforms that do not have SYSCTL support, we wrap the + * SYSCTL_* into a function (one per file) to collect the values + * into an array at module initialization. The wrapping macros, + * SYSBEGIN() and SYSEND, are empty in the default case. 
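+ * A port that lacks SYSCTL support could, for instance, define the
+ * brackets as (hypothetical sketch, not part of this tree):
+ *
+ *	#define SYSBEGIN(x)	struct sysctlrec x[] = {
+ *	#define SYSEND		};
+ *
+ * so that the SYSCTL_* entries in between become initializers for a
+ * table that module init code can walk at load time.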
+ */ +#ifndef SYSBEGIN +#define SYSBEGIN(x) +#endif +#ifndef SYSEND +#define SYSEND +#endif + +/* Return values from ipfw_chk() */ +enum { + IP_FW_PASS = 0, + IP_FW_DENY, + IP_FW_DIVERT, + IP_FW_TEE, + IP_FW_DUMMYNET, + IP_FW_NETGRAPH, + IP_FW_NGTEE, + IP_FW_NAT, + IP_FW_REASS, +}; + +/* + * Structure for collecting parameters to dummynet for ip6_output forwarding + */ +struct _ip6dn_args { + struct ip6_pktopts *opt_or; + struct route_in6 ro_or; + int flags_or; + struct ip6_moptions *im6o_or; + struct ifnet *origifp_or; + struct ifnet *ifp_or; + struct sockaddr_in6 dst_or; + u_long mtu_or; + struct route_in6 ro_pmtu_or; +}; + + +/* + * Arguments for calling ipfw_chk() and dummynet_io(). We put them + * all into a structure because this way it is easier and more + * efficient to pass variables around and extend the interface. + */ +struct ip_fw_args { + struct mbuf *m; /* the mbuf chain */ + struct ifnet *oif; /* output interface */ + struct sockaddr_in *next_hop; /* forward address */ + + /* + * On return, it points to the matching rule. + * On entry, rule.slot > 0 means the info is valid and + * contains the the starting rule for an ipfw search. + * If chain_id == chain->id && slot >0 then jump to that slot. + * Otherwise, we locate the first rule >= rulenum:rule_id + */ + struct ipfw_rule_ref rule; /* match/restart info */ + + struct ether_header *eh; /* for bridged packets */ + + struct ipfw_flow_id f_id; /* grabbed from IP header */ + //uint32_t cookie; /* a cookie depending on rule action */ + struct inpcb *inp; + + struct _ip6dn_args dummypar; /* dummynet->ip6_output */ + struct sockaddr_in hopstore; /* store here if cannot use a pointer */ +}; + +MALLOC_DECLARE(M_IPFW); + +/* + * Hooks sometime need to know the direction of the packet + * (divert, dummynet, netgraph, ...) + * We use a generic definition here, with bit0-1 indicating the + * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the + * specific protocol + * indicating the protocol (if necessary) + */ +enum { + DIR_MASK = 0x3, + DIR_OUT = 0, + DIR_IN = 1, + DIR_FWD = 2, + DIR_DROP = 3, + PROTO_LAYER2 = 0x4, /* set for layer 2 */ + /* PROTO_DEFAULT = 0, */ + PROTO_IPV4 = 0x08, + PROTO_IPV6 = 0x10, + PROTO_IFB = 0x0c, /* layer2 + ifbridge */ + /* PROTO_OLDBDG = 0x14, unused, old bridge */ +}; + +/* wrapper for freeing a packet, in case we need to do more work */ +#ifndef FREE_PKT +#if defined(__linux__) || defined(_WIN32) +#define FREE_PKT(m) netisr_dispatch(-1, m) +#else +#define FREE_PKT(m) m_freem(m) +#endif +#endif /* !FREE_PKT */ + +/* + * Function definitions. + */ + +/* attach (arg = 1) or detach (arg = 0) hooks */ +int ipfw_attach_hooks(int); +#ifdef NOTYET +void ipfw_nat_destroy(void); +#endif + +/* In ip_fw_log.c */ +struct ip; +void ipfw_log_bpf(int); +void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, + struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, + struct ip *ip); +VNET_DECLARE(u_int64_t, norule_counter); +#define V_norule_counter VNET(norule_counter) +VNET_DECLARE(int, verbose_limit); +#define V_verbose_limit VNET(verbose_limit) + +/* In ip_fw_dynamic.c */ + +enum { /* result for matching dynamic rules */ + MATCH_REVERSE = 0, + MATCH_FORWARD, + MATCH_NONE, + MATCH_UNKNOWN, +}; + +/* + * The lock for dynamic rules is only used once outside the file, + * and only to release the result of lookup_dyn_rule(). + * Eventually we may implement it with a callback on the function. 
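+ * Typical caller pattern, sketched against the prototypes below:
+ *
+ *	ipfw_dyn_rule *q;
+ *	int dir;
+ *
+ *	q = ipfw_lookup_dyn_rule(&args->f_id, &dir, tcp);
+ *	if (q != NULL) {
+ *		... examine q while still holding the dyn lock ...
+ *		ipfw_dyn_unlock();
+ *	}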
+ */ +void ipfw_dyn_unlock(void); + +struct tcphdr; +struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, + u_int32_t, u_int32_t, int); +int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, + struct ip_fw_args *args, uint32_t tablearg); +ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, + int *match_direction, struct tcphdr *tcp); +void ipfw_remove_dyn_children(struct ip_fw *rule); +void ipfw_get_dynamic(char **bp, const char *ep); + +void ipfw_dyn_attach(void); /* uma_zcreate .... */ +void ipfw_dyn_detach(void); /* uma_zdestroy ... */ +void ipfw_dyn_init(void); /* per-vnet initialization */ +void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ +int ipfw_dyn_len(void); + +/* common variables */ +VNET_DECLARE(int, fw_one_pass); +#define V_fw_one_pass VNET(fw_one_pass) + +VNET_DECLARE(int, fw_verbose); +#define V_fw_verbose VNET(fw_verbose) + +VNET_DECLARE(struct ip_fw_chain, layer3_chain); +#define V_layer3_chain VNET(layer3_chain) + +VNET_DECLARE(u_int32_t, set_disable); +#define V_set_disable VNET(set_disable) + +VNET_DECLARE(int, autoinc_step); +#define V_autoinc_step VNET(autoinc_step) + +struct ip_fw_chain { + struct ip_fw *rules; /* list of rules */ + struct ip_fw *reap; /* list of rules to reap */ + struct ip_fw *default_rule; + int n_rules; /* number of static rules */ + int static_len; /* total len of static rules */ + struct ip_fw **map; /* array of rule ptrs to ease lookup */ + LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ + struct radix_node_head *tables[IPFW_TABLES_MAX]; +#if defined( __linux__ ) || defined( _WIN32 ) + spinlock_t rwmtx; + spinlock_t uh_lock; +#else + struct rwlock rwmtx; + struct rwlock uh_lock; /* lock for upper half */ +#endif + uint32_t id; /* ruleset id */ +}; + +struct sockopt; /* used by tcp_var.h */ + +/* + * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c + * so the variable and the macros must be here. 
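+ * The usual convention in this directory (sketch):
+ *
+ *	IPFW_UH_WLOCK(chain);		serialize control operations
+ *	map = get_map(chain, ...);	build a new rule array
+ *	old = swap_map(chain, map, n);	briefly takes IPFW_WLOCK
+ *	IPFW_UH_WUNLOCK(chain);
+ *
+ * i.e. uh_lock arbitrates rule-list writers while rwmtx protects the
+ * map actually consumed by the packet path.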
+ */ + +#define IPFW_LOCK_INIT(_chain) do { \ + rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ + rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ + } while (0) + +#define IPFW_LOCK_DESTROY(_chain) do { \ + rw_destroy(&(_chain)->rwmtx); \ + rw_destroy(&(_chain)->uh_lock); \ + } while (0) + +#define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) + +#define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) +#define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) +#define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) +#define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) + +#define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) +#define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) +#define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) +#define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) + +/* In ip_fw_sockopt.c */ +int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); +int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); +int ipfw_ctl(struct sockopt *sopt); +int ipfw_chk(struct ip_fw_args *args); +void ipfw_reap_rules(struct ip_fw *head); + +/* In ip_fw_pfil */ +int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, + struct inpcb *inp); + +/* In ip_fw_table.c */ +struct radix_node; +int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val); +int ipfw_init_tables(struct ip_fw_chain *ch); +void ipfw_destroy_tables(struct ip_fw_chain *ch); +int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); +int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value); +int ipfw_dump_table_entry(struct radix_node *rn, void *arg); +int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen); +int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); +int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); + +/* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ + +extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); + +typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); +typedef int ipfw_nat_cfg_t(struct sockopt *); + +extern ipfw_nat_t *ipfw_nat_ptr; +#define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) + +extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; +extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; + +#endif /* _KERNEL */ +#endif /* _IPFW2_PRIVATE_H */ diff --git a/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c b/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c new file mode 100644 index 00000000..6af09905 --- /dev/null +++ b/freebsd/sys/netinet/ipfw/ip_fw_sockopt.c @@ -0,0 +1,1345 @@ +#include + +/*- + * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa + * + * Supported by: Valeria Paoli + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Sockopt support for ipfw. The routines here implement + * the upper half of the ipfw code. + */ + +#if !defined(KLD_MODULE) +#include +#include +#include +#include +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include +#include + +#include +#include +#include +#include /* struct m_tag used by nested headers */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* hooks */ +#include +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); + +/* + * static variables followed by global ones (none in this file) + */ + +/* + * Find the smallest rule >= key, id. + * We could use bsearch but it is so simple that we code it directly + */ +int +ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) +{ + int i, lo, hi; + struct ip_fw *r; + + for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { + i = (lo + hi) / 2; + r = chain->map[i]; + if (r->rulenum < key) + lo = i + 1; /* continue from the next one */ + else if (r->rulenum > key) + hi = i; /* this might be good */ + else if (r->id < id) + lo = i + 1; /* continue from the next one */ + else /* r->id >= id */ + hi = i; /* this might be good */ + }; + return hi; +} + +/* + * allocate a new map, returns the chain locked. extra is the number + * of entries to add or delete. + */ +static struct ip_fw ** +get_map(struct ip_fw_chain *chain, int extra, int locked) +{ + + for (;;) { + struct ip_fw **map; + int i; + + i = chain->n_rules + extra; + map = malloc(i * sizeof(struct ip_fw *), M_IPFW, + locked ? M_NOWAIT : M_WAITOK); + if (map == NULL) { + printf("%s: cannot allocate map\n", __FUNCTION__); + return NULL; + } + if (!locked) + IPFW_UH_WLOCK(chain); + if (i >= chain->n_rules + extra) /* good */ + return map; + /* otherwise we lost the race, free and retry */ + if (!locked) + IPFW_UH_WUNLOCK(chain); + free(map, M_IPFW); + } +} + +/* + * swap the maps. It is supposed to be called with IPFW_UH_WLOCK + */ +static struct ip_fw ** +swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) +{ + struct ip_fw **old_map; + + IPFW_WLOCK(chain); + chain->id++; + chain->n_rules = new_len; + old_map = chain->map; + chain->map = new_map; + IPFW_WUNLOCK(chain); + return old_map; +} + +/* + * Add a new rule to the list. Copy the rule into a malloc'ed area, then + * possibly create a rule number and add the rule to the list. + * Update the rule_number in the input struct so the caller knows it as well. + * XXX DO NOT USE FOR THE DEFAULT RULE. 
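+ * The update is copy-and-swap: get_map() above builds a pointer
+ * array with room for one more rule, the old entries are copied
+ * around the insertion point, and swap_map() publishes the result so
+ * readers only ever see a consistent map.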
+ * Must be called without IPFW_UH held + */ +int +ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) +{ + struct ip_fw *rule; + int i, l, insert_before; + struct ip_fw **map; /* the new array of pointers */ + + if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) + return (EINVAL); + + l = RULESIZE(input_rule); + rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); + if (rule == NULL) + return (ENOSPC); + /* get_map returns with IPFW_UH_WLOCK if successful */ + map = get_map(chain, 1, 0 /* not locked */); + if (map == NULL) { + free(rule, M_IPFW); + return ENOSPC; + } + + bcopy(input_rule, rule, l); + /* clear fields not settable from userland */ + rule->x_next = NULL; + rule->next_rule = NULL; + rule->pcnt = 0; + rule->bcnt = 0; + rule->timestamp = 0; + + if (V_autoinc_step < 1) + V_autoinc_step = 1; + else if (V_autoinc_step > 1000) + V_autoinc_step = 1000; + /* find the insertion point, we will insert before */ + insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; + i = ipfw_find_rule(chain, insert_before, 0); + /* duplicate first part */ + if (i > 0) + bcopy(chain->map, map, i * sizeof(struct ip_fw *)); + map[i] = rule; + /* duplicate remaining part, we always have the default rule */ + bcopy(chain->map + i, map + i + 1, + sizeof(struct ip_fw *) *(chain->n_rules - i)); + if (rule->rulenum == 0) { + /* write back the number */ + rule->rulenum = i > 0 ? map[i-1]->rulenum : 0; + if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) + rule->rulenum += V_autoinc_step; + input_rule->rulenum = rule->rulenum; + } + + rule->id = chain->id + 1; + map = swap_map(chain, map, chain->n_rules + 1); + chain->static_len += l; + IPFW_UH_WUNLOCK(chain); + if (map) + free(map, M_IPFW); + return (0); +} + +/* + * Reclaim storage associated with a list of rules. This is + * typically the list created using remove_rule. + * A NULL pointer on input is handled correctly. + */ +void +ipfw_reap_rules(struct ip_fw *head) +{ + struct ip_fw *rule; + + while ((rule = head) != NULL) { + head = head->x_next; + free(rule, M_IPFW); + } +} + +/* + * Used by del_entry() to check if a rule should be kept. + * Returns 1 if the rule must be kept, 0 otherwise. + * + * Called with cmd = {0,1,5}. + * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; + * cmd == 1 matches on set numbers only, rule numbers are ignored; + * cmd == 5 matches on rule and set numbers. + * + * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. + * + * Rules to keep are + * (default || reserved || !match_set || !match_number) + * where + * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) + * // the default rule is always protected + * + * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) + * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") + * + * match_set ::= (cmd == 0 || rule->set == set) + * // set number is ignored for cmd == 0 + * + * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) + * // number is ignored for cmd == 1 or n == 0 + * + */ +static int +keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) +{ + return + (rule->rulenum == IPFW_DEFAULT_RULE) || + (cmd == 0 && n == 0 && rule->set == RESVD_SET) || + !(cmd == 0 || rule->set == set) || + !(cmd == 1 || n == 0 || n == rule->rulenum); +} + +/** + * Remove all rules with given number, or do set manipulation. + * Assumes chain != NULL && *chain != NULL. + * + * The argument is an uint32_t. 
The low 16 bit are the rule or set number; + * the next 8 bits are the new set; the top 8 bits indicate the command: + * + * 0 delete rules numbered "rulenum" + * 1 delete rules in set "rulenum" + * 2 move rules "rulenum" to set "new_set" + * 3 move rules from set "rulenum" to set "new_set" + * 4 swap sets "rulenum" and "new_set" + * 5 delete rules "rulenum" and set "new_set" + */ +static int +del_entry(struct ip_fw_chain *chain, uint32_t arg) +{ + struct ip_fw *rule; + uint32_t num; /* rule number or old_set */ + uint8_t cmd, new_set; + int start, end, i, ofs, n; + struct ip_fw **map = NULL; + int error = 0; + + num = arg & 0xffff; + cmd = (arg >> 24) & 0xff; + new_set = (arg >> 16) & 0xff; + + if (cmd > 5 || new_set > RESVD_SET) + return EINVAL; + if (cmd == 0 || cmd == 2 || cmd == 5) { + if (num >= IPFW_DEFAULT_RULE) + return EINVAL; + } else { + if (num > RESVD_SET) /* old_set */ + return EINVAL; + } + + IPFW_UH_WLOCK(chain); /* arbitrate writers */ + chain->reap = NULL; /* prepare for deletions */ + + switch (cmd) { + case 0: /* delete rules "num" (num == 0 matches all) */ + case 1: /* delete all rules in set N */ + case 5: /* delete rules with number N and set "new_set". */ + + /* + * Locate first rule to delete (start), the rule after + * the last one to delete (end), and count how many + * rules to delete (n). Always use keep_rule() to + * determine which rules to keep. + */ + n = 0; + if (cmd == 1) { + /* look for a specific set including RESVD_SET. + * Must scan the entire range, ignore num. + */ + new_set = num; + for (start = -1, end = i = 0; i < chain->n_rules; i++) { + if (keep_rule(chain->map[i], cmd, new_set, 0)) + continue; + if (start < 0) + start = i; + end = i; + n++; + } + end++; /* first non-matching */ + } else { + /* Optimized search on rule numbers */ + start = ipfw_find_rule(chain, num, 0); + for (end = start; end < chain->n_rules; end++) { + rule = chain->map[end]; + if (num > 0 && rule->rulenum != num) + break; + if (!keep_rule(rule, cmd, new_set, num)) + n++; + } + } + + if (n == 0) { + /* A flush request (arg == 0) on empty ruleset + * returns with no error. On the contrary, + * if there is no match on a specific request, + * we return EINVAL. + */ + error = (arg == 0) ? 0 : EINVAL; + break; + } + + /* We have something to delete. Allocate the new map */ + map = get_map(chain, -n, 1 /* locked */); + if (map == NULL) { + error = EINVAL; + break; + } + + /* 1. bcopy the initial part of the map */ + if (start > 0) + bcopy(chain->map, map, start * sizeof(struct ip_fw *)); + /* 2. copy active rules between start and end */ + for (i = ofs = start; i < end; i++) { + rule = chain->map[i]; + if (keep_rule(rule, cmd, new_set, num)) + map[ofs++] = rule; + } + /* 3. copy the final part of the map */ + bcopy(chain->map + end, map + ofs, + (chain->n_rules - end) * sizeof(struct ip_fw *)); + /* 4. swap the maps (under BH_LOCK) */ + map = swap_map(chain, map, chain->n_rules - n); + /* 5. now remove the rules deleted from the old map */ + for (i = start; i < end; i++) { + int l; + rule = map[i]; + if (keep_rule(rule, cmd, new_set, num)) + continue; + l = RULESIZE(rule); + chain->static_len -= l; + ipfw_remove_dyn_children(rule); + rule->x_next = chain->reap; + chain->reap = rule; + } + break; + + /* + * In the next 3 cases the loop stops at (n_rules - 1) + * because the default rule is never eligible.. 
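+	 * Worked example of the 'arg' encoding parsed above: moving
+	 * rule 100 to set 5 is cmd 2, i.e.
+	 *
+	 *	arg = (2 << 24) | (5 << 16) | 100 = 0x02050064
+	 *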
+ */ + + case 2: /* move rules with given RULE number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->rulenum == num) + rule->set = new_set; + } + break; + + case 3: /* move rules with given SET number to new set */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + } + break; + + case 4: /* swap two sets */ + for (i = 0; i < chain->n_rules - 1; i++) { + rule = chain->map[i]; + if (rule->set == num) + rule->set = new_set; + else if (rule->set == new_set) + rule->set = num; + } + break; + } + + rule = chain->reap; + chain->reap = NULL; + IPFW_UH_WUNLOCK(chain); + ipfw_reap_rules(rule); + if (map) + free(map, M_IPFW); + return error; +} + +/* + * Clear counters for a specific rule. + * Normally run under IPFW_UH_RLOCK, but these are idempotent ops + * so we only care that rules do not disappear. + */ +static void +clear_counters(struct ip_fw *rule, int log_only) +{ + ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); + + if (log_only == 0) { + rule->bcnt = rule->pcnt = 0; + rule->timestamp = 0; + } + if (l->o.opcode == O_LOG) + l->log_left = l->max_log; +} + +/** + * Reset some or all counters on firewall rules. + * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, + * the next 8 bits are the set number, the top 8 bits are the command: + * 0 work with rules from all set's; + * 1 work with rules only from specified set. + * Specified rule number is zero if we want to clear all entries. + * log_only is 1 if we only want to reset logs, zero otherwise. + */ +static int +zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) +{ + struct ip_fw *rule; + char *msg; + int i; + + uint16_t rulenum = arg & 0xffff; + uint8_t set = (arg >> 16) & 0xff; + uint8_t cmd = (arg >> 24) & 0xff; + + if (cmd > 1) + return (EINVAL); + if (cmd == 1 && set > RESVD_SET) + return (EINVAL); + + IPFW_UH_RLOCK(chain); + if (rulenum == 0) { + V_norule_counter = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + /* Skip rules not in our set. */ + if (cmd == 1 && rule->set != set) + continue; + clear_counters(rule, log_only); + } + msg = log_only ? "All logging counts reset" : + "Accounting cleared"; + } else { + int cleared = 0; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + if (rule->rulenum == rulenum) { + if (cmd == 0 || rule->set == set) + clear_counters(rule, log_only); + cleared = 1; + } + if (rule->rulenum > rulenum) + break; + } + if (!cleared) { /* we did not find any matching rules */ + IPFW_UH_RUNLOCK(chain); + return (EINVAL); + } + msg = log_only ? "logging count reset" : "cleared"; + } + IPFW_UH_RUNLOCK(chain); + + if (V_fw_verbose) { + int lev = LOG_SECURITY | LOG_NOTICE; + + if (rulenum) + log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); + else + log(lev, "ipfw: %s.\n", msg); + } + return (0); +} + +/* + * Check validity of the structure before insert. + * Rules are simple, so this mostly need to check rule sizes. 
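+ * For reference, a rule is a variable-size object laid out as
+ *
+ *	[ struct ip_fw header | cmd[0] ... cmd[cmd_len-1] ]
+ *
+ * where each instruction occupies F_LEN(cmd) 32-bit words, and the
+ * single action must start at act_ofs and be the last instruction.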
+ */ +static int +check_ipfw_struct(struct ip_fw *rule, int size) +{ + int l, cmdlen = 0; + int have_action=0; + ipfw_insn *cmd; + + if (size < sizeof(*rule)) { + printf("ipfw: rule too short\n"); + return (EINVAL); + } + /* first, check for valid size */ + l = RULESIZE(rule); + if (l != size) { + printf("ipfw: size mismatch (have %d want %d)\n", size, l); + return (EINVAL); + } + if (rule->act_ofs >= rule->cmd_len) { + printf("ipfw: bogus action offset (%u > %u)\n", + rule->act_ofs, rule->cmd_len - 1); + return (EINVAL); + } + /* + * Now go for the individual checks. Very simple ones, basically only + * instruction sizes. + */ + for (l = rule->cmd_len, cmd = rule->cmd ; + l > 0 ; l -= cmdlen, cmd += cmdlen) { + cmdlen = F_LEN(cmd); + if (cmdlen > l) { + printf("ipfw: opcode %d size truncated\n", + cmd->opcode); + return EINVAL; + } + switch (cmd->opcode) { + case O_PROBE_STATE: + case O_KEEP_STATE: + case O_PROTO: + case O_IP_SRC_ME: + case O_IP_DST_ME: + case O_LAYER2: + case O_IN: + case O_FRAG: + case O_DIVERTED: + case O_IPOPT: + case O_IPTOS: + case O_IPPRECEDENCE: + case O_IPVER: + case O_TCPWIN: + case O_TCPFLAGS: + case O_TCPOPTS: + case O_ESTAB: + case O_VERREVPATH: + case O_VERSRCREACH: + case O_ANTISPOOF: + case O_IPSEC: +#ifdef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: +#endif + case O_IP4: + case O_TAG: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + break; + + case O_SETFIB: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + if (cmd->arg1 >= rt_numfibs) { + printf("ipfw: invalid fib number %d\n", + cmd->arg1); + return EINVAL; + } + goto check_action; + + case O_UID: + case O_GID: + case O_JAIL: + case O_IP_SRC: + case O_IP_DST: + case O_TCPSEQ: + case O_TCPACK: + case O_PROB: + case O_ICMPTYPE: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_LIMIT: + if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) + goto bad_size; + break; + + case O_LOG: + if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) + goto bad_size; + + ((ipfw_insn_log *)cmd)->log_left = + ((ipfw_insn_log *)cmd)->max_log; + + break; + + case O_IP_SRC_MASK: + case O_IP_DST_MASK: + /* only odd command lengths */ + if ( !(cmdlen & 1) || cmdlen > 31) + goto bad_size; + break; + + case O_IP_SRC_SET: + case O_IP_DST_SET: + if (cmd->arg1 == 0 || cmd->arg1 > 256) { + printf("ipfw: invalid set size %d\n", + cmd->arg1); + return EINVAL; + } + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + (cmd->arg1+31)/32 ) + goto bad_size; + break; + + case O_IP_SRC_LOOKUP: + case O_IP_DST_LOOKUP: + if (cmd->arg1 >= IPFW_TABLES_MAX) { + printf("ipfw: invalid table number %d\n", + cmd->arg1); + return (EINVAL); + } + if (cmdlen != F_INSN_SIZE(ipfw_insn) && + cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && + cmdlen != F_INSN_SIZE(ipfw_insn_u32)) + goto bad_size; + break; + + case O_MACADDR2: + if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) + goto bad_size; + break; + + case O_NOP: + case O_IPID: + case O_IPTTL: + case O_IPLEN: + case O_TCPDATALEN: + case O_TAGGED: + if (cmdlen < 1 || cmdlen > 31) + goto bad_size; + break; + + case O_MAC_TYPE: + case O_IP_SRCPORT: + case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ + if (cmdlen < 2 || cmdlen > 31) + goto bad_size; + break; + + case O_RECV: + case O_XMIT: + case O_VIA: + if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) + goto bad_size; + 
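+			/*
+			 * Reminder: F_INSN_SIZE(t) is the size of type
+			 * t expressed in 32-bit words (see ip_fw.h),
+			 * which is why every check in this switch
+			 * compares cmdlen, a word count, against it.
+			 */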
break; + + case O_ALTQ: + if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) + goto bad_size; + break; + + case O_PIPE: + case O_QUEUE: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; + goto check_action; + + case O_FORWARD_IP: +#ifdef IPFIREWALL_FORWARD + if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) + goto bad_size; + goto check_action; +#else + return EINVAL; +#endif + + case O_DIVERT: + case O_TEE: + if (ip_divert_ptr == NULL) + return EINVAL; + else + goto check_size; + case O_NETGRAPH: + case O_NGTEE: + if (ng_ipfw_input_p == NULL) + return EINVAL; + else + goto check_size; + case O_NAT: + if (!IPFW_NAT_LOADED) + return EINVAL; + if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) + goto bad_size; + goto check_action; + case O_FORWARD_MAC: /* XXX not implemented yet */ + case O_CHECK_STATE: + case O_COUNT: + case O_ACCEPT: + case O_DENY: + case O_REJECT: +#ifdef INET6 + case O_UNREACH6: +#endif + case O_SKIPTO: + case O_REASS: +check_size: + if (cmdlen != F_INSN_SIZE(ipfw_insn)) + goto bad_size; +check_action: + if (have_action) { + printf("ipfw: opcode %d, multiple actions" + " not allowed\n", + cmd->opcode); + return EINVAL; + } + have_action = 1; + if (l != cmdlen) { + printf("ipfw: opcode %d, action must be" + " last opcode\n", + cmd->opcode); + return EINVAL; + } + break; +#ifdef INET6 + case O_IP6_SRC: + case O_IP6_DST: + if (cmdlen != F_INSN_SIZE(struct in6_addr) + + F_INSN_SIZE(ipfw_insn)) + goto bad_size; + break; + + case O_FLOW6ID: + if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + + ((ipfw_insn_u32 *)cmd)->o.arg1) + goto bad_size; + break; + + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + if ( !(cmdlen & 1) || cmdlen > 127) + goto bad_size; + break; + case O_ICMP6TYPE: + if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) + goto bad_size; + break; +#endif + + default: + switch (cmd->opcode) { +#ifndef INET6 + case O_IP6_SRC_ME: + case O_IP6_DST_ME: + case O_EXT_HDR: + case O_IP6: + case O_UNREACH6: + case O_IP6_SRC: + case O_IP6_DST: + case O_FLOW6ID: + case O_IP6_SRC_MASK: + case O_IP6_DST_MASK: + case O_ICMP6TYPE: + printf("ipfw: no IPv6 support in kernel\n"); + return EPROTONOSUPPORT; +#endif + default: + printf("ipfw: opcode %d, unknown opcode\n", + cmd->opcode); + return EINVAL; + } + } + } + if (have_action == 0) { + printf("ipfw: missing action\n"); + return EINVAL; + } + return 0; + +bad_size: + printf("ipfw: opcode %d size %d wrong\n", + cmd->opcode, cmdlen); + return EINVAL; +} + + +/* + * Translation of requests for compatibility with FreeBSD 7.2/8. + * a static variable tells us if we have an old client from userland, + * and if necessary we translate requests and responses between the + * two formats. + */ +static int is7 = 0; + +struct ip_fw7 { + struct ip_fw7 *next; /* linked list of rules */ + struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ + /* 'next_rule' is used to pass up 'set_disable' status */ + + uint16_t act_ofs; /* offset of action in 32-bit units */ + uint16_t cmd_len; /* # of 32-bit words in cmd */ + uint16_t rulenum; /* rule number */ + uint8_t set; /* rule set (0..31) */ + // #define RESVD_SET 31 /* set for default and persistent rules */ + uint8_t _pad; /* padding */ + // uint32_t id; /* rule id, only in v.8 */ + /* These fields are present in all rules. 
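+	 * The layout matches version 8's struct ip_fw except for the
+	 * missing 32-bit 'id' field above, which is why RULESIZE7()
+	 * differs from RULESIZE() and why the converters below shift
+	 * opcodes across the O_REASS insertion point.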
*/ + uint64_t pcnt; /* Packet counter */ + uint64_t bcnt; /* Byte counter */ + uint32_t timestamp; /* tv_sec of last match */ + + ipfw_insn cmd[1]; /* storage for commands */ +}; + + int convert_rule_to_7(struct ip_fw *rule); +int convert_rule_to_8(struct ip_fw *rule); + +#ifndef RULESIZE7 +#define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ + ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) +#endif + + +/* + * Copy the static and dynamic rules to the supplied buffer + * and return the amount of space actually used. + * Must be run under IPFW_UH_RLOCK + */ +static size_t +ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) +{ + char *bp = buf; + char *ep = bp + space; + struct ip_fw *rule, *dst; + int l, i; + time_t boot_seconds; + + boot_seconds = boottime.tv_sec; + for (i = 0; i < chain->n_rules; i++) { + rule = chain->map[i]; + + if (is7) { + /* Convert rule to FreeBSd 7.2 format */ + l = RULESIZE7(rule); + if (bp + l + sizeof(uint32_t) <= ep) { + int error; + bcopy(rule, bp, l + sizeof(uint32_t)); + error = convert_rule_to_7((struct ip_fw *) bp); + if (error) + return 0; /*XXX correct? */ + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, + &(((struct ip_fw7 *)bp)->next_rule), + sizeof(V_set_disable)); + if (((struct ip_fw7 *)bp)->timestamp) + ((struct ip_fw7 *)bp)->timestamp += boot_seconds; + bp += l; + } + continue; /* go to next rule */ + } + + /* normal mode, don't touch rules */ + l = RULESIZE(rule); + if (bp + l > ep) { /* should not happen */ + printf("overflow dumping static rules\n"); + break; + } + dst = (struct ip_fw *)bp; + bcopy(rule, dst, l); + /* + * XXX HACK. Store the disable mask in the "next" + * pointer in a wild attempt to keep the ABI the same. + * Why do we do this on EVERY rule? + */ + bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); + if (dst->timestamp) + dst->timestamp += boot_seconds; + bp += l; + } + ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ + return (bp - (char *)buf); +} + + +/** + * {set|get}sockopt parser. + */ +int +ipfw_ctl(struct sockopt *sopt) +{ +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + int error; + size_t size; + struct ip_fw *buf, *rule; + struct ip_fw_chain *chain; + u_int32_t rulenum[2]; + + error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); + if (error) + return (error); + + /* + * Disallow modifications in really-really secure mode, but still allow + * the logging counters to be reset. + */ + if (sopt->sopt_name == IP_FW_ADD || + (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { + error = securelevel_ge(sopt->sopt_td->td_ucred, 3); + if (error) + return (error); + } + + chain = &V_layer3_chain; + error = 0; + + switch (sopt->sopt_name) { + case IP_FW_GET: + /* + * pass up a copy of the current rules. Static rules + * come first (the last of which has number IPFW_DEFAULT_RULE), + * followed by a possibly empty list of dynamic rule. + * The last dynamic rule has NULL in the "next" field. + * + * Note that the calculated size is used to bound the + * amount of data returned to the user. The rule set may + * change between calculating the size and returning the + * data in which case we'll just return what fits. 
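+		 * The for(;;) loop below implements exactly that:
+		 * size the buffer with the chain unlocked, allocate,
+		 * re-check the required size under IPFW_UH_RLOCK, and
+		 * retry from scratch if the ruleset grew in between.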
+ */ + for (;;) { + int len = 0, want; + + size = chain->static_len; + size += ipfw_dyn_len(); + if (size >= sopt->sopt_valsize) + break; + buf = malloc(size, M_TEMP, M_WAITOK); + if (buf == NULL) + break; + IPFW_UH_RLOCK(chain); + /* check again how much space we need */ + want = chain->static_len + ipfw_dyn_len(); + if (size >= want) + len = ipfw_getrules(chain, buf, size); + IPFW_UH_RUNLOCK(chain); + if (size >= want) + error = sooptcopyout(sopt, buf, len); + free(buf, M_TEMP); + if (size >= want) + break; + } + break; + + case IP_FW_FLUSH: + /* locking is done within del_entry() */ + error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ + break; + + case IP_FW_ADD: + rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, rule, RULE_MAXSIZE, + sizeof(struct ip_fw7) ); + + /* + * If the size of commands equals RULESIZE7 then we assume + * a FreeBSD7.2 binary is talking to us (set is7=1). + * is7 is persistent so the next 'ipfw list' command + * will use this format. + * NOTE: If wrong version is guessed (this can happen if + * the first ipfw command is 'ipfw [pipe] list') + * the ipfw binary may crash or loop infinitly... + */ + if (sopt->sopt_valsize == RULESIZE7(rule)) { + is7 = 1; + error = convert_rule_to_8(rule); + if (error) + return error; + if (error == 0) + error = check_ipfw_struct(rule, RULESIZE(rule)); + } else { + is7 = 0; + if (error == 0) + error = check_ipfw_struct(rule, sopt->sopt_valsize); + } + if (error == 0) { + /* locking is done within ipfw_add_rule() */ + error = ipfw_add_rule(chain, rule); + size = RULESIZE(rule); + if (!error && sopt->sopt_dir == SOPT_GET) { + if (is7) { + error = convert_rule_to_7(rule); + size = RULESIZE7(rule); + if (error) + return error; + } + error = sooptcopyout(sopt, rule, size); + } + } + free(rule, M_TEMP); + break; + + case IP_FW_DEL: + /* + * IP_FW_DEL is used for deleting single rules or sets, + * and (ab)used to atomically manipulate sets. Argument size + * is used to distinguish between the two: + * sizeof(u_int32_t) + * delete single rule or set of rules, + * or reassign rules (or sets) to a different set. + * 2*sizeof(u_int32_t) + * atomic disable/enable sets. + * first u_int32_t contains sets to be disabled, + * second u_int32_t contains sets to be enabled. 
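+		 * Worked example of the two-argument form: to
+		 * atomically disable sets 1 and 2 and enable set 3,
+		 * userland passes
+		 *
+		 *	rulenum[0] = (1 << 1) | (1 << 2);  disable mask
+		 *	rulenum[1] = (1 << 3);             enable mask
+		 *
+		 * and the handler folds both into V_set_disable in a
+		 * single write-locked update.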
+ */ + error = sooptcopyin(sopt, rulenum, + 2*sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + size = sopt->sopt_valsize; + if (size == sizeof(u_int32_t) && rulenum[0] != 0) { + /* delete or reassign, locking done in del_entry() */ + error = del_entry(chain, rulenum[0]); + } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ + IPFW_UH_WLOCK(chain); + V_set_disable = + (V_set_disable | rulenum[0]) & ~rulenum[1] & + ~(1<sopt_val != 0) { + error = sooptcopyin(sopt, rulenum, + sizeof(u_int32_t), sizeof(u_int32_t)); + if (error) + break; + } + error = zero_entry(chain, rulenum[0], + sopt->sopt_name == IP_FW_RESETLOG); + break; + + /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ + case IP_FW_TABLE_ADD: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_add_table_entry(chain, ent.tbl, + ent.addr, ent.masklen, ent.value); + } + break; + + case IP_FW_TABLE_DEL: + { + ipfw_table_entry ent; + + error = sooptcopyin(sopt, &ent, + sizeof(ent), sizeof(ent)); + if (error) + break; + error = ipfw_del_table_entry(chain, ent.tbl, + ent.addr, ent.masklen); + } + break; + + case IP_FW_TABLE_FLUSH: + { + u_int16_t tbl; + + error = sooptcopyin(sopt, &tbl, + sizeof(tbl), sizeof(tbl)); + if (error) + break; + IPFW_WLOCK(chain); + error = ipfw_flush_table(chain, tbl); + IPFW_WUNLOCK(chain); + } + break; + + case IP_FW_TABLE_GETSIZE: + { + u_int32_t tbl, cnt; + + if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), + sizeof(tbl)))) + break; + IPFW_RLOCK(chain); + error = ipfw_count_table(chain, tbl, &cnt); + IPFW_RUNLOCK(chain); + if (error) + break; + error = sooptcopyout(sopt, &cnt, sizeof(cnt)); + } + break; + + case IP_FW_TABLE_LIST: + { + ipfw_table *tbl; + + if (sopt->sopt_valsize < sizeof(*tbl)) { + error = EINVAL; + break; + } + size = sopt->sopt_valsize; + tbl = malloc(size, M_TEMP, M_WAITOK); + error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); + if (error) { + free(tbl, M_TEMP); + break; + } + tbl->size = (size - sizeof(*tbl)) / + sizeof(ipfw_table_entry); + IPFW_RLOCK(chain); + error = ipfw_dump_table(chain, tbl); + IPFW_RUNLOCK(chain); + if (error) { + free(tbl, M_TEMP); + break; + } + error = sooptcopyout(sopt, tbl, size); + free(tbl, M_TEMP); + } + break; + + /*--- NAT operations are protected by the IPFW_LOCK ---*/ + case IP_FW_NAT_CFG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_DEL: + if (IPFW_NAT_LOADED) + error = ipfw_nat_del_ptr(sopt); + else { + printf("IP_FW_NAT_DEL: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_CONFIG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_cfg_ptr(sopt); + else { + printf("IP_FW_NAT_GET_CFG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + case IP_FW_NAT_GET_LOG: + if (IPFW_NAT_LOADED) + error = ipfw_nat_get_log_ptr(sopt); + else { + printf("IP_FW_NAT_GET_LOG: %s\n", + "ipfw_nat not present, please load it"); + error = EINVAL; + } + break; + + default: + printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); + error = EINVAL; + } + + return (error); +#undef RULE_MAXSIZE +} + + +#define RULE_MAXSIZE (256*sizeof(u_int32_t)) + +/* Functions to convert rules 7.2 <==> 8.0 */ +int +convert_rule_to_7(struct ip_fw *rule) +{ + /* Used to modify original rule */ + struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; + /* 
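+	 * The conversion happens in place: rule7 above aliases the
+	 * same memory as 'rule', so before any field is overwritten
+	 * we keep a scratch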
+	/* copy of original rule, version 8 */
+	struct ip_fw *tmp;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+
+	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL) {
+		return 1; /* XXX error */
+	}
+	bcopy(rule, tmp, RULE_MAXSIZE);
+
+	/* Copy fields */
+	rule7->_pad = tmp->_pad;
+	rule7->set = tmp->set;
+	rule7->rulenum = tmp->rulenum;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->act_ofs = tmp->act_ofs;
+	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
+	rule7->next = (struct ip_fw7 *)tmp->x_next;
+	rule7->cmd_len = tmp->cmd_len;
+	rule7->pcnt = tmp->pcnt;
+	rule7->bcnt = tmp->bcnt;
+	rule7->timestamp = tmp->timestamp;
+
+	/* Copy commands */
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS does not exist in the 7.2 version, so
+			 * decrement the opcode if it comes after O_REASS
+			 */
+			dst->opcode--;
+
+		if (ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			/* do not leak the temporary copy */
+			free(tmp, M_TEMP);
+			return EINVAL;
+		}
+	}
+	free(tmp, M_TEMP);
+
+	return 0;
+}
+
+int
+convert_rule_to_8(struct ip_fw *rule)
+{
+	/* Used to modify original rule */
+	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
+
+	/* Used to copy commands */
+	ipfw_insn *ccmd, *dst;
+	int ll = 0, ccmdlen = 0;
+
+	/* Copy of original rule */
+	struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
+	if (tmp == NULL) {
+		return 1; /* XXX error */
+	}
+
+	bcopy(rule7, tmp, RULE_MAXSIZE);
+
+	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
+	    ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
+		ccmdlen = F_LEN(ccmd);
+
+		bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t));
+
+		if (dst->opcode > O_NAT)
+			/* O_REASS does not exist in the 7.2 version, so
+			 * increment the opcode if it comes after O_REASS
+			 */
+			dst->opcode++;
+
+		if (ccmdlen > ll) {
+			printf("ipfw: opcode %d size truncated\n",
+			    ccmd->opcode);
+			/* do not leak the temporary copy */
+			free(tmp, M_TEMP);
+			return EINVAL;
+		}
+	}
+
+	rule->_pad = tmp->_pad;
+	rule->set = tmp->set;
+	rule->rulenum = tmp->rulenum;
+	rule->cmd_len = tmp->cmd_len;
+	rule->act_ofs = tmp->act_ofs;
+	rule->next_rule = (struct ip_fw *)tmp->next_rule;
+	rule->x_next = (struct ip_fw *)tmp->next;
+	rule->cmd_len = tmp->cmd_len;
+	rule->id = 0; /* XXX see if is ok = 0 */
+	rule->pcnt = tmp->pcnt;
+	rule->bcnt = tmp->bcnt;
+	rule->timestamp = tmp->timestamp;
+
+	free(tmp, M_TEMP);
+	return 0;
+}
+
+/* end of file */
diff --git a/freebsd/sys/netinet/ipfw/ip_fw_table.c b/freebsd/sys/netinet/ipfw/ip_fw_table.c
new file mode 100644
index 00000000..39a1dfcc
--- /dev/null
+++ b/freebsd/sys/netinet/ipfw/ip_fw_table.c
@@ -0,0 +1,288 @@
+#include
+
+/*-
+ * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + * Lookup table support for ipfw + * + * Lookup tables are implemented (at the moment) using the radix + * tree used for routing tables. Tables store key-value entries, where + * keys are network prefixes (addr/masklen), and values are integers. + * As a degenerate case we can interpret keys as 32-bit integers + * (with a /32 mask). + * + * The table is protected by the IPFW lock even for manipulation coming + * from userland, because operations are typically fast. + */ + +#if !defined(KLD_MODULE) +#include +#include +#include +#include +#ifndef INET +#error IPFIREWALL requires INET. +#endif /* INET */ +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include /* ip_fw.h requires IFNAMSIZ */ +#include +#include +#include + +#include +#include /* struct ipfw_rule_ref */ +#include +#include /* LIST_HEAD */ +#include + +#ifdef MAC +#include +#endif + +MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); + +struct table_entry { + struct radix_node rn[2]; + struct sockaddr_in addr, mask; + u_int32_t value; +}; + +/* + * The radix code expects addr and mask to be array of bytes, + * with the first byte being the length of the array. rn_inithead + * is called with the offset in bits of the lookup key within the + * array. If we use a sockaddr_in as the underlying type, + * sin_len is conveniently located at offset 0, sin_addr is at + * offset 4 and normally aligned. + * But for portability, let's avoid assumption and make the code explicit + */ +#define KEY_LEN(v) *((uint8_t *)&(v)) +#define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) + +int +ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen, uint32_t value) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct radix_node *rn; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + ent = malloc(sizeof(*ent), M_IPFW_TBL, M_NOWAIT | M_ZERO); + if (ent == NULL) + return (ENOMEM); + ent->value = value; + KEY_LEN(ent->addr) = KEY_LEN(ent->mask) = 8; + ent->mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); + ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + rn = rnh->rnh_addaddr(&ent->addr, &ent->mask, rnh, (void *)ent); + if (rn == NULL) { + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (EEXIST); + } + IPFW_WUNLOCK(ch); + return (0); +} + +int +ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint8_t mlen) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa, mask; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = KEY_LEN(mask) = 8; + mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); + sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; + IPFW_WLOCK(ch); + ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh); + if (ent == NULL) { + IPFW_WUNLOCK(ch); + return (ESRCH); + } + IPFW_WUNLOCK(ch); + free(ent, M_IPFW_TBL); + return (0); +} + +static int +flush_table_entry(struct radix_node *rn, void *arg) +{ + struct radix_node_head * const rnh = arg; + struct table_entry *ent; + + ent = (struct table_entry *) + rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); + if (ent != NULL) + free(ent, M_IPFW_TBL); + return (0); +} + +int +ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) +{ + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + KASSERT(rnh != NULL, ("NULL IPFW table")); + rnh->rnh_walktree(rnh, flush_table_entry, rnh); + return (0); +} + +void +ipfw_destroy_tables(struct ip_fw_chain *ch) +{ + uint16_t tbl; + struct radix_node_head *rnh; + + IPFW_WLOCK_ASSERT(ch); + + for (tbl = 0; tbl < IPFW_TABLES_MAX; tbl++) { + ipfw_flush_table(ch, tbl); + rnh = ch->tables[tbl]; + rn_detachhead((void **)&rnh); + } +} + +int +ipfw_init_tables(struct ip_fw_chain *ch) +{ + int i; + uint16_t j; + + for (i = 0; i < IPFW_TABLES_MAX; i++) { + if (!rn_inithead((void **)&ch->tables[i], KEY_OFS)) { + for (j = 0; j < i; j++) { + (void) ipfw_flush_table(ch, j); + } + return (ENOMEM); + } + } + return (0); +} + +int +ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, + uint32_t *val) +{ + struct radix_node_head *rnh; + struct table_entry *ent; + struct sockaddr_in sa; + + if (tbl >= IPFW_TABLES_MAX) + return (0); + rnh = ch->tables[tbl]; + KEY_LEN(sa) = 8; + sa.sin_addr.s_addr = addr; + ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); + if (ent != NULL) { + *val = ent->value; + return (1); + } + return (0); +} + +static int +count_table_entry(struct radix_node *rn, void *arg) +{ + u_int32_t * const cnt = arg; + + (*cnt)++; + return (0); +} + +int +ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) +{ + struct radix_node_head *rnh; + + if (tbl >= IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl]; + *cnt = 0; + rnh->rnh_walktree(rnh, count_table_entry, cnt); + return (0); +} + +static int +dump_table_entry(struct radix_node *rn, void *arg) +{ + struct table_entry * const n = (struct table_entry *)rn; + ipfw_table * const tbl = arg; + ipfw_table_entry *ent; + + if (tbl->cnt == tbl->size) + return (1); + ent = &tbl->ent[tbl->cnt]; + ent->tbl = tbl->tbl; + if (in_nullhost(n->mask.sin_addr)) + ent->masklen = 0; + else + ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); + ent->addr = n->addr.sin_addr.s_addr; + ent->value = n->value; + tbl->cnt++; + return (0); +} + +int +ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) +{ + struct radix_node_head *rnh; + + if (tbl->tbl >= 
IPFW_TABLES_MAX) + return (EINVAL); + rnh = ch->tables[tbl->tbl]; + tbl->cnt = 0; + rnh->rnh_walktree(rnh, dump_table_entry, tbl); + return (0); +} +/* end of file */ diff --git a/freebsd/sys/netinet/libalias/alias.c b/freebsd/sys/netinet/libalias/alias.c new file mode 100644 index 00000000..e5c5138d --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias.c @@ -0,0 +1,1793 @@ +#include + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + Alias.c provides supervisory control for the functions of the + packet aliasing software. It consists of routines to monitor + TCP connection state, protocol-specific aliasing routines, + fragment handling and the following outside world functional + interfaces: SaveFragmentPtr, GetFragmentPtr, FragmentAliasIn, + PacketAliasIn and PacketAliasOut. + + The other C program files are briefly described. The data + structure framework which holds information needed to translate + packets is encapsulated in alias_db.c. Data is accessed by + function calls, so other segments of the program need not know + about the underlying data structures. Alias_ftp.c contains + special code for modifying the ftp PORT command used to establish + data connections, while alias_irc.c does the same for IRC + DCC. Alias_util.c contains a few utility routines. + + Version 1.0 August, 1996 (cjm) + + Version 1.1 August 20, 1996 (cjm) + PPP host accepts incoming connections for ports 0 to 1023. + (Gary Roberts pointed out the need to handle incoming + connections.) + + Version 1.2 September 7, 1996 (cjm) + Fragment handling error in alias_db.c corrected. + (Tom Torrance helped fix this problem.) + + Version 1.4 September 16, 1996 (cjm) + - A more generalized method for handling incoming + connections, without the 0-1023 restriction, is + implemented in alias_db.c + - Improved ICMP support in alias.c. Traceroute + packet streams can now be correctly aliased. + - TCP connection closing logic simplified in + alias.c and now allows for additional 1 minute + "grace period" after FIN or RST is observed. + + Version 1.5 September 17, 1996 (cjm) + Corrected error in handling incoming UDP packets with 0 checksum. + (Tom Torrance helped fix this problem.) 
+ + Version 1.6 September 18, 1996 (cjm) + Simplified ICMP aliasing scheme. Should now support + traceroute from Win95 as well as FreeBSD. + + Version 1.7 January 9, 1997 (cjm) + - Out-of-order fragment handling. + - IP checksum error fixed for ftp transfers + from aliasing host. + - Integer return codes added to all + aliasing/de-aliasing functions. + - Some obsolete comments cleaned up. + - Differential checksum computations for + IP header (TCP, UDP and ICMP were already + differential). + + Version 2.1 May 1997 (cjm) + - Added support for outgoing ICMP error + messages. + - Added two functions PacketAliasIn2() + and PacketAliasOut2() for dynamic address + control (e.g. round-robin allocation of + incoming packets). + + Version 2.2 July 1997 (cjm) + - Rationalized API function names to begin + with "PacketAlias..." + - Eliminated PacketAliasIn2() and + PacketAliasOut2() as poorly conceived. + + Version 2.3 Dec 1998 (dillon) + - Major bounds checking additions, see FreeBSD/CVS + + Version 3.1 May, 2000 (salander) + - Added hooks to handle PPTP. + + Version 3.2 July, 2000 (salander and satoh) + - Added PacketUnaliasOut routine. + - Added hooks to handle RTSP/RTP. + + See HISTORY file for additional revisions. +*/ + +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +/* + * Define libalias SYSCTL Node + */ +#ifdef SYSCTL_NODE + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, alias, CTLFLAG_RW, NULL, "Libalias sysctl API"); + +#endif + +static __inline int +twowords(void *p) +{ + uint8_t *c = p; + +#if BYTE_ORDER == LITTLE_ENDIAN + uint16_t s1 = ((uint16_t)c[1] << 8) + (uint16_t)c[0]; + uint16_t s2 = ((uint16_t)c[3] << 8) + (uint16_t)c[2]; +#else + uint16_t s1 = ((uint16_t)c[0] << 8) + (uint16_t)c[1]; + uint16_t s2 = ((uint16_t)c[2] << 8) + (uint16_t)c[3]; +#endif + return (s1 + s2); +} + +/* TCP Handling Routines + + TcpMonitorIn() -- These routines monitor TCP connections, and + TcpMonitorOut() delete a link when a connection is closed. + +These routines look for SYN, FIN and RST flags to determine when TCP +connections open and close. When a TCP connection closes, the data +structure containing packet aliasing information is deleted after +a timeout period. 
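+
+Editorial summary (added for clarity; derived from the TcpMonitorIn()
+and TcpMonitorOut() code that follows): the per-direction state kept by
+these routines moves as
+
+	NOT_CONNECTED --- SYN seen --------> CONNECTED
+	NOT_CONNECTED --- RST seen --------> DISCONNECTED
+	CONNECTED ------- FIN or RST seen -> DISCONNECTED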
+*/ + +/* Local prototypes */ +static void TcpMonitorIn(u_char, struct alias_link *); + +static void TcpMonitorOut(u_char, struct alias_link *); + + +static void +TcpMonitorIn(u_char th_flags, struct alias_link *lnk) +{ + + switch (GetStateIn(lnk)) { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (th_flags & TH_RST) + SetStateIn(lnk, ALIAS_TCP_STATE_DISCONNECTED); + else if (th_flags & TH_SYN) + SetStateIn(lnk, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (th_flags & (TH_FIN | TH_RST)) + SetStateIn(lnk, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + +static void +TcpMonitorOut(u_char th_flags, struct alias_link *lnk) +{ + + switch (GetStateOut(lnk)) { + case ALIAS_TCP_STATE_NOT_CONNECTED: + if (th_flags & TH_RST) + SetStateOut(lnk, ALIAS_TCP_STATE_DISCONNECTED); + else if (th_flags & TH_SYN) + SetStateOut(lnk, ALIAS_TCP_STATE_CONNECTED); + break; + case ALIAS_TCP_STATE_CONNECTED: + if (th_flags & (TH_FIN | TH_RST)) + SetStateOut(lnk, ALIAS_TCP_STATE_DISCONNECTED); + break; + } +} + + + + + +/* Protocol Specific Packet Aliasing Routines + + IcmpAliasIn(), IcmpAliasIn1(), IcmpAliasIn2() + IcmpAliasOut(), IcmpAliasOut1(), IcmpAliasOut2() + ProtoAliasIn(), ProtoAliasOut() + UdpAliasIn(), UdpAliasOut() + TcpAliasIn(), TcpAliasOut() + +These routines handle protocol specific details of packet aliasing. +One may observe a certain amount of repetitive arithmetic in these +functions, the purpose of which is to compute a revised checksum +without actually summing over the entire data packet, which could be +unnecessarily time consuming. + +The purpose of the packet aliasing routines is to replace the source +address of the outgoing packet and then correctly put it back for +any incoming packets. For TCP and UDP, ports are also re-mapped. + +For ICMP echo/timestamp requests and replies, the following scheme +is used: the ID number is replaced by an alias for the outgoing +packet. + +ICMP error messages are handled by looking at the IP fragment +in the data section of the message. + +For TCP and UDP protocols, a port number is chosen for an outgoing +packet, and then incoming packets are identified by IP address and +port numbers. For TCP packets, there is additional logic in the event +that sequence and ACK numbers have been altered (as in the case for +FTP data port commands). + +The port numbers used by the packet aliasing module are not true +ports in the Unix sense. No sockets are actually bound to ports. +They are more correctly thought of as placeholders. + +All packets go through the aliasing mechanism, whether they come from +the gateway machine or other machines on a local area network. 
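+
+Editorial sketch of the differential checksum idea (added; not part of
+the original sources): when a 16-bit word of the packet changes from an
+old to a new value, the ones-complement checksum can be patched without
+re-summing the packet (cf. RFC 1624):
+
+	int accumulate;
+
+	accumulate = old_word;		(e.g. the original port)
+	accumulate -= new_word;		(the aliased port)
+	ADJUST_CHECKSUM(accumulate, cksum);
+
+This add-the-old, subtract-the-new pattern is used throughout the
+routines below, with twowords() supplying the two 16-bit halves of an
+IP address.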
+*/ + + +/* Local prototypes */ +static int IcmpAliasIn1(struct libalias *, struct ip *); +static int IcmpAliasIn2(struct libalias *, struct ip *); +static int IcmpAliasIn(struct libalias *, struct ip *); + +static int IcmpAliasOut1(struct libalias *, struct ip *, int create); +static int IcmpAliasOut2(struct libalias *, struct ip *); +static int IcmpAliasOut(struct libalias *, struct ip *, int create); + +static int ProtoAliasIn(struct libalias *la, struct in_addr ip_src, + struct in_addr *ip_dst, u_char ip_p, u_short *ip_sum); +static int ProtoAliasOut(struct libalias *la, struct in_addr *ip_src, + struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, + int create); + +static int UdpAliasIn(struct libalias *, struct ip *); +static int UdpAliasOut(struct libalias *, struct ip *, int, int create); + +static int TcpAliasIn(struct libalias *, struct ip *); +static int TcpAliasOut(struct libalias *, struct ip *, int, int create); + + +static int +IcmpAliasIn1(struct libalias *la, struct ip *pip) +{ + + LIBALIAS_LOCK_ASSERT(la); +/* + De-alias incoming echo and timestamp replies. + Alias incoming echo and timestamp requests. +*/ + struct alias_link *lnk; + struct icmp *ic; + + ic = (struct icmp *)ip_next(pip); + +/* Get source address from ICMP data field and restore original data */ + lnk = FindIcmpIn(la, pip->ip_src, pip->ip_dst, ic->icmp_id, 1); + if (lnk != NULL) { + u_short original_id; + int accumulate; + + original_id = GetOriginalPort(lnk); + +/* Adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Put original sequence number back in */ + ic->icmp_id = original_id; + +/* Put original address back into IP header */ + { + struct in_addr original_address; + + original_address = GetOriginalAddress(lnk); + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_dst, 2); + pip->ip_dst = original_address; + } + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + +static int +IcmpAliasIn2(struct libalias *la, struct ip *pip) +{ + + LIBALIAS_LOCK_ASSERT(la); +/* + Alias incoming ICMP error messages containing + IP header and first 64 bits of datagram. 
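+
+Editorial note (added): the packet handled here nests headers as
+
+	outer IP | outer ICMP error | embedded IP | first 8 bytes of the
+	                                            UDP/TCP/ICMP header
+
+so ip_next(pip) below locates the outer ICMP header, and ip_next() of
+the embedded IP header locates the transport header fields on which the
+link lookup is keyed.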
+*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *lnk; + + ic = (struct icmp *)ip_next(pip); + ip = &ic->icmp_ip; + + ud = (struct udphdr *)ip_next(ip); + tc = (struct tcphdr *)ip_next(ip); + ic2 = (struct icmp *)ip_next(ip); + + if (ip->ip_p == IPPROTO_UDP) + lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + lnk = FindUdpTcpIn(la, ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + lnk = FindIcmpIn(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + lnk = NULL; + } else + lnk = NULL; + + if (lnk != NULL) { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) { + int accumulate, accumulate2; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(lnk); + original_port = GetOriginalPort(lnk); + +/* Adjust ICMP checksum */ + accumulate = twowords(&ip->ip_src); + accumulate -= twowords(&original_address); + accumulate += ud->uh_sport; + accumulate -= original_port; + accumulate2 = accumulate; + accumulate2 += ip->ip_sum; + ADJUST_CHECKSUM(accumulate, ip->ip_sum); + accumulate2 -= ip->ip_sum; + ADJUST_CHECKSUM(accumulate2, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_dst, 2); + pip->ip_dst = original_address; + +/* Un-alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_src = original_address; + ud->uh_sport = original_port; + } else if (ip->ip_p == IPPROTO_ICMP) { + int accumulate, accumulate2; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(lnk); + original_id = GetOriginalPort(lnk); + +/* Adjust ICMP checksum */ + accumulate = twowords(&ip->ip_src); + accumulate -= twowords(&original_address); + accumulate += ic2->icmp_id; + accumulate -= original_id; + accumulate2 = accumulate; + accumulate2 += ip->ip_sum; + ADJUST_CHECKSUM(accumulate, ip->ip_sum); + accumulate2 -= ip->ip_sum; + ADJUST_CHECKSUM(accumulate2, ic->icmp_cksum); + +/* Un-alias address in IP header */ + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_dst, 2); + pip->ip_dst = original_address; + +/* Un-alias address of original IP packet and sequence number of + embedded ICMP datagram */ + ip->ip_src = original_address; + ic2->icmp_id = original_id; + } + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasIn(struct libalias *la, struct ip *pip) +{ + int iresult; + struct icmp *ic; + + LIBALIAS_LOCK_ASSERT(la); +/* Return if proxy-only mode is enabled */ + if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return (PKT_ALIAS_OK); + + ic = (struct icmp *)ip_next(pip); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) { + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + if (ic->icmp_code == 0) { + iresult = IcmpAliasIn1(la, pip); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasIn2(la, pip); + break; + case ICMP_ECHO: + case ICMP_TSTAMP: + iresult = IcmpAliasIn1(la, pip); + break; + } + return (iresult); +} + + +static int +IcmpAliasOut1(struct libalias *la, struct ip *pip, int create) +{ +/* + Alias outgoing echo and timestamp requests. + De-alias outgoing echo and timestamp replies. 
+*/ + struct alias_link *lnk; + struct icmp *ic; + + LIBALIAS_LOCK_ASSERT(la); + ic = (struct icmp *)ip_next(pip); + +/* Save overwritten data for when echo packet returns */ + lnk = FindIcmpOut(la, pip->ip_src, pip->ip_dst, ic->icmp_id, create); + if (lnk != NULL) { + u_short alias_id; + int accumulate; + + alias_id = GetAliasPort(lnk); + +/* Since data field is being modified, adjust ICMP checksum */ + accumulate = ic->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* Alias sequence number */ + ic->icmp_id = alias_id; + +/* Change source address */ + { + struct in_addr alias_address; + + alias_address = GetAliasAddress(lnk); + DifferentialChecksum(&pip->ip_sum, + &alias_address, &pip->ip_src, 2); + pip->ip_src = alias_address; + } + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut2(struct libalias *la, struct ip *pip) +{ +/* + Alias outgoing ICMP error messages containing + IP header and first 64 bits of datagram. +*/ + struct ip *ip; + struct icmp *ic, *ic2; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + ic = (struct icmp *)ip_next(pip); + ip = &ic->icmp_ip; + + ud = (struct udphdr *)ip_next(ip); + tc = (struct tcphdr *)ip_next(ip); + ic2 = (struct icmp *)ip_next(ip); + + if (ip->ip_p == IPPROTO_UDP) + lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (ip->ip_p == IPPROTO_TCP) + lnk = FindUdpTcpOut(la, ip->ip_dst, ip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (ip->ip_p == IPPROTO_ICMP) { + if (ic2->icmp_type == ICMP_ECHO || ic2->icmp_type == ICMP_TSTAMP) + lnk = FindIcmpOut(la, ip->ip_dst, ip->ip_src, ic2->icmp_id, 0); + else + lnk = NULL; + } else + lnk = NULL; + + if (lnk != NULL) { + if (ip->ip_p == IPPROTO_UDP || ip->ip_p == IPPROTO_TCP) { + int accumulate; + struct in_addr alias_address; + u_short alias_port; + + alias_address = GetAliasAddress(lnk); + alias_port = GetAliasPort(lnk); + +/* Adjust ICMP checksum */ + accumulate = twowords(&ip->ip_dst); + accumulate -= twowords(&alias_address); + accumulate += ud->uh_dport; + accumulate -= alias_port; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original TCP/UDP packet was destined for. + */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + &alias_address, &pip->ip_src, 2); + pip->ip_src = alias_address; + } +/* Alias address and port number of original IP packet +fragment contained in ICMP data section */ + ip->ip_dst = alias_address; + ud->uh_dport = alias_port; + } else if (ip->ip_p == IPPROTO_ICMP) { + int accumulate; + struct in_addr alias_address; + u_short alias_id; + + alias_address = GetAliasAddress(lnk); + alias_id = GetAliasPort(lnk); + +/* Adjust ICMP checksum */ + accumulate = twowords(&ip->ip_dst); + accumulate -= twowords(&alias_address); + accumulate += ic2->icmp_id; + accumulate -= alias_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + +/* + * Alias address in IP header if it comes from the host + * the original ICMP message was destined for. 
+ */ + if (pip->ip_src.s_addr == ip->ip_dst.s_addr) { + DifferentialChecksum(&pip->ip_sum, + &alias_address, &pip->ip_src, 2); + pip->ip_src = alias_address; + } +/* Alias address of original IP packet and sequence number of + embedded ICMP datagram */ + ip->ip_dst = alias_address; + ic2->icmp_id = alias_id; + } + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + +static int +IcmpAliasOut(struct libalias *la, struct ip *pip, int create) +{ + int iresult; + struct icmp *ic; + + LIBALIAS_LOCK_ASSERT(la); + (void)create; + +/* Return if proxy-only mode is enabled */ + if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return (PKT_ALIAS_OK); + + ic = (struct icmp *)ip_next(pip); + + iresult = PKT_ALIAS_IGNORED; + switch (ic->icmp_type) { + case ICMP_ECHO: + case ICMP_TSTAMP: + if (ic->icmp_code == 0) { + iresult = IcmpAliasOut1(la, pip, create); + } + break; + case ICMP_UNREACH: + case ICMP_SOURCEQUENCH: + case ICMP_TIMXCEED: + case ICMP_PARAMPROB: + iresult = IcmpAliasOut2(la, pip); + break; + case ICMP_ECHOREPLY: + case ICMP_TSTAMPREPLY: + iresult = IcmpAliasOut1(la, pip, create); + } + return (iresult); +} + +static int +ProtoAliasIn(struct libalias *la, struct in_addr ip_src, + struct in_addr *ip_dst, u_char ip_p, u_short *ip_sum) +{ +/* + Handle incoming IP packets. The + only thing which is done in this case is to alias + the dest IP address of the packet to our inside + machine. +*/ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); +/* Return if proxy-only mode is enabled */ + if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return (PKT_ALIAS_OK); + + lnk = FindProtoIn(la, ip_src, *ip_dst, ip_p); + if (lnk != NULL) { + struct in_addr original_address; + + original_address = GetOriginalAddress(lnk); + +/* Restore original IP address */ + DifferentialChecksum(ip_sum, + &original_address, ip_dst, 2); + *ip_dst = original_address; + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + +static int +ProtoAliasOut(struct libalias *la, struct in_addr *ip_src, + struct in_addr ip_dst, u_char ip_p, u_short *ip_sum, int create) +{ +/* + Handle outgoing IP packets. The + only thing which is done in this case is to alias + the source IP address of the packet. 
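+
+Editorial note (added): links for such port-less protocols are keyed
+only on (source address, destination address, protocol number), so at
+any one time a single internal host can use a given IP protocol towards
+a given destination through one alias address.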
+*/ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + (void)create; + +/* Return if proxy-only mode is enabled */ + if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) + return (PKT_ALIAS_OK); + + lnk = FindProtoOut(la, *ip_src, ip_dst, ip_p); + if (lnk != NULL) { + struct in_addr alias_address; + + alias_address = GetAliasAddress(lnk); + +/* Change source address */ + DifferentialChecksum(ip_sum, + &alias_address, ip_src, 2); + *ip_src = alias_address; + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + +static int +UdpAliasIn(struct libalias *la, struct ip *pip) +{ + struct udphdr *ud; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + + ud = (struct udphdr *)ip_next(pip); + + lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); + if (lnk != NULL) { + struct in_addr alias_address; + struct in_addr original_address; + struct in_addr proxy_address; + u_short alias_port; + u_short proxy_port; + int accumulate; + int error; + struct alias_data ad = { + .lnk = lnk, + .oaddr = &original_address, + .aaddr = &alias_address, + .aport = &alias_port, + .sport = &ud->uh_sport, + .dport = &ud->uh_dport, + .maxpktsize = 0 + }; + + alias_address = GetAliasAddress(lnk); + original_address = GetOriginalAddress(lnk); + proxy_address = GetProxyAddress(lnk); + alias_port = ud->uh_dport; + ud->uh_dport = GetOriginalPort(lnk); + proxy_port = GetProxyPort(lnk); + + /* Walk out chain. */ + error = find_handler(IN, UDP, la, pip, &ad); + /* If we cannot figure out the packet, ignore it. */ + if (error < 0) + return (PKT_ALIAS_IGNORED); + +/* If UDP checksum is not zero, then adjust since destination port */ +/* is being unaliased and destination address is being altered. */ + if (ud->uh_sum != 0) { + accumulate = alias_port; + accumulate -= ud->uh_dport; + accumulate += twowords(&alias_address); + accumulate -= twowords(&original_address); + +/* If this is a proxy packet, modify checksum because of source change.*/ + if (proxy_port != 0) { + accumulate += ud->uh_sport; + accumulate -= proxy_port; + } + + if (proxy_address.s_addr != 0) { + accumulate += twowords(&pip->ip_src); + accumulate -= twowords(&proxy_address); + } + + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } +/* XXX: Could the two if's below be concatenated to one ? 
*/ +/* Restore source port and/or address in case of proxying*/ + + if (proxy_port != 0) + ud->uh_sport = proxy_port; + + if (proxy_address.s_addr != 0) { + DifferentialChecksum(&pip->ip_sum, + &proxy_address, &pip->ip_src, 2); + pip->ip_src = proxy_address; + } + +/* Restore original IP address */ + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_dst, 2); + pip->ip_dst = original_address; + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + +static int +UdpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) +{ + struct udphdr *ud; + struct alias_link *lnk; + struct in_addr dest_address; + struct in_addr proxy_server_address; + u_short dest_port; + u_short proxy_server_port; + int proxy_type; + int error; + + LIBALIAS_LOCK_ASSERT(la); + +/* Return if proxy-only mode is enabled and not proxyrule found.*/ + ud = (struct udphdr *)ip_next(pip); + proxy_type = ProxyCheck(la, &proxy_server_address, + &proxy_server_port, pip->ip_src, pip->ip_dst, + ud->uh_dport, pip->ip_p); + if (proxy_type == 0 && (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)) + return (PKT_ALIAS_OK); + +/* If this is a transparent proxy, save original destination, + * then alter the destination and adjust checksums */ + dest_port = ud->uh_dport; + dest_address = pip->ip_dst; + + if (proxy_type != 0) { + int accumulate; + + accumulate = twowords(&pip->ip_dst); + accumulate -= twowords(&proxy_server_address); + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + + if (ud->uh_sum != 0) { + accumulate = twowords(&pip->ip_dst); + accumulate -= twowords(&proxy_server_address); + accumulate += ud->uh_dport; + accumulate -= proxy_server_port; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } + pip->ip_dst = proxy_server_address; + ud->uh_dport = proxy_server_port; + } + lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + ud->uh_sport, ud->uh_dport, + IPPROTO_UDP, create); + if (lnk != NULL) { + u_short alias_port; + struct in_addr alias_address; + struct alias_data ad = { + .lnk = lnk, + .oaddr = NULL, + .aaddr = &alias_address, + .aport = &alias_port, + .sport = &ud->uh_sport, + .dport = &ud->uh_dport, + .maxpktsize = 0 + }; + +/* Save original destination address, if this is a proxy packet. + * Also modify packet to include destination encoding. This may + * change the size of IP header. */ + if (proxy_type != 0) { + SetProxyPort(lnk, dest_port); + SetProxyAddress(lnk, dest_address); + ProxyModify(la, lnk, pip, maxpacketsize, proxy_type); + ud = (struct udphdr *)ip_next(pip); + } + + alias_address = GetAliasAddress(lnk); + alias_port = GetAliasPort(lnk); + + /* Walk out chain. 
*/ + error = find_handler(OUT, UDP, la, pip, &ad); + +/* If UDP checksum is not zero, adjust since source port is */ +/* being aliased and source address is being altered */ + if (ud->uh_sum != 0) { + int accumulate; + + accumulate = ud->uh_sport; + accumulate -= alias_port; + accumulate += twowords(&pip->ip_src); + accumulate -= twowords(&alias_address); + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } +/* Put alias port in UDP header */ + ud->uh_sport = alias_port; + +/* Change source address */ + DifferentialChecksum(&pip->ip_sum, + &alias_address, &pip->ip_src, 2); + pip->ip_src = alias_address; + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + + +static int +TcpAliasIn(struct libalias *la, struct ip *pip) +{ + struct tcphdr *tc; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + tc = (struct tcphdr *)ip_next(pip); + + lnk = FindUdpTcpIn(la, pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, + !(la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)); + if (lnk != NULL) { + struct in_addr alias_address; + struct in_addr original_address; + struct in_addr proxy_address; + u_short alias_port; + u_short proxy_port; + int accumulate, error; + + /* + * The init of MANY vars is a bit below, but aliashandlepptpin + * seems to need the destination port that came within the + * packet and not the original one looks below [*]. + */ + + struct alias_data ad = { + .lnk = lnk, + .oaddr = NULL, + .aaddr = NULL, + .aport = NULL, + .sport = &tc->th_sport, + .dport = &tc->th_dport, + .maxpktsize = 0 + }; + + /* Walk out chain. */ + error = find_handler(IN, TCP, la, pip, &ad); + + alias_address = GetAliasAddress(lnk); + original_address = GetOriginalAddress(lnk); + proxy_address = GetProxyAddress(lnk); + alias_port = tc->th_dport; + tc->th_dport = GetOriginalPort(lnk); + proxy_port = GetProxyPort(lnk); + + /* + * Look above, if anyone is going to add find_handler AFTER + * this aliashandlepptpin/point, please redo alias_data too. + * Uncommenting the piece here below should be enough. + */ +#if 0 + struct alias_data ad = { + .lnk = lnk, + .oaddr = &original_address, + .aaddr = &alias_address, + .aport = &alias_port, + .sport = &ud->uh_sport, + .dport = &ud->uh_dport, + .maxpktsize = 0 + }; + + /* Walk out chain. */ + error = find_handler(la, pip, &ad); + if (error == EHDNOF) + printf("Protocol handler not found\n"); +#endif + +/* Adjust TCP checksum since destination port is being unaliased */ +/* and destination port is being altered. 
*/ + accumulate = alias_port; + accumulate -= tc->th_dport; + accumulate += twowords(&alias_address); + accumulate -= twowords(&original_address); + +/* If this is a proxy, then modify the TCP source port and + checksum accumulation */ + if (proxy_port != 0) { + accumulate += tc->th_sport; + tc->th_sport = proxy_port; + accumulate -= tc->th_sport; + accumulate += twowords(&pip->ip_src); + accumulate -= twowords(&proxy_address); + } +/* See if ACK number needs to be modified */ + if (GetAckModified(lnk) == 1) { + int delta; + + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaAckIn(tc->th_ack, lnk); + if (delta != 0) { + accumulate += twowords(&tc->th_ack); + tc->th_ack = htonl(ntohl(tc->th_ack) - delta); + accumulate -= twowords(&tc->th_ack); + } + } + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Restore original IP address */ + accumulate = twowords(&pip->ip_dst); + pip->ip_dst = original_address; + accumulate -= twowords(&pip->ip_dst); + +/* If this is a transparent proxy packet, then modify the source + address */ + if (proxy_address.s_addr != 0) { + accumulate += twowords(&pip->ip_src); + pip->ip_src = proxy_address; + accumulate -= twowords(&pip->ip_src); + } + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + +/* Monitor TCP connection state */ + tc = (struct tcphdr *)ip_next(pip); + TcpMonitorIn(tc->th_flags, lnk); + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + +static int +TcpAliasOut(struct libalias *la, struct ip *pip, int maxpacketsize, int create) +{ + int proxy_type, error; + u_short dest_port; + u_short proxy_server_port; + struct in_addr dest_address; + struct in_addr proxy_server_address; + struct tcphdr *tc; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + tc = (struct tcphdr *)ip_next(pip); + + if (create) + proxy_type = ProxyCheck(la, &proxy_server_address, + &proxy_server_port, pip->ip_src, pip->ip_dst, + tc->th_dport, pip->ip_p); + else + proxy_type = 0; + + if (proxy_type == 0 && (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY)) + return (PKT_ALIAS_OK); + +/* If this is a transparent proxy, save original destination, + then alter the destination and adjust checksums */ + dest_port = tc->th_dport; + dest_address = pip->ip_dst; + if (proxy_type != 0) { + int accumulate; + + accumulate = tc->th_dport; + tc->th_dport = proxy_server_port; + accumulate -= tc->th_dport; + accumulate += twowords(&pip->ip_dst); + accumulate -= twowords(&proxy_server_address); + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + accumulate = twowords(&pip->ip_dst); + pip->ip_dst = proxy_server_address; + accumulate -= twowords(&pip->ip_dst); + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + lnk = FindUdpTcpOut(la, pip->ip_src, pip->ip_dst, + tc->th_sport, tc->th_dport, + IPPROTO_TCP, create); + if (lnk == NULL) + return (PKT_ALIAS_IGNORED); + if (lnk != NULL) { + u_short alias_port; + struct in_addr alias_address; + int accumulate; + struct alias_data ad = { + .lnk = lnk, + .oaddr = NULL, + .aaddr = &alias_address, + .aport = &alias_port, + .sport = &tc->th_sport, + .dport = &tc->th_dport, + .maxpktsize = maxpacketsize + }; + +/* Save original destination address, if this is a proxy packet. + Also modify packet to include destination encoding. This may + change the size of IP header. 
*/ + if (proxy_type != 0) { + SetProxyPort(lnk, dest_port); + SetProxyAddress(lnk, dest_address); + ProxyModify(la, lnk, pip, maxpacketsize, proxy_type); + tc = (struct tcphdr *)ip_next(pip); + } +/* Get alias address and port */ + alias_port = GetAliasPort(lnk); + alias_address = GetAliasAddress(lnk); + +/* Monitor TCP connection state */ + tc = (struct tcphdr *)ip_next(pip); + TcpMonitorOut(tc->th_flags, lnk); + + /* Walk out chain. */ + error = find_handler(OUT, TCP, la, pip, &ad); + +/* Adjust TCP checksum since source port is being aliased */ +/* and source address is being altered */ + accumulate = tc->th_sport; + tc->th_sport = alias_port; + accumulate -= tc->th_sport; + accumulate += twowords(&pip->ip_src); + accumulate -= twowords(&alias_address); + +/* Modify sequence number if necessary */ + if (GetAckModified(lnk) == 1) { + int delta; + + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaSeqOut(tc->th_seq, lnk); + if (delta != 0) { + accumulate += twowords(&tc->th_seq); + tc->th_seq = htonl(ntohl(tc->th_seq) + delta); + accumulate -= twowords(&tc->th_seq); + } + } + ADJUST_CHECKSUM(accumulate, tc->th_sum); + +/* Change source address */ + accumulate = twowords(&pip->ip_src); + pip->ip_src = alias_address; + accumulate -= twowords(&pip->ip_src); + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_IGNORED); +} + + + + +/* Fragment Handling + + FragmentIn() + FragmentOut() + +The packet aliasing module has a limited ability for handling IP +fragments. If the ICMP, TCP or UDP header is in the first fragment +received, then the ID number of the IP packet is saved, and other +fragments are identified according to their ID number and IP address +they were sent from. Pointers to unresolved fragments can also be +saved and recalled when a header fragment is seen. 
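+
+Editorial sketch of the intended caller flow (added; the entry points
+are defined below, and this mirrors how natd(8) drives the library;
+the variables are illustrative):
+
+	switch (LibAliasIn(la, pkt, maxlen)) {
+	case PKT_ALIAS_UNRESOLVED_FRAGMENT:
+		LibAliasSaveFragment(la, pkt);	(hold until header seen)
+		break;
+	case PKT_ALIAS_FOUND_HEADER_FRAGMENT:
+		while ((f = LibAliasGetFragment(la, pkt)) != NULL)
+			LibAliasFragmentIn(la, pkt, f);
+		break;
+	}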
+*/ + +/* Local prototypes */ +static int FragmentIn(struct libalias *la, struct in_addr ip_src, + struct in_addr *ip_dst, u_short ip_id, u_short *ip_sum); +static int FragmentOut(struct libalias *, struct in_addr *ip_src, + u_short *ip_sum); + +static int +FragmentIn(struct libalias *la, struct in_addr ip_src, struct in_addr *ip_dst, + u_short ip_id, u_short *ip_sum) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindFragmentIn2(la, ip_src, *ip_dst, ip_id); + if (lnk != NULL) { + struct in_addr original_address; + + GetFragmentAddr(lnk, &original_address); + DifferentialChecksum(ip_sum, + &original_address, ip_dst, 2); + *ip_dst = original_address; + + return (PKT_ALIAS_OK); + } + return (PKT_ALIAS_UNRESOLVED_FRAGMENT); +} + +static int +FragmentOut(struct libalias *la, struct in_addr *ip_src, u_short *ip_sum) +{ + struct in_addr alias_address; + + LIBALIAS_LOCK_ASSERT(la); + alias_address = FindAliasAddress(la, *ip_src); + DifferentialChecksum(ip_sum, + &alias_address, ip_src, 2); + *ip_src = alias_address; + + return (PKT_ALIAS_OK); +} + + + + + + +/* Outside World Access + + PacketAliasSaveFragment() + PacketAliasGetFragment() + PacketAliasFragmentIn() + PacketAliasIn() + PacketAliasOut() + PacketUnaliasOut() + +(prototypes in alias.h) +*/ + +int +LibAliasSaveFragment(struct libalias *la, char *ptr) +{ + int iresult; + struct alias_link *lnk; + struct ip *pip; + + LIBALIAS_LOCK(la); + pip = (struct ip *)ptr; + lnk = AddFragmentPtrLink(la, pip->ip_src, pip->ip_id); + iresult = PKT_ALIAS_ERROR; + if (lnk != NULL) { + SetFragmentPtr(lnk, ptr); + iresult = PKT_ALIAS_OK; + } + LIBALIAS_UNLOCK(la); + return (iresult); +} + +char * +LibAliasGetFragment(struct libalias *la, char *ptr) +{ + struct alias_link *lnk; + char *fptr; + struct ip *pip; + + LIBALIAS_LOCK(la); + pip = (struct ip *)ptr; + lnk = FindFragmentPtr(la, pip->ip_src, pip->ip_id); + if (lnk != NULL) { + GetFragmentPtr(lnk, &fptr); + SetFragmentPtr(lnk, NULL); + SetExpire(lnk, 0); /* Deletes link */ + } else + fptr = NULL; + + LIBALIAS_UNLOCK(la); + return (fptr); +} + +void +LibAliasFragmentIn(struct libalias *la, char *ptr, /* Points to correctly + * de-aliased header + * fragment */ + char *ptr_fragment /* Points to fragment which must be + * de-aliased */ +) +{ + struct ip *pip; + struct ip *fpip; + + LIBALIAS_LOCK(la); + (void)la; + pip = (struct ip *)ptr; + fpip = (struct ip *)ptr_fragment; + + DifferentialChecksum(&fpip->ip_sum, + &pip->ip_dst, &fpip->ip_dst, 2); + fpip->ip_dst = pip->ip_dst; + LIBALIAS_UNLOCK(la); +} + +/* Local prototypes */ +static int +LibAliasOutLocked(struct libalias *la, char *ptr, + int maxpacketsize, int create); +static int +LibAliasInLocked(struct libalias *la, char *ptr, + int maxpacketsize); + +int +LibAliasIn(struct libalias *la, char *ptr, int maxpacketsize) +{ + int res; + + LIBALIAS_LOCK(la); + res = LibAliasInLocked(la, ptr, maxpacketsize); + LIBALIAS_UNLOCK(la); + return (res); +} + +static int +LibAliasInLocked(struct libalias *la, char *ptr, int maxpacketsize) +{ + struct in_addr alias_addr; + struct ip *pip; + int iresult; + + if (la->packetAliasMode & PKT_ALIAS_REVERSE) { + la->packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = LibAliasOutLocked(la, ptr, maxpacketsize, 1); + la->packetAliasMode |= PKT_ALIAS_REVERSE; + goto getout; + } + HouseKeeping(la); + ClearCheckNewLink(la); + pip = (struct ip *)ptr; + alias_addr = pip->ip_dst; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl << 2) > maxpacketsize) { + iresult 
= PKT_ALIAS_IGNORED; + goto getout; + } + + iresult = PKT_ALIAS_IGNORED; + if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) { + switch (pip->ip_p) { + case IPPROTO_ICMP: + iresult = IcmpAliasIn(la, pip); + break; + case IPPROTO_UDP: + iresult = UdpAliasIn(la, pip); + break; + case IPPROTO_TCP: + iresult = TcpAliasIn(la, pip); + break; +#ifdef _KERNEL + case IPPROTO_SCTP: + iresult = SctpAlias(la, pip, SN_TO_LOCAL); + break; +#endif + case IPPROTO_GRE: { + int error; + struct alias_data ad = { + .lnk = NULL, + .oaddr = NULL, + .aaddr = NULL, + .aport = NULL, + .sport = NULL, + .dport = NULL, + .maxpktsize = 0 + }; + + /* Walk out chain. */ + error = find_handler(IN, IP, la, pip, &ad); + if (error == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasIn(la, pip->ip_src, + &pip->ip_dst, pip->ip_p, &pip->ip_sum); + } + break; + default: + iresult = ProtoAliasIn(la, pip->ip_src, &pip->ip_dst, + pip->ip_p, &pip->ip_sum); + break; + } + + if (ntohs(pip->ip_off) & IP_MF) { + struct alias_link *lnk; + + lnk = FindFragmentIn1(la, pip->ip_src, alias_addr, pip->ip_id); + if (lnk != NULL) { + iresult = PKT_ALIAS_FOUND_HEADER_FRAGMENT; + SetFragmentAddr(lnk, pip->ip_dst); + } else { + iresult = PKT_ALIAS_ERROR; + } + } + } else { + iresult = FragmentIn(la, pip->ip_src, &pip->ip_dst, pip->ip_id, + &pip->ip_sum); + } + +getout: + return (iresult); +} + + + +/* Unregistered address ranges */ + +/* 10.0.0.0 -> 10.255.255.255 */ +#define UNREG_ADDR_A_LOWER 0x0a000000 +#define UNREG_ADDR_A_UPPER 0x0affffff + +/* 172.16.0.0 -> 172.31.255.255 */ +#define UNREG_ADDR_B_LOWER 0xac100000 +#define UNREG_ADDR_B_UPPER 0xac1fffff + +/* 192.168.0.0 -> 192.168.255.255 */ +#define UNREG_ADDR_C_LOWER 0xc0a80000 +#define UNREG_ADDR_C_UPPER 0xc0a8ffff + +int +LibAliasOut(struct libalias *la, char *ptr, int maxpacketsize) +{ + int res; + + LIBALIAS_LOCK(la); + res = LibAliasOutLocked(la, ptr, maxpacketsize, 1); + LIBALIAS_UNLOCK(la); + return (res); +} + +int +LibAliasOutTry(struct libalias *la, char *ptr, int maxpacketsize, int create) +{ + int res; + + LIBALIAS_LOCK(la); + res = LibAliasOutLocked(la, ptr, maxpacketsize, create); + LIBALIAS_UNLOCK(la); + return (res); +} + +static int +LibAliasOutLocked(struct libalias *la, char *ptr, /* valid IP packet */ + int maxpacketsize, /* How much the packet data may grow (FTP + * and IRC inline changes) */ + int create /* Create new entries ? 
*/ +) +{ + int iresult; + struct in_addr addr_save; + struct ip *pip; + + if (la->packetAliasMode & PKT_ALIAS_REVERSE) { + la->packetAliasMode &= ~PKT_ALIAS_REVERSE; + iresult = LibAliasInLocked(la, ptr, maxpacketsize); + la->packetAliasMode |= PKT_ALIAS_REVERSE; + goto getout; + } + HouseKeeping(la); + ClearCheckNewLink(la); + pip = (struct ip *)ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl << 2) > maxpacketsize) { + iresult = PKT_ALIAS_IGNORED; + goto getout; + } + + addr_save = GetDefaultAliasAddress(la); + if (la->packetAliasMode & PKT_ALIAS_UNREGISTERED_ONLY) { + u_long addr; + int iclass; + + iclass = 0; + addr = ntohl(pip->ip_src.s_addr); + if (addr >= UNREG_ADDR_C_LOWER && addr <= UNREG_ADDR_C_UPPER) + iclass = 3; + else if (addr >= UNREG_ADDR_B_LOWER && addr <= UNREG_ADDR_B_UPPER) + iclass = 2; + else if (addr >= UNREG_ADDR_A_LOWER && addr <= UNREG_ADDR_A_UPPER) + iclass = 1; + + if (iclass == 0) { + SetDefaultAliasAddress(la, pip->ip_src); + } + } else if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY) { + SetDefaultAliasAddress(la, pip->ip_src); + } + iresult = PKT_ALIAS_IGNORED; + if ((ntohs(pip->ip_off) & IP_OFFMASK) == 0) { + switch (pip->ip_p) { + case IPPROTO_ICMP: + iresult = IcmpAliasOut(la, pip, create); + break; + case IPPROTO_UDP: + iresult = UdpAliasOut(la, pip, maxpacketsize, create); + break; + case IPPROTO_TCP: + iresult = TcpAliasOut(la, pip, maxpacketsize, create); + break; +#ifdef _KERNEL + case IPPROTO_SCTP: + iresult = SctpAlias(la, pip, SN_TO_GLOBAL); + break; +#endif + case IPPROTO_GRE: { + int error; + struct alias_data ad = { + .lnk = NULL, + .oaddr = NULL, + .aaddr = NULL, + .aport = NULL, + .sport = NULL, + .dport = NULL, + .maxpktsize = 0 + }; + /* Walk out chain. 
*/ + error = find_handler(OUT, IP, la, pip, &ad); + if (error == 0) + iresult = PKT_ALIAS_OK; + else + iresult = ProtoAliasOut(la, &pip->ip_src, + pip->ip_dst, pip->ip_p, &pip->ip_sum, create); + } + break; + default: + iresult = ProtoAliasOut(la, &pip->ip_src, + pip->ip_dst, pip->ip_p, &pip->ip_sum, create); + break; + } + } else { + iresult = FragmentOut(la, &pip->ip_src, &pip->ip_sum); + } + + SetDefaultAliasAddress(la, addr_save); +getout: + return (iresult); +} + +int +LibAliasUnaliasOut(struct libalias *la, char *ptr, /* valid IP packet */ + int maxpacketsize /* for error checking */ +) +{ + struct ip *pip; + struct icmp *ic; + struct udphdr *ud; + struct tcphdr *tc; + struct alias_link *lnk; + int iresult = PKT_ALIAS_IGNORED; + + LIBALIAS_LOCK(la); + pip = (struct ip *)ptr; + + /* Defense against mangled packets */ + if (ntohs(pip->ip_len) > maxpacketsize + || (pip->ip_hl << 2) > maxpacketsize) + goto getout; + + ud = (struct udphdr *)ip_next(pip); + tc = (struct tcphdr *)ip_next(pip); + ic = (struct icmp *)ip_next(pip); + + /* Find a link */ + if (pip->ip_p == IPPROTO_UDP) + lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + ud->uh_dport, ud->uh_sport, + IPPROTO_UDP, 0); + else if (pip->ip_p == IPPROTO_TCP) + lnk = FindUdpTcpIn(la, pip->ip_dst, pip->ip_src, + tc->th_dport, tc->th_sport, + IPPROTO_TCP, 0); + else if (pip->ip_p == IPPROTO_ICMP) + lnk = FindIcmpIn(la, pip->ip_dst, pip->ip_src, ic->icmp_id, 0); + else + lnk = NULL; + + /* Change it from an aliased packet to an unaliased packet */ + if (lnk != NULL) { + if (pip->ip_p == IPPROTO_UDP || pip->ip_p == IPPROTO_TCP) { + int accumulate; + struct in_addr original_address; + u_short original_port; + + original_address = GetOriginalAddress(lnk); + original_port = GetOriginalPort(lnk); + + /* Adjust TCP/UDP checksum */ + accumulate = twowords(&pip->ip_src); + accumulate -= twowords(&original_address); + + if (pip->ip_p == IPPROTO_UDP) { + accumulate += ud->uh_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, ud->uh_sum); + } else { + accumulate += tc->th_sport; + accumulate -= original_port; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + } + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_src, 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + if (pip->ip_p == IPPROTO_UDP) + ud->uh_sport = original_port; + else + tc->th_sport = original_port; + + iresult = PKT_ALIAS_OK; + + } else if (pip->ip_p == IPPROTO_ICMP) { + + int accumulate; + struct in_addr original_address; + u_short original_id; + + original_address = GetOriginalAddress(lnk); + original_id = GetOriginalPort(lnk); + + /* Adjust ICMP checksum */ + accumulate = twowords(&pip->ip_src); + accumulate -= twowords(&original_address); + accumulate += ic->icmp_id; + accumulate -= original_id; + ADJUST_CHECKSUM(accumulate, ic->icmp_cksum); + + /* Adjust IP checksum */ + DifferentialChecksum(&pip->ip_sum, + &original_address, &pip->ip_src, 2); + + /* Un-alias source address and port number */ + pip->ip_src = original_address; + ic->icmp_id = original_id; + + iresult = PKT_ALIAS_OK; + } + } +getout: + LIBALIAS_UNLOCK(la); + return (iresult); + +} + +#ifndef _KERNEL + +int +LibAliasRefreshModules(void) +{ + char buf[256], conf[] = "/etc/libalias.conf"; + FILE *fd; + int i, len; + + fd = fopen(conf, "r"); + if (fd == NULL) + err(1, "fopen(%s)", conf); + + LibAliasUnLoadAllModule(); + + for (;;) { + fgets(buf, 256, fd); + if (feof(fd)) + break; + len = strlen(buf); + if (len > 1) { + 
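+			/*
+			 * Editorial note (added): each non-comment line of
+			 * /etc/libalias.conf is expected to hold the path of
+			 * one loadable handler module, e.g. (illustrative
+			 * paths):
+			 *
+			 *	/lib/libalias_ftp.so
+			 *	/lib/libalias_irc.so
+			 */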
for (i = 0; i < len; i++) + if (!isspace(buf[i])) + break; + if (buf[i] == '#') + continue; + buf[len - 1] = '\0'; + LibAliasLoadModule(buf); + } + } + fclose(fd); + return (0); +} + +int +LibAliasLoadModule(char *path) +{ + struct dll *t; + void *handle; + struct proto_handler *m; + const char *error; + moduledata_t *p; + + handle = dlopen (path, RTLD_LAZY); + if (!handle) { + fprintf(stderr, "%s\n", dlerror()); + return (EINVAL); + } + + p = dlsym(handle, "alias_mod"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "%s\n", dlerror()); + return (EINVAL); + } + + t = malloc(sizeof(struct dll)); + if (t == NULL) + return (ENOMEM); + strncpy(t->name, p->name, DLL_LEN); + t->handle = handle; + if (attach_dll(t) == EEXIST) { + free(t); + fprintf(stderr, "dll conflict\n"); + return (EEXIST); + } + + m = dlsym(t->handle, "handlers"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "%s\n", error); + return (EINVAL); + } + + LibAliasAttachHandlers(m); + return (0); +} + +int +LibAliasUnLoadAllModule(void) +{ + struct dll *t; + struct proto_handler *p; + + /* Unload all modules then reload everything. */ + while ((p = first_handler()) != NULL) { + detach_handler(p); + } + while ((t = walk_dll_chain()) != NULL) { + dlclose(t->handle); + free(t); + } + return (1); +} + +#endif + +#ifdef _KERNEL +/* + * m_megapullup() - this function is a big hack. + * Thankfully, it's only used in ng_nat and ipfw+nat. + * + * It allocates an mbuf with cluster and copies the specified part of the chain + * into cluster, so that it is all contiguous and can be accessed via a plain + * (char *) pointer. This is required, because libalias doesn't know how to + * handle mbuf chains. + * + * On success, m_megapullup returns an mbuf (possibly with cluster) containing + * the input packet, on failure NULL. The input packet is always consumed. + */ +struct mbuf * +m_megapullup(struct mbuf *m, int len) { + struct mbuf *mcl; + + if (len > m->m_pkthdr.len) + goto bad; + + /* Do not reallocate packet if it is sequentional, + * writable and has some extra space for expansion. + * XXX: Constant 100bytes is completely empirical. */ +#define RESERVE 100 + if (m->m_next == NULL && M_WRITABLE(m) && M_TRAILINGSPACE(m) >= RESERVE) + return (m); + + if (len <= MCLBYTES - RESERVE) { + mcl = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + } else if (len < MJUM16BYTES) { + int size; + if (len <= MJUMPAGESIZE - RESERVE) { + size = MJUMPAGESIZE; + } else if (len <= MJUM9BYTES - RESERVE) { + size = MJUM9BYTES; + } else { + size = MJUM16BYTES; + }; + mcl = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size); + } else { + goto bad; + } + if (mcl == NULL) + goto bad; + + m_move_pkthdr(mcl, m); + m_copydata(m, 0, len, mtod(mcl, caddr_t)); + mcl->m_len = mcl->m_pkthdr.len = len; + m_freem(m); + + return (mcl); +bad: + m_freem(m); + return (NULL); +} +#endif diff --git a/freebsd/sys/netinet/libalias/alias.h b/freebsd/sys/netinet/libalias/alias.h new file mode 100644 index 00000000..f835e1b7 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias.h @@ -0,0 +1,232 @@ +/* lint -save -library Flexelint comment for external headers */ + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Alias.h defines the outside world interfaces for the packet aliasing + * software. + * + * This software is placed into the public domain with no restrictions on its + * distribution. + */ + +#ifndef _ALIAS_HH_ +#define _ALIAS_HH_ + +#include +#include +#include + +#define LIBALIAS_BUF_SIZE 128 +#ifdef _KERNEL +/* + * The kernel version of libalias does not support these features. + */ +#define NO_FW_PUNCH +#define NO_USE_SOCKETS +#endif + +/* + * The external interface to libalias, the packet aliasing engine. + * + * There are two sets of functions: + * + * PacketAlias*() the old API which doesn't take an instance pointer + * and therefore can only have one packet engine at a time. + * + * LibAlias*() the new API which takes as first argument a pointer to + * the instance of the packet aliasing engine. + * + * The functions otherwise correspond to each other one for one, except + * for the LibAliasUnaliasOut()/PacketUnaliasOut() function which were + * were misnamed in the old API. + */ + +/* + * The instance structure + */ +struct libalias; + +/* + * An anonymous structure, a pointer to which is returned from + * PacketAliasRedirectAddr(), PacketAliasRedirectPort() or + * PacketAliasRedirectProto(), passed to PacketAliasAddServer(), + * and freed by PacketAliasRedirectDelete(). + */ +struct alias_link; + +/* Initialization and control functions. */ +struct libalias *LibAliasInit(struct libalias *); +void LibAliasSetAddress(struct libalias *, struct in_addr _addr); +void LibAliasSetFWBase(struct libalias *, unsigned int _base, unsigned int _num); +void LibAliasSetSkinnyPort(struct libalias *, unsigned int _port); +unsigned int + LibAliasSetMode(struct libalias *, unsigned int _flags, unsigned int _mask); +void LibAliasUninit(struct libalias *); + +/* Packet Handling functions. */ +int LibAliasIn (struct libalias *, char *_ptr, int _maxpacketsize); +int LibAliasOut(struct libalias *, char *_ptr, int _maxpacketsize); +int LibAliasOutTry(struct libalias *, char *_ptr, int _maxpacketsize, int _create); +int LibAliasUnaliasOut(struct libalias *, char *_ptr, int _maxpacketsize); + +/* Port and address redirection functions. 
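 *
 * Illustrative use (a sketch, not from the original sources; the
 * variables public_ip, server_ip, any_addr -- an INADDR_ANY address --
 * and the packet buffer pkt are hypothetical): publish an internal
 * web server on port 8080 of the aliasing address.
 *
 *	struct libalias *la = LibAliasInit(NULL);
 *
 *	LibAliasSetAddress(la, public_ip);
 *	LibAliasRedirectPort(la, server_ip, htons(80),
 *	    any_addr, 0, public_ip, htons(8080), IPPROTO_TCP);
 *	...
 *	LibAliasIn(la, pkt, maxpacketsize);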
 */
+
+int
+LibAliasAddServer(struct libalias *, struct alias_link *_lnk,
+    struct in_addr _addr, unsigned short _port);
+struct alias_link *
+LibAliasRedirectAddr(struct libalias *, struct in_addr _src_addr,
+    struct in_addr _alias_addr);
+int LibAliasRedirectDynamic(struct libalias *, struct alias_link *_lnk);
+void LibAliasRedirectDelete(struct libalias *, struct alias_link *_lnk);
+struct alias_link *
+LibAliasRedirectPort(struct libalias *, struct in_addr _src_addr,
+    unsigned short _src_port, struct in_addr _dst_addr,
+    unsigned short _dst_port, struct in_addr _alias_addr,
+    unsigned short _alias_port, unsigned char _proto);
+struct alias_link *
+LibAliasRedirectProto(struct libalias *, struct in_addr _src_addr,
+    struct in_addr _dst_addr, struct in_addr _alias_addr,
+    unsigned char _proto);
+
+/* Fragment Handling functions. */
+void LibAliasFragmentIn(struct libalias *, char *_ptr, char *_ptr_fragment);
+char *LibAliasGetFragment(struct libalias *, char *_ptr);
+int LibAliasSaveFragment(struct libalias *, char *_ptr);
+
+/* Miscellaneous functions. */
+int LibAliasCheckNewLink(struct libalias *);
+unsigned short
+	LibAliasInternetChecksum(struct libalias *, unsigned short *_ptr, int _nbytes);
+void LibAliasSetTarget(struct libalias *, struct in_addr _target_addr);
+
+/* Transparent proxying routines. */
+int LibAliasProxyRule(struct libalias *, const char *_cmd);
+
+/* Module handling API */
+int LibAliasLoadModule(char *);
+int LibAliasUnLoadAllModule(void);
+int LibAliasRefreshModules(void);
+
+/* Mbuf helper function. */
+struct mbuf *m_megapullup(struct mbuf *, int);
+
+/*
+ * Mode flags and other constants.
+ */
+
+/* Mode flags, set using PacketAliasSetMode() */
+
+/*
+ * If PKT_ALIAS_LOG is set, a message will be printed to /var/log/alias.log
+ * every time a link is created or deleted. This is useful for debugging.
+ */
+#define PKT_ALIAS_LOG 0x01
+
+/*
+ * If PKT_ALIAS_DENY_INCOMING is set, then incoming connections (e.g. to
+ * ftp, telnet or web servers) will be prevented by the aliasing mechanism.
+ */
+#define PKT_ALIAS_DENY_INCOMING 0x02
+
+/*
+ * If PKT_ALIAS_SAME_PORTS is set, an attempt will be made to send packets
+ * from the same port they originated on. This allows e.g. rsh to work *99%
+ * of the time*, but _not_ 100% (it will be slightly flaky instead of not
+ * working at all). This mode bit is set by PacketAliasInit(), so it is a
+ * default mode of operation.
+ */
+#define PKT_ALIAS_SAME_PORTS 0x04
+
+/*
+ * If PKT_ALIAS_USE_SOCKETS is set, then for partially specified links (e.g.
+ * when the destination port and/or address is zero), the packet aliasing
+ * engine will attempt to allocate a socket for the aliasing port it chooses.
+ * This will avoid interference with the host machine. Fully specified links
+ * do not require this. This bit is set after a call to PacketAliasInit(),
+ * so it is a default mode of operation.
+ */
+#ifndef NO_USE_SOCKETS
+#define PKT_ALIAS_USE_SOCKETS 0x08
+#endif
+/*-
+ * If PKT_ALIAS_UNREGISTERED_ONLY is set, then only packets with
+ * unregistered (private) source addresses will be aliased, i.e.
+ * addresses in the following ranges:
+ *
+ *	10.0.0.0     ->  10.255.255.255
+ *	172.16.0.0   ->  172.31.255.255
+ *	192.168.0.0  ->  192.168.255.255
+ */
+#define PKT_ALIAS_UNREGISTERED_ONLY 0x10
+
+/*
+ * If PKT_ALIAS_RESET_ON_ADDR_CHANGE is set, then the table of dynamic
+ * aliasing links will be reset whenever PacketAliasSetAddress() changes the
+ * default aliasing address.
If the default aliasing address is left + * unchanged by this function call, then the table of dynamic aliasing links + * will be left intact. This bit is set after a call to PacketAliasInit(). + */ +#define PKT_ALIAS_RESET_ON_ADDR_CHANGE 0x20 + +#ifndef NO_FW_PUNCH +/* + * If PKT_ALIAS_PUNCH_FW is set, active FTP and IRC DCC connections will + * create a 'hole' in the firewall to allow the transfers to work. The + * ipfw rule number that the hole is created with is controlled by + * PacketAliasSetFWBase(). The hole will be attached to that + * particular alias_link, so when the link goes away the hole is deleted. + */ +#define PKT_ALIAS_PUNCH_FW 0x100 +#endif + +/* + * If PKT_ALIAS_PROXY_ONLY is set, then NAT will be disabled and only + * transparent proxying is performed. + */ +#define PKT_ALIAS_PROXY_ONLY 0x40 + +/* + * If PKT_ALIAS_REVERSE is set, the actions of PacketAliasIn() and + * PacketAliasOut() are reversed. + */ +#define PKT_ALIAS_REVERSE 0x80 + +/* Function return codes. */ +#define PKT_ALIAS_ERROR -1 +#define PKT_ALIAS_OK 1 +#define PKT_ALIAS_IGNORED 2 +#define PKT_ALIAS_UNRESOLVED_FRAGMENT 3 +#define PKT_ALIAS_FOUND_HEADER_FRAGMENT 4 + +#endif /* !_ALIAS_HH_ */ + +/* lint -restore */ diff --git a/freebsd/sys/netinet/libalias/alias_cuseeme.c b/freebsd/sys/netinet/libalias/alias_cuseeme.c new file mode 100644 index 00000000..90f2aaae --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_cuseeme.c @@ -0,0 +1,230 @@ +#include + +/*- + * Copyright (c) 1998 Brian Somers + * with the aid of code written by + * Junichi SATOH 1996, 1997. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#endif + +#define CUSEEME_PORT_NUMBER 7648 + +static void +AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip, + struct alias_link *lnk); + +static void +AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip, + struct in_addr original_addr); + +static int +fingerprint(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->oaddr == NULL) + return (-1); + if (ntohs(*ah->dport) == CUSEEME_PORT_NUMBER) + return (0); + return (-1); +} + +static int +protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandleCUSeeMeIn(la, pip, *ah->oaddr); + return (0); +} + +static int +protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandleCUSeeMeOut(la, pip, ah->lnk); + return (0); +} + +/* Kernel module definition. */ +struct proto_handler handlers[] = { + { + .pri = 120, + .dir = OUT, + .proto = UDP, + .fingerprint = &fingerprint, + .protohandler = &protohandlerout + }, + { + .pri = 120, + .dir = IN, + .proto = UDP, + .fingerprint = &fingerprint, + .protohandler = &protohandlerin + }, + { EOH } +}; + +static int +mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + LibAliasAttachHandlers(handlers); + break; + case MOD_UNLOAD: + error = 0; + LibAliasDetachHandlers(handlers); + break; + default: + error = EINVAL; + } + return (error); +} + +#ifdef _KERNEL +static +#endif +moduledata_t +alias_mod = { + "alias_cuseeme", mod_handler, NULL +}; + +#ifdef _KERNEL +DECLARE_MODULE(alias_cuseeme, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_VERSION(alias_cuseeme, 1); +MODULE_DEPEND(alias_cuseeme, libalias, 1, 1, 1); +#endif + +/* CU-SeeMe Data Header */ +struct cu_header { + u_int16_t dest_family; + u_int16_t dest_port; + u_int32_t dest_addr; + int16_t family; + u_int16_t port; + u_int32_t addr; + u_int32_t seq; + u_int16_t msg; + u_int16_t data_type; + u_int16_t packet_len; +}; + +/* Open Continue Header */ +struct oc_header { + u_int16_t client_count; /* Number of client info structs */ + u_int32_t seq_no; + char user_name [20]; + char reserved [4]; /* flags, version stuff, etc */ +}; + +/* client info structures */ +struct client_info { + u_int32_t address;/* Client address */ + char reserved [8]; /* Flags, pruning bitfield, packet + * counts etc */ +}; + +static void +AliasHandleCUSeeMeOut(struct libalias *la, struct ip *pip, struct alias_link *lnk) +{ + struct udphdr *ud = ip_next(pip); + + if (ntohs(ud->uh_ulen) - sizeof(struct udphdr) >= sizeof(struct cu_header)) { + struct cu_header *cu; + struct alias_link *cu_lnk; + + cu = udp_next(ud); + if (cu->addr) + cu->addr = (u_int32_t) GetAliasAddress(lnk).s_addr; + + cu_lnk = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), + ud->uh_dport, 0, IPPROTO_UDP, 1); + +#ifndef NO_FW_PUNCH + if (cu_lnk) + PunchFWHole(cu_lnk); +#endif + } +} + +static void +AliasHandleCUSeeMeIn(struct libalias *la, struct ip *pip, struct in_addr original_addr) +{ + struct in_addr alias_addr; + struct udphdr *ud; + struct cu_header *cu; + struct oc_header *oc; + struct client_info *ci; + char *end; + int i; + + (void)la; + alias_addr.s_addr = pip->ip_dst.s_addr; + ud = ip_next(pip); + cu = udp_next(ud); + oc = (struct oc_header *)(cu + 1); + ci = (struct client_info 
*)(oc + 1); + end = (char *)ud + ntohs(ud->uh_ulen); + + if ((char *)oc <= end) { + if (cu->dest_addr) + cu->dest_addr = (u_int32_t) original_addr.s_addr; + if (ntohs(cu->data_type) == 101) + /* Find and change our address */ + for (i = 0; (char *)(ci + 1) <= end && i < oc->client_count; i++, ci++) + if (ci->address == (u_int32_t) alias_addr.s_addr) { + ci->address = (u_int32_t) original_addr.s_addr; + break; + } + } +} diff --git a/freebsd/sys/netinet/libalias/alias_db.c b/freebsd/sys/netinet/libalias/alias_db.c new file mode 100644 index 00000000..4b003366 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_db.c @@ -0,0 +1,2940 @@ +#include + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + Alias_db.c encapsulates all data structures used for storing + packet aliasing data. Other parts of the aliasing software + access data through functions provided in this file. + + Data storage is based on the notion of a "link", which is + established for ICMP echo/reply packets, UDP datagrams and + TCP stream connections. A link stores the original source + and destination addresses. For UDP and TCP, it also stores + source and destination port numbers, as well as an alias + port number. Links are also used to store information about + fragments. + + There is a facility for sweeping through and deleting old + links as new packets are sent through. A simple timeout is + used for ICMP and UDP links. TCP links are left alone unless + there is an incomplete connection, in which case the link + can be deleted after a certain amount of time. + + + Initial version: August, 1996 (cjm) + + Version 1.4: September 16, 1996 (cjm) + Facility for handling incoming links added. + + Version 1.6: September 18, 1996 (cjm) + ICMP data handling simplified. + + Version 1.7: January 9, 1997 (cjm) + Fragment handling simplified. + Saves pointers for unresolved fragments. + Permits links for unspecified remote ports + or unspecified remote addresses. + Fixed bug which did not properly zero port + table entries after a link was deleted. + Cleaned up some obsolete comments. + + Version 1.8: January 14, 1997 (cjm) + Fixed data type error in StartPoint(). 
+	(This error did not exist prior to v1.7
+	and was discovered and fixed by Ari Suutari)
+
+	Version 1.9: February 1, 1997
+	Optionally, connections initiated from the packet aliasing host
+	machine will not have their port number aliased unless it
+	conflicts with an aliasing port already being used. (cjm)
+
+	All options earlier being #ifdef'ed are now available through
+	a new interface, SetPacketAliasMode(). This allows run time
+	control (which is now available in PPP+pktAlias through the
+	'alias' keyword). (ee)
+
+	Added ability to create an alias port without
+	either destination address or port specified.
+	port type = ALIAS_PORT_UNKNOWN_DEST_ALL (ee)
+
+	Removed K&R style function headers
+	and general cleanup. (ee)
+
+	Added packetAliasMode to replace compiler #defines (ee)
+
+	Allocates sockets for partially specified
+	ports if ALIAS_USE_SOCKETS defined. (cjm)
+
+	Version 2.0: March, 1997
+	SetAliasAddress() will now clean up alias links
+	if the aliasing address is changed. (cjm)
+
+	PacketAliasPermanentLink() function added to support permanent
+	links. (J. Fortes suggested the need for this.)
+	Examples:
+
+	(192.168.0.1, port 23)  <->  alias port 6002, unknown dest addr/port
+
+	(192.168.0.2, port 21)  <->  alias port 3604, known dest addr
+	                             unknown dest port
+
+	These permanent links allow for incoming connections to
+	machines on the local network. They can be given with a
+	user-chosen amount of specificity, with increasing specificity
+	meaning more security. (cjm)
+
+	Quite a bit of rework to the basic engine. The portTable[]
+	array, which kept track of which ports were in use, was replaced
+	by a table/linked list structure. (cjm)
+
+	SetExpire() function added. (cjm)
+
+	DeleteLink() no longer frees memory associated with a pointer
+	to a fragment (this bug was first recognized by E. Eklund in
+	v1.9).
+
+	Version 2.1: May, 1997 (cjm)
+	Packet aliasing engine reworked so that it can handle
+	multiple external addresses rather than just a single
+	host address.
+
+	PacketAliasRedirectPort() and PacketAliasRedirectAddr()
+	added to the API. The first function is a more generalized
+	version of PacketAliasPermanentLink(). The second function
+	implements static network address translation.
+
+	Version 3.2: July, 2000 (salander and satoh)
+	Added FindNewPortGroup to get a contiguous range of port values.
+
+	Added QueryUdpTcpIn and QueryUdpTcpOut to look for an aliasing
+	link but not actually add one.
+
+	Added FindRtspOut, which is closely derived from FindUdpTcpOut,
+	except that the alias port (from FindNewPortGroup) is provided
+	as input.
+
+	See HISTORY file for additional revisions.
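+
+	Illustration of the "link" notion described at the top of
+	this comment (addresses and ports are hypothetical): an
+	outgoing TCP segment 10.0.0.5:1234 -> 198.51.100.7:80
+	creates a link recording (src 10.0.0.5:1234,
+	dst 198.51.100.7:80, alias port N); the reply addressed to
+	the aliasing address at port N is then matched by the
+	incoming lookup and rewritten back to 10.0.0.5:1234.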
+*/
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#endif
+
+static LIST_HEAD(, libalias) instancehead = LIST_HEAD_INITIALIZER(instancehead);
+
+/*
+	Constants (note: constants are also defined
+	near relevant functions or structs)
+*/
+
+/* Parameters used for cleanup of expired links */
+/* NOTE: ALIAS_CLEANUP_INTERVAL_SECS must be less than LINK_TABLE_OUT_SIZE */
+#define ALIAS_CLEANUP_INTERVAL_SECS 64
+#define ALIAS_CLEANUP_MAX_SPOKES (LINK_TABLE_OUT_SIZE/5)
+
+/* Timeouts (in seconds) for different link types */
+#define ICMP_EXPIRE_TIME 60
+#define UDP_EXPIRE_TIME 60
+#define PROTO_EXPIRE_TIME 60
+#define FRAGMENT_ID_EXPIRE_TIME 10
+#define FRAGMENT_PTR_EXPIRE_TIME 30
+
+/* TCP link expire time for different cases */
+/* When the link has been used and closed - minimal grace time to
+   allow ACKs and a potential re-connect in FTP (XXX - is this allowed?) */
+#ifndef TCP_EXPIRE_DEAD
+#define TCP_EXPIRE_DEAD 10
+#endif
+
+/* When the link has been used and closed on one side - the other side
+   is still allowed to send data */
+#ifndef TCP_EXPIRE_SINGLEDEAD
+#define TCP_EXPIRE_SINGLEDEAD 90
+#endif
+
+/* When the link isn't yet up */
+#ifndef TCP_EXPIRE_INITIAL
+#define TCP_EXPIRE_INITIAL 300
+#endif
+
+/* When the link is up */
+#ifndef TCP_EXPIRE_CONNECTED
+#define TCP_EXPIRE_CONNECTED 86400
+#endif
+
+/* Dummy port number codes used for FindLinkIn/Out() and AddLink().
+   These constants can be anything except zero, which indicates an
+   unknown port number. */
+
+#define NO_DEST_PORT 1
+#define NO_SRC_PORT 1
+
+/* Data Structures
+
+	The fundamental data structure used in this program is
+	"struct alias_link". Whenever a TCP connection is made,
+	a UDP datagram is sent out, or an ICMP echo request is made,
+	a link record is made (if it has not already been created).
+	The link record is identified by the source address/port
+	and the destination address/port. In the case of an ICMP
+	echo request, the source port is treated as being equivalent
+	to the 16-bit ID number of the ICMP packet.
+
+	The link record can also store some auxiliary data. For
+	TCP connections that have had sequence and acknowledgment
+	modifications, data space is available to track these changes.
+	A state field is used to keep track of changes to the TCP
+	connection state. ID numbers of fragments can also be
+	stored in the auxiliary space. Pointers to unresolved
+	fragments can also be stored.
+
+	The link records support two independent chainings. The
+	input and output lookup tables hold the initial pointers
+	to the link chains. On input, the lookup table indexes on
+	alias port and link type. On output, the lookup table
+	indexes on source address, destination address, source
+	port, destination port and link type.
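+
+	(Concretely: the two chainings are the list_out/list_in
+	LIST_ENTRYs of struct alias_link below, and StartPointOut()
+	and StartPointIn() hash the respective key fields into the
+	linkTableOut[]/linkTableIn[] buckets.)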
+*/
+
+struct ack_data_record {	/* used to save changes to ACK/sequence
+				 * numbers */
+	u_long ack_old;
+	u_long ack_new;
+	int delta;
+	int active;
+};
+
+struct tcp_state {		/* Information about TCP connection */
+	int in;			/* State for outside -> inside */
+	int out;		/* State for inside -> outside */
+	int index;		/* Index to ACK data array */
+	int ack_modified;	/* Indicates whether ACK and sequence
+				 * numbers have been modified */
+};
+
+#define N_LINK_TCP_DATA 3	/* Number of distinct ACK number changes
+				 * saved for a modified TCP stream */
+struct tcp_dat {
+	struct tcp_state state;
+	struct ack_data_record ack[N_LINK_TCP_DATA];
+	int fwhole;		/* Which firewall record is used for this
+				 * hole? */
+};
+
+struct server {			/* LSNAT server pool (circular list) */
+	struct in_addr addr;
+	u_short port;
+	struct server *next;
+};
+
+struct alias_link {		/* Main data structure */
+	struct libalias *la;
+	struct in_addr src_addr;	/* Address and port information */
+	struct in_addr dst_addr;
+	struct in_addr alias_addr;
+	struct in_addr proxy_addr;
+	u_short src_port;
+	u_short dst_port;
+	u_short alias_port;
+	u_short proxy_port;
+	struct server *server;
+
+	int link_type;		/* Type of link: TCP, UDP, ICMP,
+				 * proto, frag */
+
+/* values for link_type */
+#define LINK_ICMP IPPROTO_ICMP
+#define LINK_UDP IPPROTO_UDP
+#define LINK_TCP IPPROTO_TCP
+#define LINK_FRAGMENT_ID (IPPROTO_MAX + 1)
+#define LINK_FRAGMENT_PTR (IPPROTO_MAX + 2)
+#define LINK_ADDR (IPPROTO_MAX + 3)
+#define LINK_PPTP (IPPROTO_MAX + 4)
+
+	int flags;		/* indicates special characteristics */
+	int pflags;		/* protocol-specific flags */
+
+/* flag bits */
+#define LINK_UNKNOWN_DEST_PORT 0x01
+#define LINK_UNKNOWN_DEST_ADDR 0x02
+#define LINK_PERMANENT 0x04
+#define LINK_PARTIALLY_SPECIFIED 0x03	/* logical-or of first two bits */
+#define LINK_UNFIREWALLED 0x08
+
+	int timestamp;		/* Time link was last accessed */
+	int expire_time;	/* Expire time for link */
+#ifndef NO_USE_SOCKETS
+	int sockfd;		/* socket descriptor */
+#endif
+	LIST_ENTRY (alias_link) list_out;	/* Chaining for the output
+						 * lookup table */
+	LIST_ENTRY (alias_link) list_in;	/* Chaining for the input
+						 * lookup table */
+
+	union {			/* Auxiliary data */
+		char *frag_ptr;
+		struct in_addr frag_addr;
+		struct tcp_dat *tcp;
+	} data;
+};
+
+/* Clean up procedure. */
+static void finishoff(void);
+
+/* Kernel module definition.
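+ * On MOD_LOAD the protocol-handler chain is initialized; MOD_QUIESCE
+ * and MOD_UNLOAD destroy the handler chain and call finishoff().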
*/ +#ifdef _KERNEL +MALLOC_DEFINE(M_ALIAS, "libalias", "packet aliasing"); + +MODULE_VERSION(libalias, 1); + +static int +alias_mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + handler_chain_init(); + break; + case MOD_QUIESCE: + case MOD_UNLOAD: + handler_chain_destroy(); + finishoff(); + error = 0; + break; + default: + error = EINVAL; + } + + return (error); +} + +static moduledata_t alias_mod = { + "alias", alias_mod_handler, NULL +}; + +DECLARE_MODULE(alias, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +#endif + +/* Internal utility routines (used only in alias_db.c) + +Lookup table starting points: + StartPointIn() -- link table initial search point for + incoming packets + StartPointOut() -- link table initial search point for + outgoing packets + +Miscellaneous: + SeqDiff() -- difference between two TCP sequences + ShowAliasStats() -- send alias statistics to a monitor file +*/ + + +/* Local prototypes */ +static u_int StartPointIn(struct in_addr, u_short, int); + +static u_int +StartPointOut(struct in_addr, struct in_addr, + u_short, u_short, int); + +static int SeqDiff(u_long, u_long); + +#ifndef NO_FW_PUNCH +/* Firewall control */ +static void InitPunchFW(struct libalias *); +static void UninitPunchFW(struct libalias *); +static void ClearFWHole(struct alias_link *); + +#endif + +/* Log file control */ +static void ShowAliasStats(struct libalias *); +static int InitPacketAliasLog(struct libalias *); +static void UninitPacketAliasLog(struct libalias *); + +void SctpShowAliasStats(struct libalias *la); + +static u_int +StartPointIn(struct in_addr alias_addr, + u_short alias_port, + int link_type) +{ + u_int n; + + n = alias_addr.s_addr; + if (link_type != LINK_PPTP) + n += alias_port; + n += link_type; + return (n % LINK_TABLE_IN_SIZE); +} + + +static u_int +StartPointOut(struct in_addr src_addr, struct in_addr dst_addr, + u_short src_port, u_short dst_port, int link_type) +{ + u_int n; + + n = src_addr.s_addr; + n += dst_addr.s_addr; + if (link_type != LINK_PPTP) { + n += src_port; + n += dst_port; + } + n += link_type; + + return (n % LINK_TABLE_OUT_SIZE); +} + + +static int +SeqDiff(u_long x, u_long y) +{ +/* Return the difference between two TCP sequence numbers */ + +/* + This function is encapsulated in case there are any unusual + arithmetic conditions that need to be considered. +*/ + + return (ntohl(y) - ntohl(x)); +} + +#ifdef _KERNEL + +static void +AliasLog(char *str, const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + vsnprintf(str, LIBALIAS_BUF_SIZE, format, ap); + va_end(ap); +} +#else +static void +AliasLog(FILE *stream, const char *format, ...) 
+{ + va_list ap; + + va_start(ap, format); + vfprintf(stream, format, ap); + va_end(ap); + fflush(stream); +} +#endif + +static void +ShowAliasStats(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); +/* Used for debugging */ + if (la->logDesc) { + int tot = la->icmpLinkCount + la->udpLinkCount + + (la->sctpLinkCount>>1) + /* sctp counts half associations */ + la->tcpLinkCount + la->pptpLinkCount + + la->protoLinkCount + la->fragmentIdLinkCount + + la->fragmentPtrLinkCount; + + AliasLog(la->logDesc, + "icmp=%u, udp=%u, tcp=%u, sctp=%u, pptp=%u, proto=%u, frag_id=%u frag_ptr=%u / tot=%u", + la->icmpLinkCount, + la->udpLinkCount, + la->tcpLinkCount, + la->sctpLinkCount>>1, /* sctp counts half associations */ + la->pptpLinkCount, + la->protoLinkCount, + la->fragmentIdLinkCount, + la->fragmentPtrLinkCount, tot); +#ifndef _KERNEL + AliasLog(la->logDesc, " (sock=%u)\n", la->sockCount); +#endif + } +} + +void SctpShowAliasStats(struct libalias *la) +{ + + ShowAliasStats(la); +} + + +/* Internal routines for finding, deleting and adding links + +Port Allocation: + GetNewPort() -- find and reserve new alias port number + GetSocket() -- try to allocate a socket for a given port + +Link creation and deletion: + CleanupAliasData() - remove all link chains from lookup table + IncrementalCleanup() - look for stale links in a single chain + DeleteLink() - remove link + AddLink() - add link + ReLink() - change link + +Link search: + FindLinkOut() - find link for outgoing packets + FindLinkIn() - find link for incoming packets + +Port search: + FindNewPortGroup() - find an available group of ports +*/ + +/* Local prototypes */ +static int GetNewPort(struct libalias *, struct alias_link *, int); +#ifndef NO_USE_SOCKETS +static u_short GetSocket(struct libalias *, u_short, int *, int); +#endif +static void CleanupAliasData(struct libalias *); + +static void IncrementalCleanup(struct libalias *); + +static void DeleteLink(struct alias_link *); + +static struct alias_link * +AddLink(struct libalias *, struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * +ReLink(struct alias_link *, + struct in_addr, struct in_addr, struct in_addr, + u_short, u_short, int, int); + +static struct alias_link * + FindLinkOut (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int); + +static struct alias_link * + FindLinkIn (struct libalias *, struct in_addr, struct in_addr, u_short, u_short, int, int); + + +#define ALIAS_PORT_BASE 0x08000 +#define ALIAS_PORT_MASK 0x07fff +#define ALIAS_PORT_MASK_EVEN 0x07ffe +#define GET_NEW_PORT_MAX_ATTEMPTS 20 + +#define GET_ALIAS_PORT -1 +#define GET_ALIAS_ID GET_ALIAS_PORT + +#define FIND_EVEN_ALIAS_BASE 1 + +/* GetNewPort() allocates port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because GetNewPort() looks for + unused triplets: (dest addr, dest port, alias port). */ + +static int +GetNewPort(struct libalias *la, struct alias_link *lnk, int alias_port_param) +{ + int i; + int max_trials; + u_short port_sys; + u_short port_net; + + LIBALIAS_LOCK_ASSERT(la); +/* + Description of alias_port_param for GetNewPort(). When + this parameter is zero or positive, it precisely specifies + the port number. GetNewPort() will return this number + without check that it is in use. + + When this parameter is GET_ALIAS_PORT, it indicates to get a randomly + selected port number. 
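+
+	In short (summarizing the code below): a value in [0, 0xffff] is
+	used verbatim; GET_ALIAS_PORT requests an automatically chosen
+	port, where PKT_ALIAS_SAME_PORTS makes the first candidate the
+	original source port before falling back to random probing; any
+	other value is a parameter error.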
+*/ + + if (alias_port_param == GET_ALIAS_PORT) { + /* + * The aliasing port is automatically selected by one of + * two methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) { + /* + * When the PKT_ALIAS_SAME_PORTS option is chosen, + * the first try will be the actual source port. If + * this is already in use, the remainder of the + * trials will be random. + */ + port_net = lnk->src_port; + port_sys = ntohs(port_net); + } else { + /* First trial and all subsequent are random. */ + port_sys = arc4random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + } else if (alias_port_param >= 0 && alias_port_param < 0x10000) { + lnk->alias_port = (u_short) alias_port_param; + return (0); + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/GetNewPort(): "); + fprintf(stderr, "input parameter error\n"); +#endif + return (-1); + } + + +/* Port number search */ + for (i = 0; i < max_trials; i++) { + int go_ahead; + struct alias_link *search_result; + + search_result = FindLinkIn(la, lnk->dst_addr, lnk->alias_addr, + lnk->dst_port, port_net, + lnk->link_type, 0); + + if (search_result == NULL) + go_ahead = 1; + else if (!(lnk->flags & LINK_PARTIALLY_SPECIFIED) + && (search_result->flags & LINK_PARTIALLY_SPECIFIED)) + go_ahead = 1; + else + go_ahead = 0; + + if (go_ahead) { +#ifndef NO_USE_SOCKETS + if ((la->packetAliasMode & PKT_ALIAS_USE_SOCKETS) + && (lnk->flags & LINK_PARTIALLY_SPECIFIED) + && ((lnk->link_type == LINK_TCP) || + (lnk->link_type == LINK_UDP))) { + if (GetSocket(la, port_net, &lnk->sockfd, lnk->link_type)) { + lnk->alias_port = port_net; + return (0); + } + } else { +#endif + lnk->alias_port = port_net; + return (0); +#ifndef NO_USE_SOCKETS + } +#endif + } + port_sys = arc4random() & ALIAS_PORT_MASK; + port_sys += ALIAS_PORT_BASE; + port_net = htons(port_sys); + } + +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/GetnewPort(): "); + fprintf(stderr, "could not find free port\n"); +#endif + + return (-1); +} + +#ifndef NO_USE_SOCKETS +static u_short +GetSocket(struct libalias *la, u_short port_net, int *sockfd, int link_type) +{ + int err; + int sock; + struct sockaddr_in sock_addr; + + LIBALIAS_LOCK_ASSERT(la); + if (link_type == LINK_TCP) + sock = socket(AF_INET, SOCK_STREAM, 0); + else if (link_type == LINK_UDP) + sock = socket(AF_INET, SOCK_DGRAM, 0); + else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "incorrect link type\n"); +#endif + return (0); + } + + if (sock < 0) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/GetSocket(): "); + fprintf(stderr, "socket() error %d\n", *sockfd); +#endif + return (0); + } + sock_addr.sin_family = AF_INET; + sock_addr.sin_addr.s_addr = htonl(INADDR_ANY); + sock_addr.sin_port = port_net; + + err = bind(sock, + (struct sockaddr *)&sock_addr, + sizeof(sock_addr)); + if (err == 0) { + la->sockCount++; + *sockfd = sock; + return (1); + } else { + close(sock); + return (0); + } +} +#endif + +/* FindNewPortGroup() returns a base port number for an available + range of contiguous port numbers. Note that if a port number + is already in use, that does not mean that it cannot be used by + another link concurrently. This is because FindNewPortGroup() + looks for unused triplets: (dest addr, dest port, alias port). 
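+
+   Callers that need an even-aligned base port (for instance for
+   RTP/RTCP style port pairs, as used by the RTSP support) pass
+   FIND_EVEN_ALIAS_BASE as the alignment argument.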
*/ + +int +FindNewPortGroup(struct libalias *la, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + u_short port_count, + u_char proto, + u_char align) +{ + int i, j; + int max_trials; + u_short port_sys; + int link_type; + + LIBALIAS_LOCK_ASSERT(la); + /* + * Get link_type from protocol + */ + + switch (proto) { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (0); + break; + } + + /* + * The aliasing port is automatically selected by one of two + * methods below: + */ + max_trials = GET_NEW_PORT_MAX_ATTEMPTS; + + if (la->packetAliasMode & PKT_ALIAS_SAME_PORTS) { + /* + * When the ALIAS_SAME_PORTS option is chosen, the first + * try will be the actual source port. If this is already + * in use, the remainder of the trials will be random. + */ + port_sys = ntohs(src_port); + + } else { + + /* First trial and all subsequent are random. */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = arc4random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = arc4random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +/* Port number search */ + for (i = 0; i < max_trials; i++) { + + struct alias_link *search_result; + + for (j = 0; j < port_count; j++) + if (0 != (search_result = FindLinkIn(la, dst_addr, alias_addr, + dst_port, htons(port_sys + j), + link_type, 0))) + break; + + /* Found a good range, return base */ + if (j == port_count) + return (htons(port_sys)); + + /* Find a new base to try */ + if (align == FIND_EVEN_ALIAS_BASE) + port_sys = arc4random() & ALIAS_PORT_MASK_EVEN; + else + port_sys = arc4random() & ALIAS_PORT_MASK; + + port_sys += ALIAS_PORT_BASE; + } + +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/FindNewPortGroup(): "); + fprintf(stderr, "could not find free port(s)\n"); +#endif + + return (0); +} + +static void +CleanupAliasData(struct libalias *la) +{ + struct alias_link *lnk; + int i; + + LIBALIAS_LOCK_ASSERT(la); + for (i = 0; i < LINK_TABLE_OUT_SIZE; i++) { + lnk = LIST_FIRST(&la->linkTableOut[i]); + while (lnk != NULL) { + struct alias_link *link_next = LIST_NEXT(lnk, list_out); + DeleteLink(lnk); + lnk = link_next; + } + } + + la->cleanupIndex = 0; +} + + +static void +IncrementalCleanup(struct libalias *la) +{ + struct alias_link *lnk, *lnk_tmp; + + LIBALIAS_LOCK_ASSERT(la); + LIST_FOREACH_SAFE(lnk, &la->linkTableOut[la->cleanupIndex++], + list_out, lnk_tmp) { + if (la->timeStamp - lnk->timestamp > lnk->expire_time) + DeleteLink(lnk); + } + + if (la->cleanupIndex == LINK_TABLE_OUT_SIZE) + la->cleanupIndex = 0; +} + +static void +DeleteLink(struct alias_link *lnk) +{ + struct libalias *la = lnk->la; + + LIBALIAS_LOCK_ASSERT(la); +/* Don't do anything if the link is marked permanent */ + if (la->deleteAllLinks == 0 && lnk->flags & LINK_PERMANENT) + return; + +#ifndef NO_FW_PUNCH +/* Delete associated firewall hole, if any */ + ClearFWHole(lnk); +#endif + +/* Free memory allocated for LSNAT server pool */ + if (lnk->server != NULL) { + struct server *head, *curr, *next; + + head = curr = lnk->server; + do { + next = curr->next; + free(curr); + } while ((curr = next) != head); + } +/* Adjust output table pointers */ + LIST_REMOVE(lnk, list_out); + +/* Adjust input table pointers */ + LIST_REMOVE(lnk, list_in); +#ifndef NO_USE_SOCKETS +/* Close socket, if one has been allocated */ + if (lnk->sockfd != -1) { + la->sockCount--; + close(lnk->sockfd); + } +#endif +/* Link-type dependent cleanup */ + switch (lnk->link_type) { + case LINK_ICMP: + 
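+		/* ICMP links carry no auxiliary data; just drop the
+		 * per-type counter. */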
la->icmpLinkCount--; + break; + case LINK_UDP: + la->udpLinkCount--; + break; + case LINK_TCP: + la->tcpLinkCount--; + free(lnk->data.tcp); + break; + case LINK_PPTP: + la->pptpLinkCount--; + break; + case LINK_FRAGMENT_ID: + la->fragmentIdLinkCount--; + break; + case LINK_FRAGMENT_PTR: + la->fragmentPtrLinkCount--; + if (lnk->data.frag_ptr != NULL) + free(lnk->data.frag_ptr); + break; + case LINK_ADDR: + break; + default: + la->protoLinkCount--; + break; + } + +/* Free memory */ + free(lnk); + +/* Write statistics, if logging enabled */ + if (la->packetAliasMode & PKT_ALIAS_LOG) { + ShowAliasStats(la); + } +} + + +static struct alias_link * +AddLink(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) +{ /* port will be automatically *//* chosen. + * If greater than */ + u_int start_point; /* zero, equal to alias port */ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = malloc(sizeof(struct alias_link)); + if (lnk != NULL) { + /* Basic initialization */ + lnk->la = la; + lnk->src_addr = src_addr; + lnk->dst_addr = dst_addr; + lnk->alias_addr = alias_addr; + lnk->proxy_addr.s_addr = INADDR_ANY; + lnk->src_port = src_port; + lnk->dst_port = dst_port; + lnk->proxy_port = 0; + lnk->server = NULL; + lnk->link_type = link_type; +#ifndef NO_USE_SOCKETS + lnk->sockfd = -1; +#endif + lnk->flags = 0; + lnk->pflags = 0; + lnk->timestamp = la->timeStamp; + + /* Expiration time */ + switch (link_type) { + case LINK_ICMP: + lnk->expire_time = ICMP_EXPIRE_TIME; + break; + case LINK_UDP: + lnk->expire_time = UDP_EXPIRE_TIME; + break; + case LINK_TCP: + lnk->expire_time = TCP_EXPIRE_INITIAL; + break; + case LINK_PPTP: + lnk->flags |= LINK_PERMANENT; /* no timeout. 
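+					 * Permanent links are skipped
+					 * by DeleteLink() unless the
+					 * whole instance is being
+					 * destroyed (deleteAllLinks).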
*/ + break; + case LINK_FRAGMENT_ID: + lnk->expire_time = FRAGMENT_ID_EXPIRE_TIME; + break; + case LINK_FRAGMENT_PTR: + lnk->expire_time = FRAGMENT_PTR_EXPIRE_TIME; + break; + case LINK_ADDR: + break; + default: + lnk->expire_time = PROTO_EXPIRE_TIME; + break; + } + + /* Determine alias flags */ + if (dst_addr.s_addr == INADDR_ANY) + lnk->flags |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + lnk->flags |= LINK_UNKNOWN_DEST_PORT; + + /* Determine alias port */ + if (GetNewPort(la, lnk, alias_port_param) != 0) { + free(lnk); + return (NULL); + } + /* Link-type dependent initialization */ + switch (link_type) { + struct tcp_dat *aux_tcp; + + case LINK_ICMP: + la->icmpLinkCount++; + break; + case LINK_UDP: + la->udpLinkCount++; + break; + case LINK_TCP: + aux_tcp = malloc(sizeof(struct tcp_dat)); + if (aux_tcp != NULL) { + int i; + + la->tcpLinkCount++; + aux_tcp->state.in = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.out = ALIAS_TCP_STATE_NOT_CONNECTED; + aux_tcp->state.index = 0; + aux_tcp->state.ack_modified = 0; + for (i = 0; i < N_LINK_TCP_DATA; i++) + aux_tcp->ack[i].active = 0; + aux_tcp->fwhole = -1; + lnk->data.tcp = aux_tcp; + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/AddLink: "); + fprintf(stderr, " cannot allocate auxiliary TCP data\n"); +#endif + free(lnk); + return (NULL); + } + break; + case LINK_PPTP: + la->pptpLinkCount++; + break; + case LINK_FRAGMENT_ID: + la->fragmentIdLinkCount++; + break; + case LINK_FRAGMENT_PTR: + la->fragmentPtrLinkCount++; + break; + case LINK_ADDR: + break; + default: + la->protoLinkCount++; + break; + } + + /* Set up pointers for output lookup table */ + start_point = StartPointOut(src_addr, dst_addr, + src_port, dst_port, link_type); + LIST_INSERT_HEAD(&la->linkTableOut[start_point], lnk, list_out); + + /* Set up pointers for input lookup table */ + start_point = StartPointIn(alias_addr, lnk->alias_port, link_type); + LIST_INSERT_HEAD(&la->linkTableIn[start_point], lnk, list_in); + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/AddLink(): "); + fprintf(stderr, "malloc() call failed.\n"); +#endif + } + if (la->packetAliasMode & PKT_ALIAS_LOG) { + ShowAliasStats(la); + } + return (lnk); +} + +static struct alias_link * +ReLink(struct alias_link *old_lnk, + struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_short src_port, + u_short dst_port, + int alias_port_param, /* if less than zero, alias */ + int link_type) +{ /* port will be automatically *//* chosen. 
+ * If greater than */ + struct alias_link *new_lnk; /* zero, equal to alias port */ + struct libalias *la = old_lnk->la; + + LIBALIAS_LOCK_ASSERT(la); + new_lnk = AddLink(la, src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port_param, + link_type); +#ifndef NO_FW_PUNCH + if (new_lnk != NULL && + old_lnk->link_type == LINK_TCP && + old_lnk->data.tcp->fwhole > 0) { + PunchFWHole(new_lnk); + } +#endif + DeleteLink(old_lnk); + return (new_lnk); +} + +static struct alias_link * +_FindLinkOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + u_int i; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + i = StartPointOut(src_addr, dst_addr, src_port, dst_port, link_type); + LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) { + if (lnk->dst_addr.s_addr == dst_addr.s_addr && + lnk->src_addr.s_addr == src_addr.s_addr && + lnk->src_port == src_port && + lnk->dst_port == dst_port && + lnk->link_type == link_type && + lnk->server == NULL) { + lnk->timestamp = la->timeStamp; + break; + } + } + +/* Search for partially specified links. */ + if (lnk == NULL && replace_partial_links) { + if (dst_port != 0 && dst_addr.s_addr != INADDR_ANY) { + lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, 0, + link_type, 0); + if (lnk == NULL) + lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port, + dst_port, link_type, 0); + } + if (lnk == NULL && + (dst_port != 0 || dst_addr.s_addr != INADDR_ANY)) { + lnk = _FindLinkOut(la, src_addr, la->nullAddress, src_port, 0, + link_type, 0); + } + if (lnk != NULL) { + lnk = ReLink(lnk, + src_addr, dst_addr, lnk->alias_addr, + src_port, dst_port, lnk->alias_port, + link_type); + } + } + return (lnk); +} + +static struct alias_link * +FindLinkOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = _FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + + if (lnk == NULL) { + /* + * The following allows permanent links to be specified as + * using the default source address (i.e. device interface + * address) without knowing in advance what that address + * is. + */ + if (la->aliasAddress.s_addr != INADDR_ANY && + src_addr.s_addr == la->aliasAddress.s_addr) { + lnk = _FindLinkOut(la, la->nullAddress, dst_addr, src_port, dst_port, + link_type, replace_partial_links); + } + } + return (lnk); +} + + +static struct alias_link * +_FindLinkIn(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + int flags_in; + u_int start_point; + struct alias_link *lnk; + struct alias_link *lnk_fully_specified; + struct alias_link *lnk_unknown_all; + struct alias_link *lnk_unknown_dst_addr; + struct alias_link *lnk_unknown_dst_port; + + LIBALIAS_LOCK_ASSERT(la); +/* Initialize pointers */ + lnk_fully_specified = NULL; + lnk_unknown_all = NULL; + lnk_unknown_dst_addr = NULL; + lnk_unknown_dst_port = NULL; + +/* If either the dest addr or port is unknown, the search + loop will have to know about this. 
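+
+   The loop below keeps the best candidate of each kind: a fully
+   specified match wins outright; otherwise a link with only the
+   destination port unknown is preferred over one with only the
+   destination address unknown, which in turn beats a link with
+   both unknown.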
*/ + + flags_in = 0; + if (dst_addr.s_addr == INADDR_ANY) + flags_in |= LINK_UNKNOWN_DEST_ADDR; + if (dst_port == 0) + flags_in |= LINK_UNKNOWN_DEST_PORT; + +/* Search loop */ + start_point = StartPointIn(alias_addr, alias_port, link_type); + LIST_FOREACH(lnk, &la->linkTableIn[start_point], list_in) { + int flags; + + flags = flags_in | lnk->flags; + if (!(flags & LINK_PARTIALLY_SPECIFIED)) { + if (lnk->alias_addr.s_addr == alias_addr.s_addr + && lnk->alias_port == alias_port + && lnk->dst_addr.s_addr == dst_addr.s_addr + && lnk->dst_port == dst_port + && lnk->link_type == link_type) { + lnk_fully_specified = lnk; + break; + } + } else if ((flags & LINK_UNKNOWN_DEST_ADDR) + && (flags & LINK_UNKNOWN_DEST_PORT)) { + if (lnk->alias_addr.s_addr == alias_addr.s_addr + && lnk->alias_port == alias_port + && lnk->link_type == link_type) { + if (lnk_unknown_all == NULL) + lnk_unknown_all = lnk; + } + } else if (flags & LINK_UNKNOWN_DEST_ADDR) { + if (lnk->alias_addr.s_addr == alias_addr.s_addr + && lnk->alias_port == alias_port + && lnk->link_type == link_type + && lnk->dst_port == dst_port) { + if (lnk_unknown_dst_addr == NULL) + lnk_unknown_dst_addr = lnk; + } + } else if (flags & LINK_UNKNOWN_DEST_PORT) { + if (lnk->alias_addr.s_addr == alias_addr.s_addr + && lnk->alias_port == alias_port + && lnk->link_type == link_type + && lnk->dst_addr.s_addr == dst_addr.s_addr) { + if (lnk_unknown_dst_port == NULL) + lnk_unknown_dst_port = lnk; + } + } + } + + + + if (lnk_fully_specified != NULL) { + lnk_fully_specified->timestamp = la->timeStamp; + lnk = lnk_fully_specified; + } else if (lnk_unknown_dst_port != NULL) + lnk = lnk_unknown_dst_port; + else if (lnk_unknown_dst_addr != NULL) + lnk = lnk_unknown_dst_addr; + else if (lnk_unknown_all != NULL) + lnk = lnk_unknown_all; + else + return (NULL); + + if (replace_partial_links && + (lnk->flags & LINK_PARTIALLY_SPECIFIED || lnk->server != NULL)) { + struct in_addr src_addr; + u_short src_port; + + if (lnk->server != NULL) { /* LSNAT link */ + src_addr = lnk->server->addr; + src_port = lnk->server->port; + lnk->server = lnk->server->next; + } else { + src_addr = lnk->src_addr; + src_port = lnk->src_port; + } + + if (link_type == LINK_SCTP) { + lnk->src_addr = src_addr; + lnk->src_port = src_port; + return(lnk); + } + lnk = ReLink(lnk, + src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + } + return (lnk); +} + +static struct alias_link * +FindLinkIn(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + int link_type, + int replace_partial_links) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = _FindLinkIn(la, dst_addr, alias_addr, dst_port, alias_port, + link_type, replace_partial_links); + + if (lnk == NULL) { + /* + * The following allows permanent links to be specified as + * using the default aliasing address (i.e. device + * interface address) without knowing in advance what that + * address is. 
+ */ + if (la->aliasAddress.s_addr != INADDR_ANY && + alias_addr.s_addr == la->aliasAddress.s_addr) { + lnk = _FindLinkIn(la, dst_addr, la->nullAddress, dst_port, alias_port, + link_type, replace_partial_links); + } + } + return (lnk); +} + + + + +/* External routines for finding/adding links + +-- "external" means outside alias_db.c, but within alias*.c -- + + FindIcmpIn(), FindIcmpOut() + FindFragmentIn1(), FindFragmentIn2() + AddFragmentPtrLink(), FindFragmentPtr() + FindProtoIn(), FindProtoOut() + FindUdpTcpIn(), FindUdpTcpOut() + AddPptp(), FindPptpOutByCallId(), FindPptpInByCallId(), + FindPptpOutByPeerCallId(), FindPptpInByPeerCallId() + FindOriginalAddress(), FindAliasAddress() + +(prototypes in alias_local.h) +*/ + + +struct alias_link * +FindIcmpIn(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_short id_alias, + int create) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, dst_addr, alias_addr, + NO_DEST_PORT, id_alias, + LINK_ICMP, 0); + if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(la, alias_addr); + lnk = AddLink(la, target_addr, dst_addr, alias_addr, + id_alias, NO_DEST_PORT, id_alias, + LINK_ICMP); + } + return (lnk); +} + + +struct alias_link * +FindIcmpOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_short id, + int create) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkOut(la, src_addr, dst_addr, + id, NO_DEST_PORT, + LINK_ICMP, 0); + if (lnk == NULL && create) { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(la, src_addr); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + id, NO_DEST_PORT, GET_ALIAS_ID, + LINK_ICMP); + } + return (lnk); +} + + +struct alias_link * +FindFragmentIn1(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_short ip_id) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); + + if (lnk == NULL) { + lnk = AddLink(la, la->nullAddress, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID); + } + return (lnk); +} + + +struct alias_link * +FindFragmentIn2(struct libalias *la, struct in_addr dst_addr, /* Doesn't add a link if + * one */ + struct in_addr alias_addr, /* is not found. 
*/ + u_short ip_id) +{ + + LIBALIAS_LOCK_ASSERT(la); + return FindLinkIn(la, dst_addr, alias_addr, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_ID, 0); +} + + +struct alias_link * +AddFragmentPtrLink(struct libalias *la, struct in_addr dst_addr, + u_short ip_id) +{ + + LIBALIAS_LOCK_ASSERT(la); + return AddLink(la, la->nullAddress, dst_addr, la->nullAddress, + NO_SRC_PORT, NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR); +} + + +struct alias_link * +FindFragmentPtr(struct libalias *la, struct in_addr dst_addr, + u_short ip_id) +{ + + LIBALIAS_LOCK_ASSERT(la); + return FindLinkIn(la, dst_addr, la->nullAddress, + NO_DEST_PORT, ip_id, + LINK_FRAGMENT_PTR, 0); +} + + +struct alias_link * +FindProtoIn(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, dst_addr, alias_addr, + NO_DEST_PORT, 0, + proto, 1); + + if (lnk == NULL && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(la, alias_addr); + lnk = AddLink(la, target_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + return (lnk); +} + + +struct alias_link * +FindProtoOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_char proto) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkOut(la, src_addr, dst_addr, + NO_SRC_PORT, NO_DEST_PORT, + proto, 1); + + if (lnk == NULL) { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(la, src_addr); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + } + return (lnk); +} + + +struct alias_link * +FindUdpTcpIn(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_short dst_port, + u_short alias_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + switch (proto) { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (NULL); + break; + } + + lnk = FindLinkIn(la, dst_addr, alias_addr, + dst_port, alias_port, + link_type, create); + + if (lnk == NULL && create && !(la->packetAliasMode & PKT_ALIAS_DENY_INCOMING)) { + struct in_addr target_addr; + + target_addr = FindOriginalAddress(la, alias_addr); + lnk = AddLink(la, target_addr, dst_addr, alias_addr, + alias_port, dst_port, alias_port, + link_type); + } + return (lnk); +} + + +struct alias_link * +FindUdpTcpOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short dst_port, + u_char proto, + int create) +{ + int link_type; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + switch (proto) { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (NULL); + break; + } + + lnk = FindLinkOut(la, src_addr, dst_addr, src_port, dst_port, link_type, create); + + if (lnk == NULL && create) { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(la, src_addr); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + src_port, dst_port, GET_ALIAS_PORT, + link_type); + } + return (lnk); +} + + +struct alias_link * +AddPptp(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t src_call_id) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + src_call_id, 0, 
GET_ALIAS_PORT, + LINK_PPTP); + + return (lnk); +} + + +struct alias_link * +FindPptpOutByCallId(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t src_call_id) +{ + u_int i; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) + if (lnk->link_type == LINK_PPTP && + lnk->src_addr.s_addr == src_addr.s_addr && + lnk->dst_addr.s_addr == dst_addr.s_addr && + lnk->src_port == src_call_id) + break; + + return (lnk); +} + + +struct alias_link * +FindPptpOutByPeerCallId(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + i = StartPointOut(src_addr, dst_addr, 0, 0, LINK_PPTP); + LIST_FOREACH(lnk, &la->linkTableOut[i], list_out) + if (lnk->link_type == LINK_PPTP && + lnk->src_addr.s_addr == src_addr.s_addr && + lnk->dst_addr.s_addr == dst_addr.s_addr && + lnk->dst_port == dst_call_id) + break; + + return (lnk); +} + + +struct alias_link * +FindPptpInByCallId(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t dst_call_id) +{ + u_int i; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + i = StartPointIn(alias_addr, 0, LINK_PPTP); + LIST_FOREACH(lnk, &la->linkTableIn[i], list_in) + if (lnk->link_type == LINK_PPTP && + lnk->dst_addr.s_addr == dst_addr.s_addr && + lnk->alias_addr.s_addr == alias_addr.s_addr && + lnk->dst_port == dst_call_id) + break; + + return (lnk); +} + + +struct alias_link * +FindPptpInByPeerCallId(struct libalias *la, struct in_addr dst_addr, + struct in_addr alias_addr, + u_int16_t alias_call_id) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, dst_addr, alias_addr, + 0 /* any */ , alias_call_id, + LINK_PPTP, 0); + + + return (lnk); +} + + +struct alias_link * +FindRtspOut(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + u_short src_port, + u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + switch (proto) { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + default: + return (NULL); + break; + } + + lnk = FindLinkOut(la, src_addr, dst_addr, src_port, 0, link_type, 1); + + if (lnk == NULL) { + struct in_addr alias_addr; + + alias_addr = FindAliasAddress(la, src_addr); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + src_port, 0, alias_port, + link_type); + } + return (lnk); +} + + +struct in_addr +FindOriginalAddress(struct libalias *la, struct in_addr alias_addr) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, la->nullAddress, alias_addr, + 0, 0, LINK_ADDR, 0); + if (lnk == NULL) { + la->newDefaultLink = 1; + if (la->targetAddress.s_addr == INADDR_ANY) + return (alias_addr); + else if (la->targetAddress.s_addr == INADDR_NONE) + return (la->aliasAddress.s_addr != INADDR_ANY) ? + la->aliasAddress : alias_addr; + else + return (la->targetAddress); + } else { + if (lnk->server != NULL) { /* LSNAT link */ + struct in_addr src_addr; + + src_addr = lnk->server->addr; + lnk->server = lnk->server->next; + return (src_addr); + } else if (lnk->src_addr.s_addr == INADDR_ANY) + return (la->aliasAddress.s_addr != INADDR_ANY) ? 
+ la->aliasAddress : alias_addr; + else + return (lnk->src_addr); + } +} + + +struct in_addr +FindAliasAddress(struct libalias *la, struct in_addr original_addr) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkOut(la, original_addr, la->nullAddress, + 0, 0, LINK_ADDR, 0); + if (lnk == NULL) { + return (la->aliasAddress.s_addr != INADDR_ANY) ? + la->aliasAddress : original_addr; + } else { + if (lnk->alias_addr.s_addr == INADDR_ANY) + return (la->aliasAddress.s_addr != INADDR_ANY) ? + la->aliasAddress : original_addr; + else + return (lnk->alias_addr); + } +} + + +/* External routines for getting or changing link data + (external to alias_db.c, but internal to alias*.c) + + SetFragmentData(), GetFragmentData() + SetFragmentPtr(), GetFragmentPtr() + SetStateIn(), SetStateOut(), GetStateIn(), GetStateOut() + GetOriginalAddress(), GetDestAddress(), GetAliasAddress() + GetOriginalPort(), GetAliasPort() + SetAckModified(), GetAckModified() + GetDeltaAckIn(), GetDeltaSeqOut(), AddSeq() + SetProtocolFlags(), GetProtocolFlags() + SetDestCallId() +*/ + + +void +SetFragmentAddr(struct alias_link *lnk, struct in_addr src_addr) +{ + lnk->data.frag_addr = src_addr; +} + + +void +GetFragmentAddr(struct alias_link *lnk, struct in_addr *src_addr) +{ + *src_addr = lnk->data.frag_addr; +} + + +void +SetFragmentPtr(struct alias_link *lnk, char *fptr) +{ + lnk->data.frag_ptr = fptr; +} + + +void +GetFragmentPtr(struct alias_link *lnk, char **fptr) +{ + *fptr = lnk->data.frag_ptr; +} + + +void +SetStateIn(struct alias_link *lnk, int state) +{ + /* TCP input state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (lnk->data.tcp->state.out != ALIAS_TCP_STATE_CONNECTED) + lnk->expire_time = TCP_EXPIRE_DEAD; + else + lnk->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (lnk->data.tcp->state.out == ALIAS_TCP_STATE_CONNECTED) + lnk->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: +#ifdef _KERNEL + panic("libalias:SetStateIn() unknown state"); +#else + abort(); +#endif + } + lnk->data.tcp->state.in = state; +} + + +void +SetStateOut(struct alias_link *lnk, int state) +{ + /* TCP output state */ + switch (state) { + case ALIAS_TCP_STATE_DISCONNECTED: + if (lnk->data.tcp->state.in != ALIAS_TCP_STATE_CONNECTED) + lnk->expire_time = TCP_EXPIRE_DEAD; + else + lnk->expire_time = TCP_EXPIRE_SINGLEDEAD; + break; + case ALIAS_TCP_STATE_CONNECTED: + if (lnk->data.tcp->state.in == ALIAS_TCP_STATE_CONNECTED) + lnk->expire_time = TCP_EXPIRE_CONNECTED; + break; + default: +#ifdef _KERNEL + panic("libalias:SetStateOut() unknown state"); +#else + abort(); +#endif + } + lnk->data.tcp->state.out = state; +} + + +int +GetStateIn(struct alias_link *lnk) +{ + /* TCP input state */ + return (lnk->data.tcp->state.in); +} + + +int +GetStateOut(struct alias_link *lnk) +{ + /* TCP output state */ + return (lnk->data.tcp->state.out); +} + + +struct in_addr +GetOriginalAddress(struct alias_link *lnk) +{ + if (lnk->src_addr.s_addr == INADDR_ANY) + return (lnk->la->aliasAddress); + else + return (lnk->src_addr); +} + + +struct in_addr +GetDestAddress(struct alias_link *lnk) +{ + return (lnk->dst_addr); +} + + +struct in_addr +GetAliasAddress(struct alias_link *lnk) +{ + if (lnk->alias_addr.s_addr == INADDR_ANY) + return (lnk->la->aliasAddress); + else + return (lnk->alias_addr); +} + + +struct in_addr +GetDefaultAliasAddress(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); + return (la->aliasAddress); +} + + +void +SetDefaultAliasAddress(struct libalias *la, 
struct in_addr alias_addr) +{ + + LIBALIAS_LOCK_ASSERT(la); + la->aliasAddress = alias_addr; +} + + +u_short +GetOriginalPort(struct alias_link *lnk) +{ + return (lnk->src_port); +} + + +u_short +GetAliasPort(struct alias_link *lnk) +{ + return (lnk->alias_port); +} + +#ifndef NO_FW_PUNCH +static u_short +GetDestPort(struct alias_link *lnk) +{ + return (lnk->dst_port); +} + +#endif + +void +SetAckModified(struct alias_link *lnk) +{ +/* Indicate that ACK numbers have been modified in a TCP connection */ + lnk->data.tcp->state.ack_modified = 1; +} + + +struct in_addr +GetProxyAddress(struct alias_link *lnk) +{ + return (lnk->proxy_addr); +} + + +void +SetProxyAddress(struct alias_link *lnk, struct in_addr addr) +{ + lnk->proxy_addr = addr; +} + + +u_short +GetProxyPort(struct alias_link *lnk) +{ + return (lnk->proxy_port); +} + + +void +SetProxyPort(struct alias_link *lnk, u_short port) +{ + lnk->proxy_port = port; +} + + +int +GetAckModified(struct alias_link *lnk) +{ +/* See if ACK numbers have been modified */ + return (lnk->data.tcp->state.ack_modified); +} + +// XXX ip free +int +GetDeltaAckIn(u_long ack, struct alias_link *lnk) +{ +/* +Find out how much the ACK number has been altered for an incoming +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. +*/ + + int i; + int delta, ack_diff_min; + + delta = 0; + ack_diff_min = -1; + for (i = 0; i < N_LINK_TCP_DATA; i++) { + struct ack_data_record x; + + x = lnk->data.tcp->ack[i]; + if (x.active == 1) { + int ack_diff; + + ack_diff = SeqDiff(x.ack_new, ack); + if (ack_diff >= 0) { + if (ack_diff_min >= 0) { + if (ack_diff < ack_diff_min) { + delta = x.delta; + ack_diff_min = ack_diff; + } + } else { + delta = x.delta; + ack_diff_min = ack_diff; + } + } + } + } + return (delta); +} + +// XXX ip free +int +GetDeltaSeqOut(u_long seq, struct alias_link *lnk) +{ +/* +Find out how much the sequence number has been altered for an outgoing +TCP packet. To do this, a circular list of ACK numbers where the TCP +packet size was altered is searched. +*/ + + int i; + int delta, seq_diff_min; + + delta = 0; + seq_diff_min = -1; + for (i = 0; i < N_LINK_TCP_DATA; i++) { + struct ack_data_record x; + + x = lnk->data.tcp->ack[i]; + if (x.active == 1) { + int seq_diff; + + seq_diff = SeqDiff(x.ack_old, seq); + if (seq_diff >= 0) { + if (seq_diff_min >= 0) { + if (seq_diff < seq_diff_min) { + delta = x.delta; + seq_diff_min = seq_diff; + } + } else { + delta = x.delta; + seq_diff_min = seq_diff; + } + } + } + } + return (delta); +} + +// XXX ip free +void +AddSeq(struct alias_link *lnk, int delta, u_int ip_hl, u_short ip_len, + u_long th_seq, u_int th_off) +{ +/* +When a TCP packet has been altered in length, save this +information in a circular list. If enough packets have +been altered, then this list will begin to overwrite itself. 
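+
+A worked example (numbers are illustrative only): if a rewrite grows
+the payload of a segment carrying sequence number S and data length
+dlen by 3 bytes, the record below stores ack_old = S + dlen,
+ack_new = S + dlen + 3 and delta = 3; GetDeltaSeqOut() and
+GetDeltaAckIn() later select the record whose stored number lies
+closest to the number being translated and apply its delta.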
+*/ + + struct ack_data_record x; + int hlen, tlen, dlen; + int i; + + hlen = (ip_hl + th_off) << 2; + tlen = ntohs(ip_len); + dlen = tlen - hlen; + + x.ack_old = htonl(ntohl(th_seq) + dlen); + x.ack_new = htonl(ntohl(th_seq) + dlen + delta); + x.delta = delta; + x.active = 1; + + i = lnk->data.tcp->state.index; + lnk->data.tcp->ack[i] = x; + + i++; + if (i == N_LINK_TCP_DATA) + lnk->data.tcp->state.index = 0; + else + lnk->data.tcp->state.index = i; +} + +void +SetExpire(struct alias_link *lnk, int expire) +{ + if (expire == 0) { + lnk->flags &= ~LINK_PERMANENT; + DeleteLink(lnk); + } else if (expire == -1) { + lnk->flags |= LINK_PERMANENT; + } else if (expire > 0) { + lnk->expire_time = expire; + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/SetExpire(): "); + fprintf(stderr, "error in expire parameter\n"); +#endif + } +} + +void +ClearCheckNewLink(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); + la->newDefaultLink = 0; +} + +void +SetProtocolFlags(struct alias_link *lnk, int pflags) +{ + + lnk->pflags = pflags; +} + +int +GetProtocolFlags(struct alias_link *lnk) +{ + + return (lnk->pflags); +} + +void +SetDestCallId(struct alias_link *lnk, u_int16_t cid) +{ + struct libalias *la = lnk->la; + + LIBALIAS_LOCK_ASSERT(la); + la->deleteAllLinks = 1; + ReLink(lnk, lnk->src_addr, lnk->dst_addr, lnk->alias_addr, + lnk->src_port, cid, lnk->alias_port, lnk->link_type); + la->deleteAllLinks = 0; +} + + +/* Miscellaneous Functions + + HouseKeeping() + InitPacketAliasLog() + UninitPacketAliasLog() +*/ + +/* + Whenever an outgoing or incoming packet is handled, HouseKeeping() + is called to find and remove timed-out aliasing links. Logic exists + to sweep through the entire table and linked list structure + every 60 seconds. + + (prototype in alias_local.h) +*/ + +void +HouseKeeping(struct libalias *la) +{ + int i, n; +#ifndef _KERNEL + struct timeval tv; + struct timezone tz; +#endif + + LIBALIAS_LOCK_ASSERT(la); + /* + * Save system time (seconds) in global variable timeStamp for use + * by other functions. This is done so as not to unnecessarily + * waste timeline by making system calls. + */ +#ifdef _KERNEL + la->timeStamp = time_uptime; +#else + gettimeofday(&tv, &tz); + la->timeStamp = tv.tv_sec; +#endif + + /* Compute number of spokes (output table link chains) to cover */ + n = LINK_TABLE_OUT_SIZE * (la->timeStamp - la->lastCleanupTime); + n /= ALIAS_CLEANUP_INTERVAL_SECS; + + /* Handle different cases */ + if (n > 0) { + if (n > ALIAS_CLEANUP_MAX_SPOKES) + n = ALIAS_CLEANUP_MAX_SPOKES; + la->lastCleanupTime = la->timeStamp; + for (i = 0; i < n; i++) + IncrementalCleanup(la); + } else if (n < 0) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAlias/HouseKeeping(): "); + fprintf(stderr, "something unexpected in time values\n"); +#endif + la->lastCleanupTime = la->timeStamp; + } +} + +/* Init the log file and enable logging */ +static int +InitPacketAliasLog(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); + if (~la->packetAliasMode & PKT_ALIAS_LOG) { +#ifdef _KERNEL + if ((la->logDesc = malloc(LIBALIAS_BUF_SIZE))) + ; +#else + if ((la->logDesc = fopen("/var/log/alias.log", "w"))) + fprintf(la->logDesc, "PacketAlias/InitPacketAliasLog: Packet alias logging enabled.\n"); +#endif + else + return (ENOMEM); /* log initialization failed */ + la->packetAliasMode |= PKT_ALIAS_LOG; + } + + return (1); +} + +/* Close the log-file and disable logging. 
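+   Logging is normally toggled through LibAliasSetMode() via the
+   PKT_ALIAS_LOG bit, as shown further below.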
*/ +static void +UninitPacketAliasLog(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); + if (la->logDesc) { +#ifdef _KERNEL + free(la->logDesc); +#else + fclose(la->logDesc); +#endif + la->logDesc = NULL; + } + la->packetAliasMode &= ~PKT_ALIAS_LOG; +} + +/* Outside world interfaces + +-- "outside world" means other than alias*.c routines -- + + PacketAliasRedirectPort() + PacketAliasAddServer() + PacketAliasRedirectProto() + PacketAliasRedirectAddr() + PacketAliasRedirectDynamic() + PacketAliasRedirectDelete() + PacketAliasSetAddress() + PacketAliasInit() + PacketAliasUninit() + PacketAliasSetMode() + +(prototypes in alias.h) +*/ + +/* Redirection from a specific public addr:port to a + private addr:port */ +struct alias_link * +LibAliasRedirectPort(struct libalias *la, struct in_addr src_addr, u_short src_port, + struct in_addr dst_addr, u_short dst_port, + struct in_addr alias_addr, u_short alias_port, + u_char proto) +{ + int link_type; + struct alias_link *lnk; + + LIBALIAS_LOCK(la); + switch (proto) { + case IPPROTO_UDP: + link_type = LINK_UDP; + break; + case IPPROTO_TCP: + link_type = LINK_TCP; + break; + case IPPROTO_SCTP: + link_type = LINK_SCTP; + break; + default: +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "PacketAliasRedirectPort(): "); + fprintf(stderr, "only SCTP, TCP and UDP protocols allowed\n"); +#endif + lnk = NULL; + goto getout; + } + + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + src_port, dst_port, alias_port, + link_type); + + if (lnk != NULL) { + lnk->flags |= LINK_PERMANENT; + } +#ifdef LIBALIAS_DEBUG + else { + fprintf(stderr, "PacketAliasRedirectPort(): " + "call to AddLink() failed\n"); + } +#endif + +getout: + LIBALIAS_UNLOCK(la); + return (lnk); +} + +/* Add server to the pool of servers */ +int +LibAliasAddServer(struct libalias *la, struct alias_link *lnk, struct in_addr addr, u_short port) +{ + struct server *server; + int res; + + LIBALIAS_LOCK(la); + (void)la; + + server = malloc(sizeof(struct server)); + + if (server != NULL) { + struct server *head; + + server->addr = addr; + server->port = port; + + head = lnk->server; + if (head == NULL) + server->next = server; + else { + struct server *s; + + for (s = head; s->next != head; s = s->next); + s->next = server; + server->next = head; + } + lnk->server = server; + res = 0; + } else + res = -1; + + LIBALIAS_UNLOCK(la); + return (res); +} + +/* Redirect packets of a given IP protocol from a specific + public address to a private address */ +struct alias_link * +LibAliasRedirectProto(struct libalias *la, struct in_addr src_addr, + struct in_addr dst_addr, + struct in_addr alias_addr, + u_char proto) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK(la); + lnk = AddLink(la, src_addr, dst_addr, alias_addr, + NO_SRC_PORT, NO_DEST_PORT, 0, + proto); + + if (lnk != NULL) { + lnk->flags |= LINK_PERMANENT; + } +#ifdef LIBALIAS_DEBUG + else { + fprintf(stderr, "PacketAliasRedirectProto(): " + "call to AddLink() failed\n"); + } +#endif + + LIBALIAS_UNLOCK(la); + return (lnk); +} + +/* Static address translation */ +struct alias_link * +LibAliasRedirectAddr(struct libalias *la, struct in_addr src_addr, + struct in_addr alias_addr) +{ + struct alias_link *lnk; + + LIBALIAS_LOCK(la); + lnk = AddLink(la, src_addr, la->nullAddress, alias_addr, + 0, 0, 0, + LINK_ADDR); + + if (lnk != NULL) { + lnk->flags |= LINK_PERMANENT; + } +#ifdef LIBALIAS_DEBUG + else { + fprintf(stderr, "PacketAliasRedirectAddr(): " + "call to AddLink() failed\n"); + } +#endif + + LIBALIAS_UNLOCK(la); + return (lnk); +} + + +/* Mark the 
aliasing link dynamic */ +int +LibAliasRedirectDynamic(struct libalias *la, struct alias_link *lnk) +{ + int res; + + LIBALIAS_LOCK(la); + (void)la; + + if (lnk->flags & LINK_PARTIALLY_SPECIFIED) + res = -1; + else { + lnk->flags &= ~LINK_PERMANENT; + res = 0; + } + LIBALIAS_UNLOCK(la); + return (res); +} + + +void +LibAliasRedirectDelete(struct libalias *la, struct alias_link *lnk) +{ +/* This is a dangerous function to put in the API, + because an invalid pointer can crash the program. */ + + LIBALIAS_LOCK(la); + la->deleteAllLinks = 1; + DeleteLink(lnk); + la->deleteAllLinks = 0; + LIBALIAS_UNLOCK(la); +} + + +void +LibAliasSetAddress(struct libalias *la, struct in_addr addr) +{ + + LIBALIAS_LOCK(la); + if (la->packetAliasMode & PKT_ALIAS_RESET_ON_ADDR_CHANGE + && la->aliasAddress.s_addr != addr.s_addr) + CleanupAliasData(la); + + la->aliasAddress = addr; + LIBALIAS_UNLOCK(la); +} + + +void +LibAliasSetTarget(struct libalias *la, struct in_addr target_addr) +{ + + LIBALIAS_LOCK(la); + la->targetAddress = target_addr; + LIBALIAS_UNLOCK(la); +} + +static void +finishoff(void) +{ + + while (!LIST_EMPTY(&instancehead)) + LibAliasUninit(LIST_FIRST(&instancehead)); +} + +struct libalias * +LibAliasInit(struct libalias *la) +{ + int i; +#ifndef _KERNEL + struct timeval tv; + struct timezone tz; +#endif + + if (la == NULL) { + la = calloc(sizeof *la, 1); + if (la == NULL) + return (la); + +#ifndef _KERNEL /* kernel cleans up on module unload */ + if (LIST_EMPTY(&instancehead)) + atexit(finishoff); +#endif + LIST_INSERT_HEAD(&instancehead, la, instancelist); + +#ifdef _KERNEL + la->timeStamp = time_uptime; + la->lastCleanupTime = time_uptime; +#else + gettimeofday(&tv, &tz); + la->timeStamp = tv.tv_sec; + la->lastCleanupTime = tv.tv_sec; +#endif + + for (i = 0; i < LINK_TABLE_OUT_SIZE; i++) + LIST_INIT(&la->linkTableOut[i]); + for (i = 0; i < LINK_TABLE_IN_SIZE; i++) + LIST_INIT(&la->linkTableIn[i]); +#ifdef _KERNEL + AliasSctpInit(la); +#endif + LIBALIAS_LOCK_INIT(la); + LIBALIAS_LOCK(la); + } else { + LIBALIAS_LOCK(la); + la->deleteAllLinks = 1; + CleanupAliasData(la); + la->deleteAllLinks = 0; +#ifdef _KERNEL + AliasSctpTerm(la); + AliasSctpInit(la); +#endif + } + + la->aliasAddress.s_addr = INADDR_ANY; + la->targetAddress.s_addr = INADDR_ANY; + + la->icmpLinkCount = 0; + la->udpLinkCount = 0; + la->tcpLinkCount = 0; + la->sctpLinkCount = 0; + la->pptpLinkCount = 0; + la->protoLinkCount = 0; + la->fragmentIdLinkCount = 0; + la->fragmentPtrLinkCount = 0; + la->sockCount = 0; + + la->cleanupIndex = 0; + + la->packetAliasMode = PKT_ALIAS_SAME_PORTS +#ifndef NO_USE_SOCKETS + | PKT_ALIAS_USE_SOCKETS +#endif + | PKT_ALIAS_RESET_ON_ADDR_CHANGE; +#ifndef NO_FW_PUNCH + la->fireWallFD = -1; +#endif +#ifndef _KERNEL + LibAliasRefreshModules(); +#endif + LIBALIAS_UNLOCK(la); + return (la); +} + +void +LibAliasUninit(struct libalias *la) +{ + + LIBALIAS_LOCK(la); +#ifdef _KERNEL + AliasSctpTerm(la); +#endif + la->deleteAllLinks = 1; + CleanupAliasData(la); + la->deleteAllLinks = 0; + UninitPacketAliasLog(la); +#ifndef NO_FW_PUNCH + UninitPunchFW(la); +#endif + LIST_REMOVE(la, instancelist); + LIBALIAS_UNLOCK(la); + LIBALIAS_LOCK_DESTROY(la); + free(la); +} + +/* Change mode for some operations */ +unsigned int +LibAliasSetMode( + struct libalias *la, + unsigned int flags, /* Which state to bring flags to */ + unsigned int mask /* Mask of which flags to affect (use 0 to + * do a probe for flag values) */ +) +{ + int res = -1; + + LIBALIAS_LOCK(la); +/* Enable logging? 
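+   (For an initialized instance la: LibAliasSetMode(la, PKT_ALIAS_LOG,
+   PKT_ALIAS_LOG) would enable logging, LibAliasSetMode(la, 0,
+   PKT_ALIAS_LOG) would disable it, and LibAliasSetMode(la, 0, 0)
+   merely probes the current mode.)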
*/ + if (flags & mask & PKT_ALIAS_LOG) { + /* Do the enable */ + if (InitPacketAliasLog(la) == ENOMEM) + goto getout; + } else +/* _Disable_ logging? */ + if (~flags & mask & PKT_ALIAS_LOG) { + UninitPacketAliasLog(la); + } +#ifndef NO_FW_PUNCH +/* Start punching holes in the firewall? */ + if (flags & mask & PKT_ALIAS_PUNCH_FW) { + InitPunchFW(la); + } else +/* Stop punching holes in the firewall? */ + if (~flags & mask & PKT_ALIAS_PUNCH_FW) { + UninitPunchFW(la); + } +#endif + +/* Other flags can be set/cleared without special action */ + la->packetAliasMode = (flags & mask) | (la->packetAliasMode & ~mask); + res = la->packetAliasMode; +getout: + LIBALIAS_UNLOCK(la); + return (res); +} + + +int +LibAliasCheckNewLink(struct libalias *la) +{ + int res; + + LIBALIAS_LOCK(la); + res = la->newDefaultLink; + LIBALIAS_UNLOCK(la); + return (res); +} + + +#ifndef NO_FW_PUNCH + +/***************** + Code to support firewall punching. This shouldn't really be in this + file, but making variables global is evil too. + ****************/ + +/* Firewall include files */ +#include +#include +#include +#include + +/* + * helper function, updates the pointer to cmd with the length + * of the current command, and also cleans up the first word of + * the new command in case it has been clobbered before. + */ +static ipfw_insn * +next_cmd(ipfw_insn * cmd) +{ + cmd += F_LEN(cmd); + bzero(cmd, sizeof(*cmd)); + return (cmd); +} + +/* + * A function to fill simple commands of size 1. + * Existing flags are preserved. + */ +static ipfw_insn * +fill_cmd(ipfw_insn * cmd, enum ipfw_opcodes opcode, int size, + int flags, u_int16_t arg) +{ + cmd->opcode = opcode; + cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | (size & F_LEN_MASK); + cmd->arg1 = arg; + return next_cmd(cmd); +} + +static ipfw_insn * +fill_ip(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int32_t addr) +{ + ipfw_insn_ip *cmd = (ipfw_insn_ip *) cmd1; + + cmd->addr.s_addr = addr; + return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u32), 0, 0); +} + +static ipfw_insn * +fill_one_port(ipfw_insn * cmd1, enum ipfw_opcodes opcode, u_int16_t port) +{ + ipfw_insn_u16 *cmd = (ipfw_insn_u16 *) cmd1; + + cmd->ports[0] = cmd->ports[1] = port; + return fill_cmd(cmd1, opcode, F_INSN_SIZE(ipfw_insn_u16), 0, 0); +} + +static int +fill_rule(void *buf, int bufsize, int rulenum, + enum ipfw_opcodes action, int proto, + struct in_addr sa, u_int16_t sp, struct in_addr da, u_int16_t dp) +{ + struct ip_fw *rule = (struct ip_fw *)buf; + ipfw_insn *cmd = (ipfw_insn *) rule->cmd; + + bzero(buf, bufsize); + rule->rulenum = rulenum; + + cmd = fill_cmd(cmd, O_PROTO, F_INSN_SIZE(ipfw_insn), 0, proto); + cmd = fill_ip(cmd, O_IP_SRC, sa.s_addr); + cmd = fill_one_port(cmd, O_IP_SRCPORT, sp); + cmd = fill_ip(cmd, O_IP_DST, da.s_addr); + cmd = fill_one_port(cmd, O_IP_DSTPORT, dp); + + rule->act_ofs = (u_int32_t *) cmd - (u_int32_t *) rule->cmd; + cmd = fill_cmd(cmd, action, F_INSN_SIZE(ipfw_insn), 0, 0); + + rule->cmd_len = (u_int32_t *) cmd - (u_int32_t *) rule->cmd; + + return ((char *)cmd - (char *)buf); +} + +static void ClearAllFWHoles(struct libalias *la); + + +#define fw_setfield(la, field, num) \ +do { \ + (field)[(num) - la->fireWallBaseNum] = 1; \ +} /*lint -save -e717 */ while(0)/* lint -restore */ + +#define fw_clrfield(la, field, num) \ +do { \ + (field)[(num) - la->fireWallBaseNum] = 0; \ +} /*lint -save -e717 */ while(0)/* lint -restore */ + +#define fw_tstfield(la, field, num) ((field)[(num) - la->fireWallBaseNum]) + +static void +InitPunchFW(struct libalias *la) +{ + 
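+	/*
+	 * Set-up sketch: allocate one flag byte per usable rule number,
+	 * open a raw IP socket for the IP_FW_ADD/IP_FW_DEL setsockopt()
+	 * calls issued below, and clear any stale rules in our range.
+	 */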
+ LIBALIAS_LOCK_ASSERT(la); + la->fireWallField = malloc(la->fireWallNumNums); + if (la->fireWallField) { + memset(la->fireWallField, 0, la->fireWallNumNums); + if (la->fireWallFD < 0) { + la->fireWallFD = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); + } + ClearAllFWHoles(la); + la->fireWallActiveNum = la->fireWallBaseNum; + } +} + +static void +UninitPunchFW(struct libalias *la) +{ + + LIBALIAS_LOCK_ASSERT(la); + ClearAllFWHoles(la); + if (la->fireWallFD >= 0) + close(la->fireWallFD); + la->fireWallFD = -1; + if (la->fireWallField) + free(la->fireWallField); + la->fireWallField = NULL; + la->packetAliasMode &= ~PKT_ALIAS_PUNCH_FW; +} + +/* Make a certain link go through the firewall */ +void +PunchFWHole(struct alias_link *lnk) +{ + struct libalias *la; + int r; /* Result code */ + struct ip_fw rule; /* On-the-fly built rule */ + int fwhole; /* Where to punch hole */ + + LIBALIAS_LOCK_ASSERT(la); + la = lnk->la; + +/* Don't do anything unless we are asked to */ + if (!(la->packetAliasMode & PKT_ALIAS_PUNCH_FW) || + la->fireWallFD < 0 || + lnk->link_type != LINK_TCP) + return; + + memset(&rule, 0, sizeof rule); + +/** Build rule **/ + + /* Find empty slot */ + for (fwhole = la->fireWallActiveNum; + fwhole < la->fireWallBaseNum + la->fireWallNumNums && + fw_tstfield(la, la->fireWallField, fwhole); + fwhole++); + if (fwhole == la->fireWallBaseNum + la->fireWallNumNums) { + for (fwhole = la->fireWallBaseNum; + fwhole < la->fireWallActiveNum && + fw_tstfield(la, la->fireWallField, fwhole); + fwhole++); + if (fwhole == la->fireWallActiveNum) { + /* No rule point empty - we can't punch more holes. */ + la->fireWallActiveNum = la->fireWallBaseNum; +#ifdef LIBALIAS_DEBUG + fprintf(stderr, "libalias: Unable to create firewall hole!\n"); +#endif + return; + } + } + /* Start next search at next position */ + la->fireWallActiveNum = fwhole + 1; + + /* + * generate two rules of the form + * + * add fwhole accept tcp from OAddr OPort to DAddr DPort add fwhole + * accept tcp from DAddr DPort to OAddr OPort + */ + if (GetOriginalPort(lnk) != 0 && GetDestPort(lnk) != 0) { + u_int32_t rulebuf[255]; + int i; + + i = fill_rule(rulebuf, sizeof(rulebuf), fwhole, + O_ACCEPT, IPPROTO_TCP, + GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk)), + GetDestAddress(lnk), ntohs(GetDestPort(lnk))); + r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i); + if (r) + err(1, "alias punch inbound(1) setsockopt(IP_FW_ADD)"); + + i = fill_rule(rulebuf, sizeof(rulebuf), fwhole, + O_ACCEPT, IPPROTO_TCP, + GetDestAddress(lnk), ntohs(GetDestPort(lnk)), + GetOriginalAddress(lnk), ntohs(GetOriginalPort(lnk))); + r = setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_ADD, rulebuf, i); + if (r) + err(1, "alias punch inbound(2) setsockopt(IP_FW_ADD)"); + } + +/* Indicate hole applied */ + lnk->data.tcp->fwhole = fwhole; + fw_setfield(la, la->fireWallField, fwhole); +} + +/* Remove a hole in a firewall associated with a particular alias + lnk. Calling this too often is harmless. */ +static void +ClearFWHole(struct alias_link *lnk) +{ + struct libalias *la; + + LIBALIAS_LOCK_ASSERT(la); + la = lnk->la; + if (lnk->link_type == LINK_TCP) { + int fwhole = lnk->data.tcp->fwhole; /* Where is the firewall + * hole? 
*/ + struct ip_fw rule; + + if (fwhole < 0) + return; + + memset(&rule, 0, sizeof rule); /* useless for ipfw2 */ + while (!setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_DEL, + &fwhole, sizeof fwhole)); + fw_clrfield(la, la->fireWallField, fwhole); + lnk->data.tcp->fwhole = -1; + } +} + +/* Clear out the entire range dedicated to firewall holes. */ +static void +ClearAllFWHoles(struct libalias *la) +{ + struct ip_fw rule; /* On-the-fly built rule */ + int i; + + LIBALIAS_LOCK_ASSERT(la); + if (la->fireWallFD < 0) + return; + + memset(&rule, 0, sizeof rule); + for (i = la->fireWallBaseNum; i < la->fireWallBaseNum + la->fireWallNumNums; i++) { + int r = i; + + while (!setsockopt(la->fireWallFD, IPPROTO_IP, IP_FW_DEL, &r, sizeof r)); + } + /* XXX: third arg correct here ? /phk */ + memset(la->fireWallField, 0, la->fireWallNumNums); +} + +#endif + +void +LibAliasSetFWBase(struct libalias *la, unsigned int base, unsigned int num) +{ + + LIBALIAS_LOCK(la); +#ifndef NO_FW_PUNCH + la->fireWallBaseNum = base; + la->fireWallNumNums = num; +#endif + LIBALIAS_UNLOCK(la); +} + +void +LibAliasSetSkinnyPort(struct libalias *la, unsigned int port) +{ + + LIBALIAS_LOCK(la); + la->skinnyPort = port; + LIBALIAS_UNLOCK(la); +} + +/* + * Find the address to redirect incoming packets + */ +struct in_addr +FindSctpRedirectAddress(struct libalias *la, struct sctp_nat_msg *sm) +{ + struct alias_link *lnk; + struct in_addr redir; + + LIBALIAS_LOCK_ASSERT(la); + lnk = FindLinkIn(la, sm->ip_hdr->ip_src, sm->ip_hdr->ip_dst, + sm->sctp_hdr->dest_port,sm->sctp_hdr->dest_port, LINK_SCTP, 1); + if (lnk != NULL) { + return(lnk->src_addr); /* port redirect */ + } else { + redir = FindOriginalAddress(la,sm->ip_hdr->ip_dst); + if (redir.s_addr == la->aliasAddress.s_addr || + redir.s_addr == la->targetAddress.s_addr) { /* No address found */ + lnk = FindLinkIn(la, sm->ip_hdr->ip_src, sm->ip_hdr->ip_dst, + NO_DEST_PORT, 0, LINK_SCTP, 1); + if (lnk != NULL) + return(lnk->src_addr); /* redirect proto */ + } + return(redir); /* address redirect */ + } +} diff --git a/freebsd/sys/netinet/libalias/alias_dummy.c b/freebsd/sys/netinet/libalias/alias_dummy.c new file mode 100644 index 00000000..c5a316d4 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_dummy.c @@ -0,0 +1,155 @@ +#include + +/*- + * Copyright (c) 2005 Paolo Pisati + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+/*
+ * Alias_dummy is just an empty skeleton used to demonstrate how to write
+ * a module for libalias that will run unaltered in userland or in
+ * kernel land.
+ */
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#else
+#include
+#include
+#endif
+
+static void
+AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+	/*
+	 * Check here all the data that will be used later; if any field
+	 * is empty/NULL, return a -1 value.
+	 */
+	if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL ||
+	    ah->maxpktsize == 0)
+		return (-1);
+	/*
+	 * Fingerprint the incoming packet; if it matches any conditions,
+	 * return an OK value.
+	 */
+	if (ntohs(*ah->dport) == 123
+	    || ntohs(*ah->sport) == 456)
+		return (0);	/* I know how to handle it. */
+	return (-1);	/* I don't recognize this packet. */
+}
+
+/*
+ * Wrap the real aliasing function in this general-purpose handler.
+ */
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+	AliasHandleDummy(la, pip, ah);
+	return (0);
+}
+
+/*
+ * NOTA BENE: the next variable MUST NOT be renamed in any case if you want
+ * your module to work in userland, because it is used to find and use all
+ * the protocol handlers present in every module.
+ * So WATCH OUT: your module needs this variable, and it needs it with
+ * ITS EXACT NAME: handlers.
+ */
+
+struct proto_handler handlers [] = {
+	{
+	  .pri = 666,
+	  .dir = IN|OUT,
+	  .proto = UDP|TCP,
+	  .fingerprint = &fingerprint,
+	  .protohandler = &protohandler
+	},
+	{ EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+	int error;
+
+	switch (type) {
+	case MOD_LOAD:
+		error = 0;
+		LibAliasAttachHandlers(handlers);
+		break;
+	case MOD_UNLOAD:
+		error = 0;
+		LibAliasDetachHandlers(handlers);
+		break;
+	default:
+		error = EINVAL;
+	}
+	return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+       "alias_dummy", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_dummy, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_dummy, 1);
+MODULE_DEPEND(alias_dummy, libalias, 1, 1, 1);
+#endif
+
+static void
+AliasHandleDummy(struct libalias *la, struct ip *ip, struct alias_data *ah)
+{
+	;	/* Dummy. */
+}
+
diff --git a/freebsd/sys/netinet/libalias/alias_ftp.c b/freebsd/sys/netinet/libalias/alias_ftp.c
new file mode 100644
index 00000000..4e8b7177
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_ftp.c
@@ -0,0 +1,696 @@
+#include
+
+/*-
+ * Copyright (c) 2001 Charles Mott
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + Alias_ftp.c performs special processing for FTP sessions under + TCP. Specifically, when a PORT/EPRT command from the client + side or 227/229 reply from the server is sent, it is intercepted + and modified. The address is changed to the gateway machine + and an aliasing port is used. + + For this routine to work, the message must fit entirely into a + single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new message will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + + References: RFC 959, RFC 2428. + + Initial version: August, 1996 (cjm) + + Version 1.6 + Brian Somers and Martin Renters identified an IP checksum + error for modified IP packets. + + Version 1.7: January 9, 1996 (cjm) + Differential checksum computation for change + in IP packet length. + + Version 2.1: May, 1997 (cjm) + Very minor changes to conform with + local/global/function naming conventions + within the packet aliasing module. + + Version 3.1: May, 2000 (eds) + Add support for passive mode, alias the 227 replies. + + See HISTORY file for record of revisions. 
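+
+   For example (all values illustrative): a client at 10.0.0.5 behind
+   alias address 192.0.2.1 that sends "PORT 10,0,0,5,19,137" (data
+   port 19 * 256 + 137 = 5001) has the message rewritten to carry
+   192,0,2,1 and a freshly aliased port pair before it reaches the
+   server; the 227/229 replies are rewritten symmetrically.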
+*/ + +/* Includes */ +#ifdef _KERNEL +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#endif + +#define FTP_CONTROL_PORT_NUMBER 21 + +static void +AliasHandleFtpOut(struct libalias *, struct ip *, struct alias_link *, + int maxpacketsize); + +static int +fingerprint(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + ah->maxpktsize == 0) + return (-1); + if (ntohs(*ah->dport) == FTP_CONTROL_PORT_NUMBER + || ntohs(*ah->sport) == FTP_CONTROL_PORT_NUMBER) + return (0); + return (-1); +} + +static int +protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandleFtpOut(la, pip, ah->lnk, ah->maxpktsize); + return (0); +} + +struct proto_handler handlers[] = { + { + .pri = 80, + .dir = OUT, + .proto = TCP, + .fingerprint = &fingerprint, + .protohandler = &protohandler + }, + { EOH } +}; + +static int +mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + LibAliasAttachHandlers(handlers); + break; + case MOD_UNLOAD: + error = 0; + LibAliasDetachHandlers(handlers); + break; + default: + error = EINVAL; + } + return (error); +} + +#ifdef _KERNEL +static +#endif +moduledata_t alias_mod = { + "alias_ftp", mod_handler, NULL +}; + +#ifdef _KERNEL +DECLARE_MODULE(alias_ftp, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_VERSION(alias_ftp, 1); +MODULE_DEPEND(alias_ftp, libalias, 1, 1, 1); +#endif + +#define FTP_CONTROL_PORT_NUMBER 21 +#define MAX_MESSAGE_SIZE 128 + +/* FTP protocol flags. */ +#define WAIT_CRLF 0x01 + +enum ftp_message_type { + FTP_PORT_COMMAND, + FTP_EPRT_COMMAND, + FTP_227_REPLY, + FTP_229_REPLY, + FTP_UNKNOWN_MESSAGE +}; + +static int ParseFtpPortCommand(struct libalias *la, char *, int); +static int ParseFtpEprtCommand(struct libalias *la, char *, int); +static int ParseFtp227Reply(struct libalias *la, char *, int); +static int ParseFtp229Reply(struct libalias *la, char *, int); +static void NewFtpMessage(struct libalias *la, struct ip *, struct alias_link *, int, int); + +static void +AliasHandleFtpOut( + struct libalias *la, + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *lnk, /* The link to go through (aliased port) */ + int maxpacketsize /* The maximum size this packet can grow to + (including headers) */ ) +{ + int hlen, tlen, dlen, pflags; + char *sptr; + struct tcphdr *tc; + int ftp_message_type; + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Place string pointer and beginning of data */ + sptr = (char *)pip; + sptr += hlen; + +/* + * Check that data length is not too long and previous message was + * properly terminated with CRLF. + */ + pflags = GetProtocolFlags(lnk); + if (dlen <= MAX_MESSAGE_SIZE && !(pflags & WAIT_CRLF)) { + ftp_message_type = FTP_UNKNOWN_MESSAGE; + + if (ntohs(tc->th_dport) == FTP_CONTROL_PORT_NUMBER) { +/* + * When aliasing a client, check for the PORT/EPRT command. + */ + if (ParseFtpPortCommand(la, sptr, dlen)) + ftp_message_type = FTP_PORT_COMMAND; + else if (ParseFtpEprtCommand(la, sptr, dlen)) + ftp_message_type = FTP_EPRT_COMMAND; + } else { +/* + * When aliasing a server, check for the 227/229 reply. 
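+ * (i.e. "227 Entering Passive Mode (a,d,d,r,p,p)" or
+ * "229 Entering Extended Passive Mode (|||port|)").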
+ */ + if (ParseFtp227Reply(la, sptr, dlen)) + ftp_message_type = FTP_227_REPLY; + else if (ParseFtp229Reply(la, sptr, dlen)) { + ftp_message_type = FTP_229_REPLY; + la->true_addr.s_addr = pip->ip_src.s_addr; + } + } + + if (ftp_message_type != FTP_UNKNOWN_MESSAGE) + NewFtpMessage(la, pip, lnk, maxpacketsize, ftp_message_type); + } +/* Track the msgs which are CRLF term'd for PORT/PASV FW breach */ + + if (dlen) { /* only if there's data */ + sptr = (char *)pip; /* start over at beginning */ + tlen = ntohs(pip->ip_len); /* recalc tlen, pkt may + * have grown */ + if (sptr[tlen - 2] == '\r' && sptr[tlen - 1] == '\n') + pflags &= ~WAIT_CRLF; + else + pflags |= WAIT_CRLF; + SetProtocolFlags(lnk, pflags); + } +} + +static int +ParseFtpPortCommand(struct libalias *la, char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "PORT A,D,D,R,PO,RT". */ + + /* Return if data length is too short. */ + if (dlen < 18) + return (0); + + if (strncasecmp("PORT ", sptr, 5)) + return (0); + + addr = port = octet = 0; + state = 0; + for (i = 5; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case 0: + if (isspace(ch)) + break; + else + state++; + case 1: + case 3: + case 5: + case 7: + case 9: + case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return (0); + break; + case 2: + case 4: + case 6: + case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return (0); + break; + case 10: + case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || state == 12) { + port = (port << 8) + octet; + state++; + } else + return (0); + break; + } + } + + if (state == 13) { + la->true_addr.s_addr = htonl(addr); + la->true_port = port; + return (1); + } else + return (0); +} + +static int +ParseFtpEprtCommand(struct libalias *la, char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "EPRT |1|A.D.D.R|PORT|". */ + + /* Return if data length is too short. */ + if (dlen < 18) + return (0); + + if (strncasecmp("EPRT ", sptr, 5)) + return (0); + + addr = port = octet = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + state = 0; + for (i = 5; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case 0: + if (!isspace(ch)) { + delim = ch; + state++; + } + break; + case 1: + if (ch == '1') /* IPv4 address */ + state++; + else + return (0); + break; + case 2: + if (ch == delim) + state++; + else + return (0); + break; + case 3: + case 5: + case 7: + case 9: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return (0); + break; + case 4: + case 6: + case 8: + case 10: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == '.' || state == 10) { + addr = (addr << 8) + octet; + state++; + } else + return (0); + break; + case 11: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return (0); + break; + case 12: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return (0); + break; + } + } + + if (state == 13) { + la->true_addr.s_addr = htonl(addr); + la->true_port = port; + return (1); + } else + return (0); +} + +static int +ParseFtp227Reply(struct libalias *la, char *sptr, int dlen) +{ + char ch; + int i, state; + u_int32_t addr; + u_short port; + u_int8_t octet; + + /* Format: "227 Entering Passive Mode (A,D,D,R,PO,RT)" */ + + /* Return if data length is too short. 
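+	 * The shortest well-formed reply, "227 (1,2,3,4,5,6)", is
+	 * 17 bytes, hence the check below; e.g. (192,0,2,1,19,137)
+	 * decodes to 192.0.2.1 port 19 * 256 + 137 = 5001
+	 * (illustrative values).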
*/ + if (dlen < 17) + return (0); + + if (strncmp("227 ", sptr, 4)) + return (0); + + addr = port = octet = 0; + + state = 0; + for (i = 4; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case 0: + if (ch == '(') + state++; + break; + case 1: + case 3: + case 5: + case 7: + case 9: + case 11: + if (isdigit(ch)) { + octet = ch - '0'; + state++; + } else + return (0); + break; + case 2: + case 4: + case 6: + case 8: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',') { + addr = (addr << 8) + octet; + state++; + } else + return (0); + break; + case 10: + case 12: + if (isdigit(ch)) + octet = 10 * octet + ch - '0'; + else if (ch == ',' || (state == 12 && ch == ')')) { + port = (port << 8) + octet; + state++; + } else + return (0); + break; + } + } + + if (state == 13) { + la->true_port = port; + la->true_addr.s_addr = htonl(addr); + return (1); + } else + return (0); +} + +static int +ParseFtp229Reply(struct libalias *la, char *sptr, int dlen) +{ + char ch, delim; + int i, state; + u_short port; + + /* Format: "229 Entering Extended Passive Mode (|||PORT|)" */ + + /* Return if data length is too short. */ + if (dlen < 11) + return (0); + + if (strncmp("229 ", sptr, 4)) + return (0); + + port = 0; + delim = '|'; /* XXX gcc -Wuninitialized */ + + state = 0; + for (i = 4; i < dlen; i++) { + ch = sptr[i]; + switch (state) { + case 0: + if (ch == '(') + state++; + break; + case 1: + delim = ch; + state++; + break; + case 2: + case 3: + if (ch == delim) + state++; + else + return (0); + break; + case 4: + if (isdigit(ch)) { + port = ch - '0'; + state++; + } else + return (0); + break; + case 5: + if (isdigit(ch)) + port = 10 * port + ch - '0'; + else if (ch == delim) + state++; + else + return (0); + break; + case 6: + if (ch == ')') + state++; + else + return (0); + break; + } + } + + if (state == 7) { + la->true_port = port; + return (1); + } else + return (0); +} + +static void +NewFtpMessage(struct libalias *la, struct ip *pip, + struct alias_link *lnk, + int maxpacketsize, + int ftp_message_type) +{ + struct alias_link *ftp_lnk; + +/* Security checks. */ + if (pip->ip_src.s_addr != la->true_addr.s_addr) + return; + + if (la->true_port < IPPORT_RESERVED) + return; + +/* Establish link to address and port found in FTP control message. */ + ftp_lnk = FindUdpTcpOut(la, la->true_addr, GetDestAddress(lnk), + htons(la->true_port), 0, IPPROTO_TCP, 1); + + if (ftp_lnk != NULL) { + int slen, hlen, tlen, dlen; + struct tcphdr *tc; + +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(ftp_lnk); +#endif + +/* Calculate data length of TCP packet */ + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + +/* Create new FTP message. */ + { + char stemp[MAX_MESSAGE_SIZE + 1]; + char *sptr; + u_short alias_port; + u_char *ptr; + int a1, a2, a3, a4, p1, p2; + struct in_addr alias_address; + +/* Decompose alias address into quad format */ + alias_address = GetAliasAddress(lnk); + ptr = (u_char *) & alias_address.s_addr; + a1 = *ptr++; + a2 = *ptr++; + a3 = *ptr++; + a4 = *ptr; + + alias_port = GetAliasPort(ftp_lnk); + +/* Prepare new command */ + switch (ftp_message_type) { + case FTP_PORT_COMMAND: + case FTP_227_REPLY: + /* Decompose alias port into pair format. */ + ptr = (char *)&alias_port; + p1 = *ptr++; + p2 = *ptr; + + if (ftp_message_type == FTP_PORT_COMMAND) { + /* Generate PORT command string. 
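+					 * e.g. alias 192.0.2.1 with alias
+					 * port 5001 yields
+					 * "PORT 192,0,2,1,19,137\r\n"
+					 * (illustrative values).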
*/ + sprintf(stemp, "PORT %d,%d,%d,%d,%d,%d\r\n", + a1, a2, a3, a4, p1, p2); + } else { + /* Generate 227 reply string. */ + sprintf(stemp, + "227 Entering Passive Mode (%d,%d,%d,%d,%d,%d)\r\n", + a1, a2, a3, a4, p1, p2); + } + break; + case FTP_EPRT_COMMAND: + /* Generate EPRT command string. */ + sprintf(stemp, "EPRT |1|%d.%d.%d.%d|%d|\r\n", + a1, a2, a3, a4, ntohs(alias_port)); + break; + case FTP_229_REPLY: + /* Generate 229 reply string. */ + sprintf(stemp, "229 Entering Extended Passive Mode (|||%d|)\r\n", + ntohs(alias_port)); + break; + } + +/* Save string length for IP header modification */ + slen = strlen(stemp); + +/* Copy modified buffer into IP packet. */ + sptr = (char *)pip; + sptr += hlen; + strncpy(sptr, stemp, maxpacketsize - hlen); + } + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(lnk); + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaSeqOut(tc->th_seq, lnk); + AddSeq(lnk, delta + slen - dlen, pip->ip_hl, + pip->ip_len, tc->th_seq, tc->th_off); + } + +/* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + slen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + +/* Compute TCP checksum for revised packet */ + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/HandleFtpOut: Cannot allocate FTP data port\n"); +#endif + } +} diff --git a/freebsd/sys/netinet/libalias/alias_irc.c b/freebsd/sys/netinet/libalias/alias_irc.c new file mode 100644 index 00000000..05db0f4f --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_irc.c @@ -0,0 +1,490 @@ +#include + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* Alias_irc.c intercepts packages contain IRC CTCP commands, and + changes DCC commands to export a port on the aliasing host instead + of an aliased host. + + For this routine to work, the DCC command must fit entirely into a + single TCP packet. This will usually happen, but is not + guaranteed. + + The interception is likely to change the length of the packet. 
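+   (e.g. rewriting "DCC SEND file 3232235525 5000" to carry the
+   decimal form of the alias address and an aliased port will usually
+   change the string length; the numbers are illustrative only).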
+ The handling of this is copied more-or-less verbatim from
+ ftp_alias.c
+
+ Initial version: Eivind Eklund (ee) 97-01-29
+
+ Version 2.1: May, 1997 (cjm)
+ Very minor changes to conform with
+ local/global/function naming conventions
+ within the packet aliasing module.
+*/
+
+/* Includes */
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#include
+#include
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#else
+#include
+#include
+#endif
+
+#define IRC_CONTROL_PORT_NUMBER_1 6667
+#define IRC_CONTROL_PORT_NUMBER_2 6668
+
+#define PKTSIZE (IP_MAXPACKET + 1)
+char *newpacket;
+
+/* Local defines */
+#define DBprintf(a)
+
+static void
+AliasHandleIrcOut(struct libalias *, struct ip *, struct alias_link *,
+    int maxpacketsize);
+
+static int
+fingerprint(struct libalias *la, struct alias_data *ah)
+{
+
+	if (ah->dport == NULL || ah->lnk == NULL ||
+	    ah->maxpktsize == 0)
+		return (-1);
+	if (ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_1
+	    || ntohs(*ah->dport) == IRC_CONTROL_PORT_NUMBER_2)
+		return (0);
+	return (-1);
+}
+
+static int
+protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah)
+{
+
+	newpacket = malloc(PKTSIZE);
+	if (newpacket) {
+		AliasHandleIrcOut(la, pip, ah->lnk, ah->maxpktsize);
+		free(newpacket);
+	}
+	return (0);
+}
+
+struct proto_handler handlers[] = {
+	{
+	  .pri = 90,
+	  .dir = OUT,
+	  .proto = TCP,
+	  .fingerprint = &fingerprint,
+	  .protohandler = &protohandler
+	},
+	{ EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+	int error;
+
+	switch (type) {
+	case MOD_LOAD:
+		error = 0;
+		LibAliasAttachHandlers(handlers);
+		break;
+	case MOD_UNLOAD:
+		error = 0;
+		LibAliasDetachHandlers(handlers);
+		break;
+	default:
+		error = EINVAL;
+	}
+	return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+       "alias_irc", mod_handler, NULL
+};
+
+/* Kernel module definition. */
+#ifdef _KERNEL
+DECLARE_MODULE(alias_irc, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_irc, 1);
+MODULE_DEPEND(alias_irc, libalias, 1, 1, 1);
+#endif
+
+static void
+AliasHandleIrcOut(struct libalias *la,
+    struct ip *pip,		/* IP packet to examine */
+    struct alias_link *lnk,	/* Which link are we on? */
+    int maxsize			/* Maximum size of IP packet including
+				 * headers */
+)
+{
+	int hlen, tlen, dlen;
+	struct in_addr true_addr;
+	u_short true_port;
+	char *sptr;
+	struct tcphdr *tc;
+	int i;			/* Iterator through the source */
+
+/* Calculate data length of TCP packet */
+	tc = (struct tcphdr *)ip_next(pip);
+	hlen = (pip->ip_hl + tc->th_off) << 2;
+	tlen = ntohs(pip->ip_len);
+	dlen = tlen - hlen;
+
+	/*
+	 * Return if data length is too short - assume an entire PRIVMSG in
+	 * each packet.
+	 */
+	if (dlen < (int)sizeof(":A!a@n.n PRIVMSG A :aDCC 1 1a") - 1)
+		return;
+
+/* Place string pointer at beginning of data */
+	sptr = (char *)pip;
+	sptr += hlen;
+	maxsize -= hlen;	/* We're interested in maximum size of
+				 * data, not packet */
+
+	/* Search for a CTCP command [Note 1] */
+	for (i = 0; i < dlen; i++) {
+		if (sptr[i] == '\001')
+			goto lFOUND_CTCP;
+	}
+	return;			/* No CTCP commands in packet */
+	/* Handle CTCP commands - the buffer may have to be copied */
+lFOUND_CTCP:
+	{
+		unsigned int copyat = i;
+		unsigned int iCopy = 0;	/* How much data have we written to
+					 * copy-back string?
*/ + unsigned long org_addr; /* Original IP address */ + unsigned short org_port; /* Original source port + * address */ + +lCTCP_START: + if (i >= dlen || iCopy >= PKTSIZE) + goto lPACKET_DONE; + newpacket[iCopy++] = sptr[i++]; /* Copy the CTCP start + * character */ + /* Start of a CTCP */ + if (i + 4 >= dlen) /* Too short for DCC */ + goto lBAD_CTCP; + if (sptr[i + 0] != 'D') + goto lBAD_CTCP; + if (sptr[i + 1] != 'C') + goto lBAD_CTCP; + if (sptr[i + 2] != 'C') + goto lBAD_CTCP; + if (sptr[i + 3] != ' ') + goto lBAD_CTCP; + /* We have a DCC command - handle it! */ + i += 4; /* Skip "DCC " */ + if (iCopy + 4 > PKTSIZE) + goto lPACKET_DONE; + newpacket[iCopy++] = 'D'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = 'C'; + newpacket[iCopy++] = ' '; + + DBprintf(("Found DCC\n")); + /* + * Skip any extra spaces (should not occur according to + * protocol, but DCC breaks CTCP protocol anyway + */ + while (sptr[i] == ' ') { + if (++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring command...\n")); + while (sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if (++i >= dlen || iCopy >= PKTSIZE) { + DBprintf(("DCC packet terminated during command\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if (i + 1 < dlen && iCopy < PKTSIZE) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done command - removing spaces\n")); + /* + * Skip any extra spaces (should not occur according to + * protocol, but DCC breaks CTCP protocol anyway + */ + while (sptr[i] == ' ') { + if (++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces (post-command)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Transferring filename...\n")); + while (sptr[i] != ' ') { + newpacket[iCopy++] = sptr[i]; + if (++i >= dlen || iCopy >= PKTSIZE) { + DBprintf(("DCC packet terminated during filename\n")); + goto lPACKET_DONE; + } + } + /* Copy _one_ space */ + if (i + 1 < dlen && iCopy < PKTSIZE) + newpacket[iCopy++] = sptr[i++]; + + DBprintf(("Done filename - removing spaces\n")); + /* + * Skip any extra spaces (should not occur according to + * protocol, but DCC breaks CTCP protocol anyway + */ + while (sptr[i] == ' ') { + if (++i >= dlen) { + DBprintf(("DCC packet terminated in just spaces (post-filename)\n")); + goto lPACKET_DONE; + } + } + + DBprintf(("Fetching IP address\n")); + /* Fetch IP address */ + org_addr = 0; + while (i < dlen && isdigit(sptr[i])) { + if (org_addr > ULONG_MAX / 10UL) { /* Terminate on overflow */ + DBprintf(("DCC Address overflow (org_addr == 0x%08lx, next char %c\n", org_addr, sptr[i])); + goto lBAD_CTCP; + } + org_addr *= 10; + org_addr += sptr[i++] - '0'; + } + DBprintf(("Skipping space\n")); + if (i + 1 >= dlen || sptr[i] != ' ') { + DBprintf(("Overflow (%d >= %d) or bad character (%02x) terminating IP address\n", i + 1, dlen, sptr[i])); + goto lBAD_CTCP; + } + /* + * Skip any extra spaces (should not occur according to + * protocol, but DCC breaks CTCP protocol anyway, so we + * might as well play it safe + */ + while (sptr[i] == ' ') { + if (++i >= dlen) { + DBprintf(("Packet failure - space overflow.\n")); + goto lPACKET_DONE; + } + } + DBprintf(("Fetching port number\n")); + /* Fetch source port */ + org_port = 0; + while (i < dlen && isdigit(sptr[i])) { + if (org_port > 6554) { /* Terminate on overflow + * (65536/10 rounded up */ + DBprintf(("DCC: port number overflow\n")); + goto lBAD_CTCP; + } + org_port *= 10; + org_port += sptr[i++] - '0'; + } + /* Skip illegal addresses (or early 
termination) */ + if (i >= dlen || (sptr[i] != '\001' && sptr[i] != ' ')) { + DBprintf(("Bad port termination\n")); + goto lBAD_CTCP; + } + DBprintf(("Got IP %lu and port %u\n", org_addr, (unsigned)org_port)); + + /* We've got the address and port - now alias it */ + { + struct alias_link *dcc_lnk; + struct in_addr destaddr; + + + true_port = htons(org_port); + true_addr.s_addr = htonl(org_addr); + destaddr.s_addr = 0; + + /* Sanity/Security checking */ + if (!org_addr || !org_port || + pip->ip_src.s_addr != true_addr.s_addr || + org_port < IPPORT_RESERVED) + goto lBAD_CTCP; + + /* + * Steal the FTP_DATA_PORT - it doesn't really + * matter, and this would probably allow it through + * at least _some_ firewalls. + */ + dcc_lnk = FindUdpTcpOut(la, true_addr, destaddr, + true_port, 0, + IPPROTO_TCP, 1); + DBprintf(("Got a DCC link\n")); + if (dcc_lnk) { + struct in_addr alias_address; /* Address from aliasing */ + u_short alias_port; /* Port given by + * aliasing */ + int n; + +#ifndef NO_FW_PUNCH + /* Generate firewall hole as appropriate */ + PunchFWHole(dcc_lnk); +#endif + + alias_address = GetAliasAddress(lnk); + n = snprintf(&newpacket[iCopy], + PKTSIZE - iCopy, + "%lu ", (u_long) htonl(alias_address.s_addr)); + if (n < 0) { + DBprintf(("DCC packet construct failure.\n")); + goto lBAD_CTCP; + } + if ((iCopy += n) >= PKTSIZE) { /* Truncated/fit exactly + * - bad news */ + DBprintf(("DCC constructed packet overflow.\n")); + goto lBAD_CTCP; + } + alias_port = GetAliasPort(dcc_lnk); + n = snprintf(&newpacket[iCopy], + PKTSIZE - iCopy, + "%u", htons(alias_port)); + if (n < 0) { + DBprintf(("DCC packet construct failure.\n")); + goto lBAD_CTCP; + } + iCopy += n; + /* + * Done - truncated cases will be taken + * care of by lBAD_CTCP + */ + DBprintf(("Aliased IP %lu and port %u\n", alias_address.s_addr, (unsigned)alias_port)); + } + } + /* + * An uninteresting CTCP - state entered right after '\001' + * has been pushed. Also used to copy the rest of a DCC, + * after IP address and port has been handled + */ +lBAD_CTCP: + for (; i < dlen && iCopy < PKTSIZE; i++, iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if (sptr[i] == '\001') { + goto lNORMAL_TEXT; + } + } + goto lPACKET_DONE; + /* Normal text */ +lNORMAL_TEXT: + for (; i < dlen && iCopy < PKTSIZE; i++, iCopy++) { + newpacket[iCopy] = sptr[i]; /* Copy CTCP unchanged */ + if (sptr[i] == '\001') { + goto lCTCP_START; + } + } + /* Handle the end of a packet */ +lPACKET_DONE: + iCopy = iCopy > maxsize - copyat ? maxsize - copyat : iCopy; + memcpy(sptr + copyat, newpacket, iCopy); + +/* Save information regarding modified seq and ack numbers */ + { + int delta; + + SetAckModified(lnk); + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaSeqOut(tc->th_seq, lnk); + AddSeq(lnk, delta + copyat + iCopy - dlen, pip->ip_hl, + pip->ip_len, tc->th_seq, tc->th_off); + } + + /* Revise IP header */ + { + u_short new_len; + + new_len = htons(hlen + iCopy + copyat); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + } + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + return; + } +} + +/* Notes: + [Note 1] + The initial search will most often fail; it could be replaced with a 32-bit specific search. 
+   Such a search would be done for 32-bit unsigned value V:
+   V ^= 0x01010101;                            (Search is for null bytes)
+   if( ((V-0x01010101)^V) & 0x80808080 ) {
+     (found a null byte, which was a 0x01 byte)
+   }
+   To assert that the processor is 32-bit, do
+   extern int ircdccar[32];        (32 bits)
+   extern int ircdccar[CHAR_BIT*sizeof(unsigned int)];
+   which will generate a type-error on all but 32-bit machines.
+
+   [Note 2] This routine really ought to be replaced with one that
+   creates a transparent proxy on the aliasing host, to allow arbitrary
+   changes in the TCP stream.  This should not be too difficult given
+   this base;  I (ee) will try to do this some time later.
+ */
diff --git a/freebsd/sys/netinet/libalias/alias_local.h b/freebsd/sys/netinet/libalias/alias_local.h
new file mode 100644
index 00000000..e24ece49
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_local.h
@@ -0,0 +1,397 @@
+/*-
+ * Copyright (c) 2001 Charles Mott
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias_local.h contains the function prototypes for alias.c,
+ * alias_db.c, alias_util.c and alias_ftp.c, alias_irc.c (as well
+ * as any future add-ons).  It also includes macros, globals and
+ * struct definitions shared by more than one alias*.c file.
+ *
+ * This include file is intended to be used only within the aliasing
+ * software.  Outside world interfaces are defined in alias.h
+ *
+ * This software is placed into the public domain with no restrictions
+ * on its distribution.
+ *
+ * Initial version: August, 1996  (cjm)
+ *
+ *
+ */
+
+#ifndef _ALIAS_LOCAL_HH_
+#define	_ALIAS_LOCAL_HH_
+
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+
+/* XXX: LibAliasSetTarget() uses this constant. */
+#define	INADDR_NONE	0xffffffff
+
+#include
+#else
+#include
+#endif
+
+/* Sizes of input and output link tables */
+#define	LINK_TABLE_OUT_SIZE	4001
+#define	LINK_TABLE_IN_SIZE	4001
+
+struct proxy_entry;
+
+struct libalias {
+	LIST_ENTRY(libalias) instancelist;
+
+	int		packetAliasMode;	/* Mode flags */
+	/* - documented in alias.h */
+
+	struct in_addr	aliasAddress;	/* Address written onto source */
+	/* field of IP packet. */
+
+	struct in_addr	targetAddress;	/* IP address incoming packets */
+	/* are sent to if no aliasing */
+	/* link already exists */
+
+	struct in_addr	nullAddress;	/* Used as a dummy parameter for */
+	/* some function calls */
+
+	LIST_HEAD(, alias_link) linkTableOut[LINK_TABLE_OUT_SIZE];
+	/* Lookup table of pointers to */
+	/* chains of link records.  Each */
+
+	LIST_HEAD(, alias_link) linkTableIn[LINK_TABLE_IN_SIZE];
+	/* link record is doubly indexed */
+	/* into input and output lookup */
+	/* tables. */
+
+	/* Link statistics */
+	int		icmpLinkCount;
+	int		udpLinkCount;
+	int		tcpLinkCount;
+	int		pptpLinkCount;
+	int		protoLinkCount;
+	int		fragmentIdLinkCount;
+	int		fragmentPtrLinkCount;
+	int		sockCount;
+
+	int		cleanupIndex;	/* Index to chain of link table */
+	/* being inspected for old links */
+
+	int		timeStamp;	/* System time in seconds for */
+	/* current packet */
+
+	int		lastCleanupTime;	/* Last time IncrementalCleanup() */
+	/* was called */
+
+	int		deleteAllLinks;	/* If equal to zero, DeleteLink() */
+	/* will not remove permanent links */
+
+	/* log descriptor */
+#ifdef _KERNEL
+	char		*logDesc;
+#else
+	FILE		*logDesc;
+#endif
+	/* statistics monitoring */
+
+	int		newDefaultLink;	/* Indicates if a new aliasing */
+	/* link has been created after a */
+	/* call to PacketAliasIn/Out(). */
+
+#ifndef NO_FW_PUNCH
+	int		fireWallFD;	/* File descriptor to be able to */
+	/* control firewall.  Opened by */
+	/* PacketAliasSetMode on first */
+	/* setting the PKT_ALIAS_PUNCH_FW */
+	/* flag. */
+	int		fireWallBaseNum;	/* The first firewall entry
+						 * free for our use */
+	int		fireWallNumNums;	/* How many entries can we
+						 * use? */
+	int		fireWallActiveNum;	/* Which entry did we last
+						 * use? */
+	char		*fireWallField;	/* bool array for entries */
+#endif
+
+	unsigned int	skinnyPort;	/* TCP port used by the Skinny */
+	/* protocol. */
+
+	struct proxy_entry *proxyList;
+
+	struct in_addr	true_addr;	/* in network byte order. */
+	u_short		true_port;	/* in host byte order. */
+
+	/*
+	 * sctp code support
+	 */
+
+	/* counts associations that have progressed to UP and not yet removed */
+	int		sctpLinkCount;
+#ifdef _KERNEL
+	/* timing queue for keeping track of association timeouts */
+	struct sctp_nat_timer sctpNatTimer;
+
+	/* size of hash table used in this instance */
+	u_int		sctpNatTableSize;
+
+/*
+ * local look up table sorted by l_vtag/l_port
+ */
+	LIST_HEAD(sctpNatTableL, sctp_nat_assoc) *sctpTableLocal;
+/*
+ * global look up table sorted by g_vtag/g_port
+ */
+	LIST_HEAD(sctpNatTableG, sctp_nat_assoc) *sctpTableGlobal;
+
+	/*
+	 * avoid races in libalias: every public function has to use it.
+	 */
+	struct mtx mutex;
+#endif
+};
+
+/* Macros */
+
+#ifdef _KERNEL
+#define	LIBALIAS_LOCK_INIT(l) \
+	mtx_init(&l->mutex, "per-instance libalias mutex", NULL, MTX_DEF)
+#define	LIBALIAS_LOCK_ASSERT(l)	mtx_assert(&l->mutex, MA_OWNED)
+#define	LIBALIAS_LOCK(l)	mtx_lock(&l->mutex)
+#define	LIBALIAS_UNLOCK(l)	mtx_unlock(&l->mutex)
+#define	LIBALIAS_LOCK_DESTROY(l)	mtx_destroy(&l->mutex)
+#else
+#define	LIBALIAS_LOCK_INIT(l)
+#define	LIBALIAS_LOCK_ASSERT(l)
+#define	LIBALIAS_LOCK(l)
+#define	LIBALIAS_UNLOCK(l)
+#define	LIBALIAS_LOCK_DESTROY(l)
+#endif
+
+/*
+ * The following macro is used to update an
+ * internet checksum.  "delta" is a 32-bit
+ * accumulation of all the changes to the
+ * checksum (adding in old 16-bit words and
+ * subtracting out new words, as the callers
+ * in alias_nbt.c and alias_pptp.c do), and
+ * "cksum" is the checksum value to be updated.
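+ *
+ * A minimal usage sketch (the names are illustrative): to rewrite one
+ * 16-bit word of a packet and patch the checksum incrementally, add
+ * the old value into the accumulator, subtract the new one, and fold:
+ *
+ *	int acc;
+ *
+ *	acc = udp->uh_dport;		-- old value, network byte order
+ *	acc -= new_port;		-- new value, network byte order
+ *	udp->uh_dport = new_port;
+ *	ADJUST_CHECKSUM(acc, udp->uh_sum);
+ *
+ * This is the pattern the NetBIOS handlers in alias_nbt.c use on
+ * uh_sum; it is equivalent to a full recompute under the RFC 1071
+ * one's-complement arithmetic.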
+ */ +#define ADJUST_CHECKSUM(acc, cksum) \ + do { \ + acc += cksum; \ + if (acc < 0) { \ + acc = -acc; \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) ~acc; \ + } else { \ + acc = (acc >> 16) + (acc & 0xffff); \ + acc += acc >> 16; \ + cksum = (u_short) acc; \ + } \ + } while (0) + + +/* Prototypes */ + +/* + * SctpFunction prototypes + * + */ +void AliasSctpInit(struct libalias *la); +void AliasSctpTerm(struct libalias *la); +int SctpAlias(struct libalias *la, struct ip *ip, int direction); + +/* + * We do not calculate TCP checksums when libalias is a kernel + * module, since it has no idea about checksum offloading. + * If TCP data has changed, then we just set checksum to zero, + * and caller must recalculate it himself. + * In case if libalias will edit UDP data, the same approach + * should be used. + */ +#ifndef _KERNEL +u_short IpChecksum(struct ip *_pip); +u_short TcpChecksum(struct ip *_pip); +#endif +void +DifferentialChecksum(u_short * _cksum, void * _new, void * _old, int _n); + +/* Internal data access */ +struct alias_link * +FindIcmpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _id_alias, int _create); +struct alias_link * +FindIcmpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _id, int _create); +struct alias_link * +FindFragmentIn1(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * +FindFragmentIn2(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _ip_id); +struct alias_link * + AddFragmentPtrLink(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * + FindFragmentPtr(struct libalias *la, struct in_addr _dst_addr, u_short _ip_id); +struct alias_link * +FindProtoIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_char _proto); +struct alias_link * +FindProtoOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, + u_char _proto); +struct alias_link * +FindUdpTcpIn(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _dst_port, u_short _alias_port, u_char _proto, int _create); +struct alias_link * +FindUdpTcpOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _dst_port, u_char _proto, int _create); +struct alias_link * +AddPptp(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _src_call_id); +struct alias_link * +FindPptpOutByCallId(struct libalias *la, struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _src_call_id); +struct alias_link * +FindPptpInByCallId(struct libalias *la, struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _dst_call_id); +struct alias_link * +FindPptpOutByPeerCallId(struct libalias *la, struct in_addr _src_addr, + struct in_addr _dst_addr, u_int16_t _dst_call_id); +struct alias_link * +FindPptpInByPeerCallId(struct libalias *la, struct in_addr _dst_addr, + struct in_addr _alias_addr, u_int16_t _alias_call_id); +struct alias_link * +FindRtspOut(struct libalias *la, struct in_addr _src_addr, struct in_addr _dst_addr, + u_short _src_port, u_short _alias_port, u_char _proto); +struct in_addr + FindOriginalAddress(struct libalias *la, struct in_addr _alias_addr); +struct in_addr + FindAliasAddress(struct libalias *la, struct in_addr _original_addr); +struct in_addr 
+FindSctpRedirectAddress(struct libalias *la, struct sctp_nat_msg *sm); + +/* External data access/modification */ +int +FindNewPortGroup(struct libalias *la, struct in_addr _dst_addr, struct in_addr _alias_addr, + u_short _src_port, u_short _dst_port, u_short _port_count, + u_char _proto, u_char _align); +void GetFragmentAddr(struct alias_link *_lnk, struct in_addr *_src_addr); +void SetFragmentAddr(struct alias_link *_lnk, struct in_addr _src_addr); +void GetFragmentPtr(struct alias_link *_lnk, char **_fptr); +void SetFragmentPtr(struct alias_link *_lnk, char *fptr); +void SetStateIn(struct alias_link *_lnk, int _state); +void SetStateOut(struct alias_link *_lnk, int _state); +int GetStateIn (struct alias_link *_lnk); +int GetStateOut(struct alias_link *_lnk); +struct in_addr + GetOriginalAddress(struct alias_link *_lnk); +struct in_addr + GetDestAddress(struct alias_link *_lnk); +struct in_addr + GetAliasAddress(struct alias_link *_lnk); +struct in_addr + GetDefaultAliasAddress(struct libalias *la); +void SetDefaultAliasAddress(struct libalias *la, struct in_addr _alias_addr); +u_short GetOriginalPort(struct alias_link *_lnk); +u_short GetAliasPort(struct alias_link *_lnk); +struct in_addr + GetProxyAddress(struct alias_link *_lnk); +void SetProxyAddress(struct alias_link *_lnk, struct in_addr _addr); +u_short GetProxyPort(struct alias_link *_lnk); +void SetProxyPort(struct alias_link *_lnk, u_short _port); +void SetAckModified(struct alias_link *_lnk); +int GetAckModified(struct alias_link *_lnk); +int GetDeltaAckIn(u_long, struct alias_link *_lnk); +int GetDeltaSeqOut(u_long, struct alias_link *lnk); +void AddSeq(struct alias_link *lnk, int delta, u_int ip_hl, + u_short ip_len, u_long th_seq, u_int th_off); +void SetExpire (struct alias_link *_lnk, int _expire); +void ClearCheckNewLink(struct libalias *la); +void SetProtocolFlags(struct alias_link *_lnk, int _pflags); +int GetProtocolFlags(struct alias_link *_lnk); +void SetDestCallId(struct alias_link *_lnk, u_int16_t _cid); + +#ifndef NO_FW_PUNCH +void PunchFWHole(struct alias_link *_lnk); + +#endif + +/* Housekeeping function */ +void HouseKeeping(struct libalias *); + +/* Tcp specfic routines */ +/* lint -save -library Suppress flexelint warnings */ + +/* Transparent proxy routines */ +int +ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr, + u_short * proxy_server_port, struct in_addr src_addr, + struct in_addr dst_addr, u_short dst_port, u_char ip_p); +void +ProxyModify(struct libalias *la, struct alias_link *_lnk, struct ip *_pip, + int _maxpacketsize, int _proxy_type); + +enum alias_tcp_state { + ALIAS_TCP_STATE_NOT_CONNECTED, + ALIAS_TCP_STATE_CONNECTED, + ALIAS_TCP_STATE_DISCONNECTED +}; + +#if defined(_NETINET_IP_HH_) +static __inline void * +ip_next(struct ip *iphdr) +{ + char *p = (char *)iphdr; + return (&p[iphdr->ip_hl * 4]); +} +#endif + +#if defined(_NETINET_TCP_HH_) +static __inline void * +tcp_next(struct tcphdr *tcphdr) +{ + char *p = (char *)tcphdr; + return (&p[tcphdr->th_off * 4]); +} +#endif + +#if defined(_NETINET_UDP_HH_) +static __inline void * +udp_next(struct udphdr *udphdr) +{ + return ((void *)(udphdr + 1)); +} +#endif + +#endif /* !_ALIAS_LOCAL_HH_ */ diff --git a/freebsd/sys/netinet/libalias/alias_mod.c b/freebsd/sys/netinet/libalias/alias_mod.c new file mode 100644 index 00000000..fa15b2e4 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_mod.c @@ -0,0 +1,292 @@ +#include + +/*- + * Copyright (c) 2005 Paolo Pisati + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include
+__FBSDID("$FreeBSD$");
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#else
+#include
+#include
+#include
+#include
+#endif
+
+#include
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#include
+#else
+#include
+#include
+#endif
+
+/* Protocol and userland module handler chains. */
+LIST_HEAD(handler_chain, proto_handler) handler_chain = LIST_HEAD_INITIALIZER(handler_chain);
+#ifdef _KERNEL
+struct rwlock	handler_rw;
+#endif
+SLIST_HEAD(dll_chain, dll) dll_chain = SLIST_HEAD_INITIALIZER(dll_chain);
+
+#ifdef _KERNEL
+
+#define	LIBALIAS_RWLOCK_INIT() \
+	rw_init(&handler_rw, "Libalias_modules_rwlock")
+#define	LIBALIAS_RWLOCK_DESTROY()	rw_destroy(&handler_rw)
+#define	LIBALIAS_WLOCK_ASSERT() \
+	rw_assert(&handler_rw, RA_WLOCKED)
+
+static __inline void
+LIBALIAS_RLOCK(void)
+{
+	rw_rlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_RUNLOCK(void)
+{
+	rw_runlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_WLOCK(void)
+{
+	rw_wlock(&handler_rw);
+}
+
+static __inline void
+LIBALIAS_WUNLOCK(void)
+{
+	rw_wunlock(&handler_rw);
+}
+
+static void
+_handler_chain_init(void)
+{
+
+	if (!rw_initialized(&handler_rw))
+		LIBALIAS_RWLOCK_INIT();
+}
+
+static void
+_handler_chain_destroy(void)
+{
+
+	if (rw_initialized(&handler_rw))
+		LIBALIAS_RWLOCK_DESTROY();
+}
+
+#else
+#define	LIBALIAS_RWLOCK_INIT() ;
+#define	LIBALIAS_RWLOCK_DESTROY() ;
+#define	LIBALIAS_WLOCK_ASSERT() ;
+#define	LIBALIAS_RLOCK() ;
+#define	LIBALIAS_RUNLOCK() ;
+#define	LIBALIAS_WLOCK() ;
+#define	LIBALIAS_WUNLOCK() ;
+#define	_handler_chain_init() ;
+#define	_handler_chain_destroy() ;
+#endif
+
+void
+handler_chain_init(void)
+{
+	_handler_chain_init();
+}
+
+void
+handler_chain_destroy(void)
+{
+	_handler_chain_destroy();
+}
+
+static int
+_attach_handler(struct proto_handler *p)
+{
+	struct proto_handler *b, *b_last;
+
+	LIBALIAS_WLOCK_ASSERT();
+	b_last = NULL;
+	LIST_FOREACH(b, &handler_chain, entries) {
+		if ((b->pri == p->pri) &&
+		    (b->dir == p->dir) &&
+		    (b->proto == p->proto))
+			return (EEXIST); /* Priority conflict. */
+		if (b->pri > p->pri) {
+			LIST_INSERT_BEFORE(b, p, entries);
+			return (0);
+		}
+		b_last = b;
+	}
+	/*
+	 * End of list reached: insert after the last entry, or at the
+	 * head if the chain was empty.  (LIST_FOREACH() always leaves
+	 * b NULL here, so the last node visited has to be remembered
+	 * explicitly to keep the chain sorted by ascending priority.)
+	 */
+	if (b_last != NULL)
+		LIST_INSERT_AFTER(b_last, p, entries);
+	else
+		LIST_INSERT_HEAD(&handler_chain, p, entries);
+	return (0);
+}
+
+static int
+_detach_handler(struct proto_handler *p)
+{
+	struct proto_handler *b, *b_tmp;
+
+	LIBALIAS_WLOCK_ASSERT();
+	LIST_FOREACH_SAFE(b, &handler_chain, entries, b_tmp) {
+		if (b == p) {
+			LIST_REMOVE(b, entries);
+			return (0);
+		}
+	}
+	return (ENOENT); /* Handler not found. */
+}
+
+int
+LibAliasAttachHandlers(struct proto_handler *_p)
+{
+	int i, error;
+
+	LIBALIAS_WLOCK();
+	error = -1;
+	for (i = 0; 1; i++) {
+		if (*((int *)&_p[i]) == EOH)
+			break;
+		error = _attach_handler(&_p[i]);
+		if (error != 0)
+			break;
+	}
+	LIBALIAS_WUNLOCK();
+	return (error);
+}
+
+int
+LibAliasDetachHandlers(struct proto_handler *_p)
+{
+	int i, error;
+
+	LIBALIAS_WLOCK();
+	error = -1;
+	for (i = 0; 1; i++) {
+		if (*((int *)&_p[i]) == EOH)
+			break;
+		error = _detach_handler(&_p[i]);
+		if (error != 0)
+			break;
+	}
+	LIBALIAS_WUNLOCK();
+	return (error);
+}
+
+int
+detach_handler(struct proto_handler *_p)
+{
+	int error;
+
+	LIBALIAS_WLOCK();
+	error = _detach_handler(_p);
+	LIBALIAS_WUNLOCK();
+	return (error);
+}
+
+int
+find_handler(int8_t dir, int8_t proto, struct libalias *la, __unused struct ip *pip,
+    struct alias_data *ad)
+{
+	struct proto_handler *p;
+	int error;
+
+	LIBALIAS_RLOCK();
+	error = ENOENT;
+	LIST_FOREACH(p, &handler_chain, entries) {
+		if ((p->dir & dir) && (p->proto & proto))
+			if (p->fingerprint(la, ad) == 0) {
+				error = p->protohandler(la, pip, ad);
+				break;
+			}
+	}
+	LIBALIAS_RUNLOCK();
+	return (error);
+}
+
+struct proto_handler *
+first_handler(void)
+{
+
+	return (LIST_FIRST(&handler_chain));
+}
+
+/* Dll manipulation code - this code is not thread safe... */
+
+int
+attach_dll(struct dll *p)
+{
+	struct dll *b;
+
+	SLIST_FOREACH(b, &dll_chain, next) {
+		if (!strncmp(b->name, p->name, DLL_LEN))
+			return (EEXIST); /* Dll name conflict. */
+	}
+	SLIST_INSERT_HEAD(&dll_chain, p, next);
+	return (0);
+}
+
+void *
+detach_dll(char *p)
+{
+	struct dll *b, *b_tmp;
+	void *error;
+
+	b = NULL;
+	error = NULL;
+	SLIST_FOREACH_SAFE(b, &dll_chain, next, b_tmp)
+		if (!strncmp(b->name, p, DLL_LEN)) {
+			SLIST_REMOVE(&dll_chain, b, dll, next);
+			error = b;
+			break;
+		}
+	return (error);
+}
+
+struct dll *
+walk_dll_chain(void)
+{
+	struct dll *t;
+
+	t = SLIST_FIRST(&dll_chain);
+	if (t == NULL)
+		return (NULL);
+	SLIST_REMOVE_HEAD(&dll_chain, next);
+	return (t);
+}
diff --git a/freebsd/sys/netinet/libalias/alias_mod.h b/freebsd/sys/netinet/libalias/alias_mod.h
new file mode 100644
index 00000000..f5f98cc3
--- /dev/null
+++ b/freebsd/sys/netinet/libalias/alias_mod.h
@@ -0,0 +1,163 @@
+/*-
+ * Copyright (c) 2005 Paolo Pisati
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * Alias_mod.h defines the outside world interfaces for the packet aliasing
+ * modular framework.
+ */
+
+#ifndef _ALIAS_MOD_HH_
+#define	_ALIAS_MOD_HH_
+
+#ifdef _KERNEL
+MALLOC_DECLARE(M_ALIAS);
+
+/* Use kernel allocator. */
+#if defined(_SYS_MALLOC_HH_)
+#ifndef __rtems__
+#define	malloc(x)	malloc(x, M_ALIAS, M_NOWAIT|M_ZERO)
+#define	calloc(x, n)	malloc(x*n)
+#define	free(x)		free(x, M_ALIAS)
+#else /* __rtems__ */
+#define	malloc(x)	_bsd_malloc(x, M_ALIAS, M_NOWAIT|M_ZERO)
+#define	calloc(x, n)	malloc(x*n)
+#define	free(x)		_bsd_free(x, M_ALIAS)
+#endif /* __rtems__ */
+#endif
+#endif
+
+/* Protocol handlers struct & function. */
+
+/* Packet flow direction. */
+#define	IN	1
+#define	OUT	2
+
+/* Working protocol. */
+#define	IP	1
+#define	TCP	2
+#define	UDP	4
+
+/*
+ * Data passed to a protocol handler module; it must be filled in right
+ * before calling find_handler() to determine which module is eligible
+ * to be called.
+ */
+
+struct alias_data {
+	struct alias_link	*lnk;
+	struct in_addr		*oaddr;		/* Original address. */
+	struct in_addr		*aaddr;		/* Alias address. */
+	uint16_t		*aport;		/* Alias port. */
+	uint16_t		*sport, *dport;	/* Source & destination port */
+	uint16_t		maxpktsize;	/* Max packet size. */
+};
+
+/*
+ * This structure contains all the information a protocol handler
+ * needs to work correctly.
+ */
+
+struct proto_handler {
+	u_int	pri;		/* Handler priority. */
+	int16_t	dir;		/* Flow direction. */
+	uint8_t	proto;		/* Working protocol. */
+	int	(*fingerprint)(struct libalias *,	/* Fingerprint function. */
+		    struct alias_data *);
+	int	(*protohandler)(struct libalias *,	/* Aliasing function. */
+		    struct ip *, struct alias_data *);
+	LIST_ENTRY(proto_handler) entries;
+};
+
+
+/*
+ * Used only in userland, when libalias needs to keep track of all
+ * modules loaded.  In kernel land (kld mode) we don't need to care
+ * about libalias modules, because kld takes care of that for us.
+ */
+
+#define	DLL_LEN	32
+struct dll {
+	char	name[DLL_LEN];	/* Name of module. */
+	void	*handle;	/*
+				 * Ptr to shared obj obtained through
+				 * dlopen() - use this ptr to get access
+				 * to any symbols from a loaded module
+				 * via dlsym().
+				 */
+	SLIST_ENTRY(dll) next;
+};
+
+/* Functions used with protocol handlers. */
+
+void		handler_chain_init(void);
+void		handler_chain_destroy(void);
+int		LibAliasAttachHandlers(struct proto_handler *);
+int		LibAliasDetachHandlers(struct proto_handler *);
+int		detach_handler(struct proto_handler *);
+int		find_handler(int8_t, int8_t, struct libalias *,
+		    struct ip *, struct alias_data *);
+struct proto_handler *first_handler(void);
+
+/* Functions used with dll module. */
+
+void		dll_chain_init(void);
+void		dll_chain_destroy(void);
+int		attach_dll(struct dll *);
+void		*detach_dll(char *);
+struct dll	*walk_dll_chain(void);
+
+/* End of handlers. */
+#define	EOH	-1
+
+/*
+ * Some defines borrowed from sys/module.h, used to compile a kld
+ * in userland as a shared lib.
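+ * In userland the moduledata_t below is resolved at run time: the
+ * loader (cf. LibAliasLoadModule() in alias.c) dlopen()s the shared
+ * object, looks up the exported "alias_mod" and "handlers" symbols
+ * with dlsym(), records the handle via attach_dll() and attaches the
+ * handler table, so the same source serves both build flavours.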
+ */ + +#ifndef _KERNEL +typedef enum modeventtype { + MOD_LOAD, + MOD_UNLOAD, + MOD_SHUTDOWN, + MOD_QUIESCE +} modeventtype_t; + +typedef struct module *module_t; +typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); + +/* + * Struct for registering modules statically via SYSINIT. + */ +typedef struct moduledata { + const char *name; /* module name */ + modeventhand_t evhand; /* event handler */ + void *priv; /* extra data */ +} moduledata_t; +#endif + +#endif /* !_ALIAS_MOD_HH_ */ diff --git a/freebsd/sys/netinet/libalias/alias_nbt.c b/freebsd/sys/netinet/libalias/alias_nbt.c new file mode 100644 index 00000000..31ee0006 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_nbt.c @@ -0,0 +1,855 @@ +#include + +/*- + * Written by Atsushi Murai + * Copyright (c) 1998, System Planning and Engineering Co. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * TODO: + * oClean up. + * oConsidering for word alignment for other platform. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + alias_nbt.c performs special processing for NetBios over TCP/IP + sessions by UDP. + + Initial version: May, 1998 (Atsushi Murai ) + + See HISTORY file for record of revisions. 
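+
+   A userland sketch of the path that reaches these handlers (the
+   address, buffer and module loading are illustrative, error handling
+   is omitted; cf. LibAliasLoadModule() in alias.c):
+
+	struct libalias *la = LibAliasInit(NULL);
+	struct in_addr masq;
+
+	masq.s_addr = inet_addr("192.0.2.1");
+	LibAliasSetAddress(la, masq);
+	(buf holds a complete IP/UDP datagram, port 137 or 138)
+	LibAliasOut(la, buf, bufsize);
+	LibAliasUninit(la);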
+*/ + +/* Includes */ +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#endif + +#define NETBIOS_NS_PORT_NUMBER 137 +#define NETBIOS_DGM_PORT_NUMBER 138 + +static int +AliasHandleUdpNbt(struct libalias *, struct ip *, struct alias_link *, + struct in_addr *, u_short); + +static int +AliasHandleUdpNbtNS(struct libalias *, struct ip *, struct alias_link *, + struct in_addr *, u_short *, struct in_addr *, u_short *); +static int +fingerprint1(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + ah->aaddr == NULL || ah->aport == NULL) + return (-1); + if (ntohs(*ah->dport) == NETBIOS_DGM_PORT_NUMBER + || ntohs(*ah->sport) == NETBIOS_DGM_PORT_NUMBER) + return (0); + return (-1); +} + +static int +protohandler1(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + return (AliasHandleUdpNbt(la, pip, ah->lnk, ah->aaddr, *ah->aport)); +} + +static int +fingerprint2(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + ah->aaddr == NULL || ah->aport == NULL) + return (-1); + if (ntohs(*ah->dport) == NETBIOS_NS_PORT_NUMBER + || ntohs(*ah->sport) == NETBIOS_NS_PORT_NUMBER) + return (0); + return (-1); +} + +static int +protohandler2in(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandleUdpNbtNS(la, pip, ah->lnk, ah->aaddr, ah->aport, + ah->oaddr, ah->dport); + return (0); +} + +static int +protohandler2out(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + return (AliasHandleUdpNbtNS(la, pip, ah->lnk, &pip->ip_src, ah->sport, + ah->aaddr, ah->aport)); +} + +/* Kernel module definition. 
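+ *
+ * Handlers are tried in ascending .pri order; .dir and .proto are
+ * bitmasks, so the first entry below serves both flow directions on
+ * UDP.  The { EOH } sentinel terminates the table for
+ * LibAliasAttachHandlers().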
+ */
+struct proto_handler handlers[] = {
+	{
+		.pri = 130,
+		.dir = IN|OUT,
+		.proto = UDP,
+		.fingerprint = &fingerprint1,
+		.protohandler = &protohandler1
+	},
+	{
+		.pri = 140,
+		.dir = IN,
+		.proto = UDP,
+		.fingerprint = &fingerprint2,
+		.protohandler = &protohandler2in
+	},
+	{
+		.pri = 140,
+		.dir = OUT,
+		.proto = UDP,
+		.fingerprint = &fingerprint2,
+		.protohandler = &protohandler2out
+	},
+	{ EOH }
+};
+
+static int
+mod_handler(module_t mod, int type, void *data)
+{
+	int error;
+
+	switch (type) {
+	case MOD_LOAD:
+		error = 0;
+		LibAliasAttachHandlers(handlers);
+		break;
+	case MOD_UNLOAD:
+		error = 0;
+		LibAliasDetachHandlers(handlers);
+		break;
+	default:
+		error = EINVAL;
+	}
+	return (error);
+}
+
+#ifdef _KERNEL
+static
+#endif
+moduledata_t alias_mod = {
+	"alias_nbt", mod_handler, NULL
+};
+
+#ifdef _KERNEL
+DECLARE_MODULE(alias_nbt, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);
+MODULE_VERSION(alias_nbt, 1);
+MODULE_DEPEND(alias_nbt, libalias, 1, 1, 1);
+#endif
+
+typedef struct {
+	struct in_addr	oldaddr;
+	u_short		oldport;
+	struct in_addr	newaddr;
+	u_short		newport;
+	u_short		*uh_sum;
+}		NBTArguments;
+
+typedef struct {
+	unsigned char	type;
+	unsigned char	flags;
+	u_short		id;
+	struct in_addr	source_ip;
+	u_short		source_port;
+	u_short		len;
+	u_short		offset;
+}		NbtDataHeader;
+
+#define	OpQuery		0
+#define	OpUnknown	4
+#define	OpRegist	5
+#define	OpRelease	6
+#define	OpWACK		7
+#define	OpRefresh	8
+typedef struct {
+	u_short		nametrid;
+	u_short		dir:1, opcode:4, nmflags:7, rcode:4;
+	u_short		qdcount;
+	u_short		ancount;
+	u_short		nscount;
+	u_short		arcount;
+}		NbtNSHeader;
+
+#define	FMT_ERR		0x1
+#define	SRV_ERR		0x2
+#define	IMP_ERR		0x4
+#define	RFS_ERR		0x5
+#define	ACT_ERR		0x6
+#define	CFT_ERR		0x7
+
+
+#ifdef LIBALIAS_DEBUG
+static void
+PrintRcode(u_char rcode)
+{
+
+	switch (rcode) {
+	case FMT_ERR:
+		printf("\nFormat Error.");
+		break;
+	case SRV_ERR:
+		printf("\nServer failure.");
+		break;
+	case IMP_ERR:
+		printf("\nUnsupported request error.\n");
+		break;
+	case RFS_ERR:
+		printf("\nRefused error.\n");
+		break;
+	case ACT_ERR:
+		printf("\nActive error.\n");
+		break;
+	case CFT_ERR:
+		printf("\nName in conflict error.\n");
+		break;
+	default:
+		printf("\n?%c?=%0x\n", '?', rcode);
+		break;
+	}
+}
+
+#endif
+
+
+/* Handling Name field */
+static u_char *
+AliasHandleName(u_char *p, char *pmax)
+{
+
+	u_char *s;
+	u_char c;
+	int compress;
+
+	/* Following length field */
+
+	if (p == NULL || (char *)p >= pmax)
+		return (NULL);
+
+	if (*p & 0xc0) {
+		p = p + 2;
+		if ((char *)p > pmax)
+			return (NULL);
+		return ((u_char *) p);
+	}
+	while ((*p & 0x3f) != 0x00) {
+		s = p + 1;
+		if (*p == 0x20)
+			compress = 1;
+		else
+			compress = 0;
+
+		/* Get next length field */
+		p = (u_char *) (p + (*p & 0x3f) + 1);
+		if ((char *)p > pmax) {
+			p = NULL;
+			break;
+		}
+#ifdef LIBALIAS_DEBUG
+		printf(":");
+#endif
+		while (s < p) {
+			if (compress == 1) {
+				c = (u_char) (((((*s & 0x0f) << 4) | (*(s + 1) & 0x0f)) - 0x11));
+#ifdef LIBALIAS_DEBUG
+				if (isprint(c))
+					printf("%c", c);
+				else
+					printf("<0x%02x>", c);
+#endif
+				s += 2;
+			} else {
+#ifdef LIBALIAS_DEBUG
+				printf("%c", *s);
+#endif
+				s++;
+			}
+		}
+#ifdef LIBALIAS_DEBUG
+		printf(":");
+		fflush(stdout);
+#endif
+	}
+
+	/* Step out of the Name field */
+	if (p == NULL || (char *)p >= pmax)
+		p = NULL;
+	else
+		p++;
+	return ((u_char *) p);
+}
+
+/*
+ * NetBios Datagram Handler (IP/UDP)
+ */
+#define	DGM_DIRECT_UNIQ		0x10
+#define	DGM_DIRECT_GROUP	0x11
+#define	DGM_BROADCAST		0x12
+#define	DGM_ERROR		0x13
+#define	DGM_QUERY		0x14
+#define	DGM_POSITIVE_RES	0x15
+#define	DGM_NEGATIVE_RES	0x16
+
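+/*
+ * Per RFC 1002 (section 4.4), the fixed part of a Direct Unique,
+ * Direct Group or Broadcast datagram is 14 bytes (type, flags, id,
+ * source IP and port, length, packet offset) and is followed by the
+ * encoded source and destination names; an Error datagram stops after
+ * a one-byte error code at offset 11; Query and Response datagrams
+ * carry only the destination name after a 10-byte header.  These are
+ * the offsets used in the switch below.
+ */
+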
+static int +AliasHandleUdpNbt( + struct libalias *la, + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *lnk, + struct in_addr *alias_address, + u_short alias_port +) +{ + struct udphdr *uh; + NbtDataHeader *ndh; + u_char *p = NULL; + char *pmax; + + (void)la; + (void)lnk; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *)ip_next(pip); + pmax = (char *)uh + ntohs(uh->uh_ulen); + + ndh = (NbtDataHeader *)udp_next(uh); + if ((char *)(ndh + 1) > pmax) + return (-1); +#ifdef LIBALIAS_DEBUG + printf("\nType=%02x,", ndh->type); +#endif + switch (ndh->type) { + case DGM_DIRECT_UNIQ: + case DGM_DIRECT_GROUP: + case DGM_BROADCAST: + p = (u_char *) ndh + 14; + p = AliasHandleName(p, pmax); /* Source Name */ + p = AliasHandleName(p, pmax); /* Destination Name */ + break; + case DGM_ERROR: + p = (u_char *) ndh + 11; + break; + case DGM_QUERY: + case DGM_POSITIVE_RES: + case DGM_NEGATIVE_RES: + p = (u_char *) ndh + 10; + p = AliasHandleName(p, pmax); /* Destination Name */ + break; + } + if (p == NULL || (char *)p > pmax) + p = NULL; +#ifdef LIBALIAS_DEBUG + printf("%s:%d-->", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port)); +#endif + /* Doing an IP address and Port number Translation */ + if (uh->uh_sum != 0) { + int acc; + u_short *sptr; + + acc = ndh->source_port; + acc -= alias_port; + sptr = (u_short *) & (ndh->source_ip); + acc += *sptr++; + acc += *sptr; + sptr = (u_short *) alias_address; + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, uh->uh_sum); + } + ndh->source_ip = *alias_address; + ndh->source_port = alias_port; +#ifdef LIBALIAS_DEBUG + printf("%s:%d\n", inet_ntoa(ndh->source_ip), ntohs(ndh->source_port)); + fflush(stdout); +#endif + return ((p == NULL) ? -1 : 0); +} + +/* Question Section */ +#define QS_TYPE_NB 0x0020 +#define QS_TYPE_NBSTAT 0x0021 +#define QS_CLAS_IN 0x0001 +typedef struct { + u_short type; /* The type of Request */ + u_short class; /* The class of Request */ +} NBTNsQuestion; + +static u_char * +AliasHandleQuestion( + u_short count, + NBTNsQuestion * q, + char *pmax, + NBTArguments * nbtarg) +{ + + (void)nbtarg; + + while (count != 0) { + /* Name Filed */ + q = (NBTNsQuestion *) AliasHandleName((u_char *) q, pmax); + + if (q == NULL || (char *)(q + 1) > pmax) { + q = NULL; + break; + } + /* Type and Class filed */ + switch (ntohs(q->type)) { + case QS_TYPE_NB: + case QS_TYPE_NBSTAT: + q = q + 1; + break; + default: +#ifdef LIBALIAS_DEBUG + printf("\nUnknown Type on Question %0x\n", ntohs(q->type)); +#endif + break; + } + count--; + } + + /* Set up to out of Question Section */ + return ((u_char *) q); +} + +/* Resource Record */ +#define RR_TYPE_A 0x0001 +#define RR_TYPE_NS 0x0002 +#define RR_TYPE_NULL 0x000a +#define RR_TYPE_NB 0x0020 +#define RR_TYPE_NBSTAT 0x0021 +#define RR_CLAS_IN 0x0001 +#define SizeOfNsResource 8 +typedef struct { + u_short type; + u_short class; + unsigned int ttl; + u_short rdlen; +} NBTNsResource; + +#define SizeOfNsRNB 6 +typedef struct { + u_short g: 1 , ont:2, resv:13; + struct in_addr addr; +} NBTNsRNB; + +static u_char * +AliasHandleResourceNB( + NBTNsResource * q, + char *pmax, + NBTArguments * nbtarg) +{ + NBTNsRNB *nb; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return (NULL); + /* Check out a length */ + bcount = ntohs(q->rdlen); + + /* Forward to Resource NB position */ + nb = (NBTNsRNB *) ((u_char *) q + SizeOfNsResource); + + /* Processing all in_addr array */ +#ifdef LIBALIAS_DEBUG + printf("NB rec[%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s, 
%dbytes] ", inet_ntoa(nbtarg->newaddr), bcount); +#endif + while (nb != NULL && bcount != 0) { + if ((char *)(nb + 1) > pmax) { + nb = NULL; + break; + } +#ifdef LIBALIAS_DEBUG + printf("<%s>", inet_ntoa(nb->addr)); +#endif + if (!bcmp(&nbtarg->oldaddr, &nb->addr, sizeof(struct in_addr))) { + if (*nbtarg->uh_sum != 0) { + int acc; + u_short *sptr; + + sptr = (u_short *) & (nb->addr); + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) & (nbtarg->newaddr); + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + nb->addr = nbtarg->newaddr; +#ifdef LIBALIAS_DEBUG + printf("O"); +#endif + } +#ifdef LIBALIAS_DEBUG + else { + printf("."); + } +#endif + nb = (NBTNsRNB *) ((u_char *) nb + SizeOfNsRNB); + bcount -= SizeOfNsRNB; + } + if (nb == NULL || (char *)(nb + 1) > pmax) { + nb = NULL; + } + return ((u_char *) nb); +} + +#define SizeOfResourceA 6 +typedef struct { + struct in_addr addr; +} NBTNsResourceA; + +static u_char * +AliasHandleResourceA( + NBTNsResource * q, + char *pmax, + NBTArguments * nbtarg) +{ + NBTNsResourceA *a; + u_short bcount; + + if (q == NULL || (char *)(q + 1) > pmax) + return (NULL); + + /* Forward to Resource A position */ + a = (NBTNsResourceA *) ((u_char *) q + sizeof(NBTNsResource)); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ +#ifdef LIBALIAS_DEBUG + printf("Arec [%s", inet_ntoa(nbtarg->oldaddr)); + printf("->%s]", inet_ntoa(nbtarg->newaddr)); +#endif + while (bcount != 0) { + if (a == NULL || (char *)(a + 1) > pmax) + return (NULL); +#ifdef LIBALIAS_DEBUG + printf("..%s", inet_ntoa(a->addr)); +#endif + if (!bcmp(&nbtarg->oldaddr, &a->addr, sizeof(struct in_addr))) { + if (*nbtarg->uh_sum != 0) { + int acc; + u_short *sptr; + + sptr = (u_short *) & (a->addr); /* Old */ + acc = *sptr++; + acc += *sptr; + sptr = (u_short *) & nbtarg->newaddr; /* New */ + acc -= *sptr++; + acc -= *sptr; + ADJUST_CHECKSUM(acc, *nbtarg->uh_sum); + } + a->addr = nbtarg->newaddr; + } + a++; /* XXXX */ + bcount -= SizeOfResourceA; + } + if (a == NULL || (char *)(a + 1) > pmax) + a = NULL; + return ((u_char *) a); +} + +typedef struct { + u_short opcode:4, flags:8, resv:4; +} NBTNsResourceNULL; + +static u_char * +AliasHandleResourceNULL( + NBTNsResource * q, + char *pmax, + NBTArguments * nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + (void)nbtarg; + + if (q == NULL || (char *)(q + 1) > pmax) + return (NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource)); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Processing all in_addr array */ + while (bcount != 0) { + if ((char *)(n + 1) > pmax) { + n = NULL; + break; + } + n++; + bcount -= sizeof(NBTNsResourceNULL); + } + if ((char *)(n + 1) > pmax) + n = NULL; + + return ((u_char *) n); +} + +static u_char * +AliasHandleResourceNS( + NBTNsResource * q, + char *pmax, + NBTArguments * nbtarg) +{ + NBTNsResourceNULL *n; + u_short bcount; + + (void)nbtarg; + + if (q == NULL || (char *)(q + 1) > pmax) + return (NULL); + + /* Forward to Resource NULL position */ + n = (NBTNsResourceNULL *) ((u_char *) q + sizeof(NBTNsResource)); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + /* Resource Record Name Filed */ + q = (NBTNsResource *) AliasHandleName((u_char *) n, pmax); /* XXX */ + + if (q == NULL || (char *)((u_char *) n + bcount) > pmax) + return (NULL); + else + return ((u_char *) n + bcount); +} + +typedef struct { + u_short numnames; +} NBTNsResourceNBSTAT; + +static 
u_char * +AliasHandleResourceNBSTAT( + NBTNsResource * q, + char *pmax, + NBTArguments * nbtarg) +{ + NBTNsResourceNBSTAT *n; + u_short bcount; + + (void)nbtarg; + + if (q == NULL || (char *)(q + 1) > pmax) + return (NULL); + + /* Forward to Resource NBSTAT position */ + n = (NBTNsResourceNBSTAT *) ((u_char *) q + sizeof(NBTNsResource)); + + /* Check out of length */ + bcount = ntohs(q->rdlen); + + if (q == NULL || (char *)((u_char *) n + bcount) > pmax) + return (NULL); + else + return ((u_char *) n + bcount); +} + +static u_char * +AliasHandleResource( + u_short count, + NBTNsResource * q, + char *pmax, + NBTArguments + * nbtarg) +{ + while (count != 0) { + /* Resource Record Name Filed */ + q = (NBTNsResource *) AliasHandleName((u_char *) q, pmax); + + if (q == NULL || (char *)(q + 1) > pmax) + break; +#ifdef LIBALIAS_DEBUG + printf("type=%02x, count=%d\n", ntohs(q->type), count); +#endif + + /* Type and Class filed */ + switch (ntohs(q->type)) { + case RR_TYPE_NB: + q = (NBTNsResource *) AliasHandleResourceNB( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_A: + q = (NBTNsResource *) AliasHandleResourceA( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NS: + q = (NBTNsResource *) AliasHandleResourceNS( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NULL: + q = (NBTNsResource *) AliasHandleResourceNULL( + q, + pmax, + nbtarg + ); + break; + case RR_TYPE_NBSTAT: + q = (NBTNsResource *) AliasHandleResourceNBSTAT( + q, + pmax, + nbtarg + ); + break; + default: +#ifdef LIBALIAS_DEBUG + printf( + "\nUnknown Type of Resource %0x\n", + ntohs(q->type) + ); + fflush(stdout); +#endif + break; + } + count--; + } + return ((u_char *) q); +} + +static int +AliasHandleUdpNbtNS( + struct libalias *la, + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *lnk, + struct in_addr *alias_address, + u_short * alias_port, + struct in_addr *original_address, + u_short * original_port) +{ + struct udphdr *uh; + NbtNSHeader *nsh; + u_char *p; + char *pmax; + NBTArguments nbtarg; + + (void)la; + (void)lnk; + + /* Set up Common Parameter */ + nbtarg.oldaddr = *alias_address; + nbtarg.oldport = *alias_port; + nbtarg.newaddr = *original_address; + nbtarg.newport = *original_port; + + /* Calculate data length of UDP packet */ + uh = (struct udphdr *)ip_next(pip); + nbtarg.uh_sum = &(uh->uh_sum); + nsh = (NbtNSHeader *)udp_next(uh); + p = (u_char *) (nsh + 1); + pmax = (char *)uh + ntohs(uh->uh_ulen); + + if ((char *)(nsh + 1) > pmax) + return (-1); + +#ifdef LIBALIAS_DEBUG + printf(" [%s] ID=%02x, op=%01x, flag=%02x, rcode=%01x, qd=%04x" + ", an=%04x, ns=%04x, ar=%04x, [%d]-->", + nsh->dir ? 
"Response" : "Request", + nsh->nametrid, + nsh->opcode, + nsh->nmflags, + nsh->rcode, + ntohs(nsh->qdcount), + ntohs(nsh->ancount), + ntohs(nsh->nscount), + ntohs(nsh->arcount), + (u_char *) p - (u_char *) nsh + ); +#endif + + /* Question Entries */ + if (ntohs(nsh->qdcount) != 0) { + p = AliasHandleQuestion( + ntohs(nsh->qdcount), + (NBTNsQuestion *) p, + pmax, + &nbtarg + ); + } + /* Answer Resource Records */ + if (ntohs(nsh->ancount) != 0) { + p = AliasHandleResource( + ntohs(nsh->ancount), + (NBTNsResource *) p, + pmax, + &nbtarg + ); + } + /* Authority Resource Recodrs */ + if (ntohs(nsh->nscount) != 0) { + p = AliasHandleResource( + ntohs(nsh->nscount), + (NBTNsResource *) p, + pmax, + &nbtarg + ); + } + /* Additional Resource Recodrs */ + if (ntohs(nsh->arcount) != 0) { + p = AliasHandleResource( + ntohs(nsh->arcount), + (NBTNsResource *) p, + pmax, + &nbtarg + ); + } +#ifdef LIBALIAS_DEBUG + PrintRcode(nsh->rcode); +#endif + return ((p == NULL) ? -1 : 0); +} diff --git a/freebsd/sys/netinet/libalias/alias_pptp.c b/freebsd/sys/netinet/libalias/alias_pptp.c new file mode 100644 index 00000000..f6c7f199 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_pptp.c @@ -0,0 +1,525 @@ +#include + +/* + * alias_pptp.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc. trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. + * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. 
+ * + * Author: Erik Salander + */ + +#include +__FBSDID("$FreeBSD$"); + +/* Includes */ +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#include +#endif + +#define PPTP_CONTROL_PORT_NUMBER 1723 + +static void +AliasHandlePptpOut(struct libalias *, struct ip *, struct alias_link *); + +static void +AliasHandlePptpIn(struct libalias *, struct ip *, struct alias_link *); + +static int +AliasHandlePptpGreOut(struct libalias *, struct ip *); + +static int +AliasHandlePptpGreIn(struct libalias *, struct ip *); + +static int +fingerprint(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL) + return (-1); + if (ntohs(*ah->dport) == PPTP_CONTROL_PORT_NUMBER + || ntohs(*ah->sport) == PPTP_CONTROL_PORT_NUMBER) + return (0); + return (-1); +} + +static int +fingerprintgre(struct libalias *la, struct alias_data *ah) +{ + + return (0); +} + +static int +protohandlerin(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandlePptpIn(la, pip, ah->lnk); + return (0); +} + +static int +protohandlerout(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandlePptpOut(la, pip, ah->lnk); + return (0); +} + +static int +protohandlergrein(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + if (la->packetAliasMode & PKT_ALIAS_PROXY_ONLY || + AliasHandlePptpGreIn(la, pip) == 0) + return (0); + return (-1); +} + +static int +protohandlergreout(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + if (AliasHandlePptpGreOut(la, pip) == 0) + return (0); + return (-1); +} + +/* Kernel module definition. */ +struct proto_handler handlers[] = { + { + .pri = 200, + .dir = IN, + .proto = TCP, + .fingerprint = &fingerprint, + .protohandler = &protohandlerin + }, + { + .pri = 210, + .dir = OUT, + .proto = TCP, + .fingerprint = &fingerprint, + .protohandler = &protohandlerout + }, +/* + * WATCH OUT!!! these 2 handlers NEED a priority of INT_MAX (highest possible) + * cause they will ALWAYS process packets, so they must be the last one + * in chain: look fingerprintgre() above. + */ + { + .pri = INT_MAX, + .dir = IN, + .proto = IP, + .fingerprint = &fingerprintgre, + .protohandler = &protohandlergrein + }, + { + .pri = INT_MAX, + .dir = OUT, + .proto = IP, + .fingerprint = &fingerprintgre, + .protohandler = &protohandlergreout + }, + { EOH } +}; +static int +mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + LibAliasAttachHandlers(handlers); + break; + case MOD_UNLOAD: + error = 0; + LibAliasDetachHandlers(handlers); + break; + default: + error = EINVAL; + } + return (error); +} + +#ifdef _KERNEL +static +#endif +moduledata_t alias_mod = { + "alias_pptp", mod_handler, NULL +}; + +#ifdef _KERNEL +DECLARE_MODULE(alias_pptp, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_VERSION(alias_pptp, 1); +MODULE_DEPEND(alias_pptp, libalias, 1, 1, 1); +#endif + +/* + Alias_pptp.c performs special processing for PPTP sessions under TCP. + Specifically, watch PPTP control messages and alias the Call ID or the + Peer's Call ID in the appropriate messages. Note, PPTP requires + "de-aliasing" of incoming packets, this is different than any other + TCP applications that are currently (ie. FTP, IRC and RTSP) aliased. + + For Call IDs encountered for the first time, a PPTP alias link is created. 
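+  (On the wire, the tunneled data packets are enhanced GRE: their
+  16-bit gh_call_id field is the value that AliasHandlePptpGreOut()
+  and AliasHandlePptpGreIn() later use to look these links up again,
+  much as port numbers are used for ordinary TCP/UDP links.)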
+ The PPTP alias link uses the Call ID in place of the original port number. + An alias Call ID is created. + + For this routine to work, the PPTP control messages must fit entirely + into a single TCP packet. This is typically the case, but is not + required by the spec. + + Unlike some of the other TCP applications that are aliased (ie. FTP, + IRC and RTSP), the PPTP control messages that need to be aliased are + guaranteed to remain the same length. The aliased Call ID is a fixed + length field. + + Reference: RFC 2637 + + Initial version: May, 2000 (eds) + +*/ + +/* + * PPTP definitions + */ + +struct grehdr { /* Enhanced GRE header. */ + u_int16_t gh_flags; /* Flags. */ + u_int16_t gh_protocol; /* Protocol type. */ + u_int16_t gh_length; /* Payload length. */ + u_int16_t gh_call_id; /* Call ID. */ + u_int32_t gh_seq_no; /* Sequence number (optional). */ + u_int32_t gh_ack_no; /* Acknowledgment number + * (optional). */ +}; +typedef struct grehdr GreHdr; + +/* The PPTP protocol ID used in the GRE 'proto' field. */ +#define PPTP_GRE_PROTO 0x880b + +/* Bits that must be set a certain way in all PPTP/GRE packets. */ +#define PPTP_INIT_VALUE ((0x2001 << 16) | PPTP_GRE_PROTO) +#define PPTP_INIT_MASK 0xef7fffff + +#define PPTP_MAGIC 0x1a2b3c4d +#define PPTP_CTRL_MSG_TYPE 1 + +enum { + PPTP_StartCtrlConnRequest = 1, + PPTP_StartCtrlConnReply = 2, + PPTP_StopCtrlConnRequest = 3, + PPTP_StopCtrlConnReply = 4, + PPTP_EchoRequest = 5, + PPTP_EchoReply = 6, + PPTP_OutCallRequest = 7, + PPTP_OutCallReply = 8, + PPTP_InCallRequest = 9, + PPTP_InCallReply = 10, + PPTP_InCallConn = 11, + PPTP_CallClearRequest = 12, + PPTP_CallDiscNotify = 13, + PPTP_WanErrorNotify = 14, + PPTP_SetLinkInfo = 15 +}; + + /* Message structures */ +struct pptpMsgHead { + u_int16_t length; /* total length */ + u_int16_t msgType;/* PPTP message type */ + u_int32_t magic; /* magic cookie */ + u_int16_t type; /* control message type */ + u_int16_t resv0; /* reserved */ +}; +typedef struct pptpMsgHead *PptpMsgHead; + +struct pptpCodes { + u_int8_t resCode;/* Result Code */ + u_int8_t errCode;/* Error Code */ +}; +typedef struct pptpCodes *PptpCode; + +struct pptpCallIds { + u_int16_t cid1; /* Call ID field #1 */ + u_int16_t cid2; /* Call ID field #2 */ +}; +typedef struct pptpCallIds *PptpCallId; + +static PptpCallId AliasVerifyPptp(struct ip *, u_int16_t *); + + +static void +AliasHandlePptpOut(struct libalias *la, + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *lnk) +{ /* The PPTP control link */ + struct alias_link *pptp_lnk; + PptpCallId cptr; + PptpCode codes; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) { + case PPTP_OutCallRequest: + case PPTP_OutCallReply: + case PPTP_InCallRequest: + case PPTP_InCallReply: + /* + * Establish PPTP link for address and Call ID found in + * control message. + */ + pptp_lnk = AddPptp(la, GetOriginalAddress(lnk), GetDestAddress(lnk), + GetAliasAddress(lnk), cptr->cid1); + break; + case PPTP_CallClearRequest: + case PPTP_CallDiscNotify: + /* + * Find PPTP link for address and Call ID found in control + * message. 
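+		 * Unlike the call setup messages above, clear and
+		 * disconnect requests must match an existing link;
+		 * they never create one.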
+ */ + pptp_lnk = FindPptpOutByCallId(la, GetOriginalAddress(lnk), + GetDestAddress(lnk), + cptr->cid1); + break; + default: + return; + } + + if (pptp_lnk != NULL) { + int accumulate = cptr->cid1; + + /* alias the Call Id */ + cptr->cid1 = GetAliasPort(pptp_lnk); + + /* Compute TCP checksum for revised packet */ + tc = (struct tcphdr *)ip_next(pip); + accumulate -= cptr->cid1; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + switch (ctl_type) { + case PPTP_OutCallReply: + case PPTP_InCallReply: + codes = (PptpCode) (cptr + 1); + if (codes->resCode == 1) /* Connection + * established, */ + SetDestCallId(pptp_lnk, /* note the Peer's Call + * ID. */ + cptr->cid2); + else + SetExpire(pptp_lnk, 0); /* Connection refused. */ + break; + case PPTP_CallDiscNotify: /* Connection closed. */ + SetExpire(pptp_lnk, 0); + break; + } + } +} + +static void +AliasHandlePptpIn(struct libalias *la, + struct ip *pip, /* IP packet to examine/patch */ + struct alias_link *lnk) +{ /* The PPTP control link */ + struct alias_link *pptp_lnk; + PptpCallId cptr; + u_int16_t *pcall_id; + u_int16_t ctl_type; /* control message type */ + struct tcphdr *tc; + + /* Verify valid PPTP control message */ + if ((cptr = AliasVerifyPptp(pip, &ctl_type)) == NULL) + return; + + /* Modify certain PPTP messages */ + switch (ctl_type) { + case PPTP_InCallConn: + case PPTP_WanErrorNotify: + case PPTP_SetLinkInfo: + pcall_id = &cptr->cid1; + break; + case PPTP_OutCallReply: + case PPTP_InCallReply: + pcall_id = &cptr->cid2; + break; + case PPTP_CallDiscNotify: /* Connection closed. */ + pptp_lnk = FindPptpInByCallId(la, GetDestAddress(lnk), + GetAliasAddress(lnk), + cptr->cid1); + if (pptp_lnk != NULL) + SetExpire(pptp_lnk, 0); + return; + default: + return; + } + + /* Find PPTP link for address and Call ID found in PPTP Control Msg */ + pptp_lnk = FindPptpInByPeerCallId(la, GetDestAddress(lnk), + GetAliasAddress(lnk), + *pcall_id); + + if (pptp_lnk != NULL) { + int accumulate = *pcall_id; + + /* De-alias the Peer's Call Id. */ + *pcall_id = GetOriginalPort(pptp_lnk); + + /* Compute TCP checksum for modified packet */ + tc = (struct tcphdr *)ip_next(pip); + accumulate -= *pcall_id; + ADJUST_CHECKSUM(accumulate, tc->th_sum); + + if (ctl_type == PPTP_OutCallReply || ctl_type == PPTP_InCallReply) { + PptpCode codes = (PptpCode) (cptr + 1); + + if (codes->resCode == 1) /* Connection + * established, */ + SetDestCallId(pptp_lnk, /* note the Call ID. */ + cptr->cid1); + else + SetExpire(pptp_lnk, 0); /* Connection refused. */ + } + } +} + +static PptpCallId +AliasVerifyPptp(struct ip *pip, u_int16_t * ptype) +{ /* IP packet to examine/patch */ + int hlen, tlen, dlen; + PptpMsgHead hptr; + struct tcphdr *tc; + + /* Calculate some lengths */ + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Verify data length */ + if (dlen < (int)(sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds))) + return (NULL); + + /* Move up to PPTP message header */ + hptr = (PptpMsgHead) tcp_next(tc); + + /* Return the control message type */ + *ptype = ntohs(hptr->type); + + /* Verify PPTP Control Message */ + if ((ntohs(hptr->msgType) != PPTP_CTRL_MSG_TYPE) || + (ntohl(hptr->magic) != PPTP_MAGIC)) + return (NULL); + + /* Verify data length. 
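+	 * Reply messages also carry result/error codes, so they must
+	 * additionally have room for the pptpCodes block that the
+	 * handlers above read behind the Call IDs.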
*/ + if ((*ptype == PPTP_OutCallReply || *ptype == PPTP_InCallReply) && + (dlen < (int)(sizeof(struct pptpMsgHead) + sizeof(struct pptpCallIds) + + sizeof(struct pptpCodes)))) + return (NULL); + else + return (PptpCallId) (hptr + 1); +} + +static int +AliasHandlePptpGreOut(struct libalias *la, struct ip *pip) +{ + GreHdr *gr; + struct alias_link *lnk; + + gr = (GreHdr *) ip_next(pip); + + /* Check GRE header bits. */ + if ((ntohl(*((u_int32_t *) gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + lnk = FindPptpOutByPeerCallId(la, pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (lnk != NULL) { + struct in_addr alias_addr = GetAliasAddress(lnk); + + /* Change source IP address. */ + DifferentialChecksum(&pip->ip_sum, + &alias_addr, &pip->ip_src, 2); + pip->ip_src = alias_addr; + } + return (0); +} + +static int +AliasHandlePptpGreIn(struct libalias *la, struct ip *pip) +{ + GreHdr *gr; + struct alias_link *lnk; + + gr = (GreHdr *) ip_next(pip); + + /* Check GRE header bits. */ + if ((ntohl(*((u_int32_t *) gr)) & PPTP_INIT_MASK) != PPTP_INIT_VALUE) + return (-1); + + lnk = FindPptpInByPeerCallId(la, pip->ip_src, pip->ip_dst, gr->gh_call_id); + if (lnk != NULL) { + struct in_addr src_addr = GetOriginalAddress(lnk); + + /* De-alias the Peer's Call Id. */ + gr->gh_call_id = GetOriginalPort(lnk); + + /* Restore original IP address. */ + DifferentialChecksum(&pip->ip_sum, + &src_addr, &pip->ip_dst, 2); + pip->ip_dst = src_addr; + } + return (0); +} diff --git a/freebsd/sys/netinet/libalias/alias_proxy.c b/freebsd/sys/netinet/libalias/alias_proxy.c new file mode 100644 index 00000000..f4f2b643 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_proxy.c @@ -0,0 +1,870 @@ +#include + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +/* file: alias_proxy.c + + This file encapsulates special operations related to transparent + proxy redirection. This is where packets with a particular destination, + usually tcp port 80, are redirected to a proxy server. + + When packets are proxied, the destination address and port are + modified. In certain cases, it is necessary to somehow encode + the original address/port info into the packet. 
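+
+   For example, a rule redirecting outbound http to a local proxy
+   might be installed via the current entry point (rule syntax as in
+   libalias(3); the addresses here are illustrative):
+
+	LibAliasProxyRule(la,
+	    "type encode_tcp_stream port 80 server 10.0.0.1:3128");
+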
Two methods are + presently supported: addition of a [DEST addr port] string at the + beginning of a tcp stream, or inclusion of an optional field + in the IP header. + + There is one public API function: + + PacketAliasProxyRule() -- Adds and deletes proxy + rules. + + Rules are stored in a linear linked list, so lookup efficiency + won't be too good for large lists. + + + Initial development: April, 1998 (cjm) +*/ + + +/* System includes */ +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include /* Public API functions for libalias */ +#include /* Functions used by alias*.c */ +#endif + +/* + Data structures + */ + +/* + * A linked list of arbitrary length, based on struct proxy_entry is + * used to store proxy rules. + */ +struct proxy_entry { + struct libalias *la; +#define PROXY_TYPE_ENCODE_NONE 1 +#define PROXY_TYPE_ENCODE_TCPSTREAM 2 +#define PROXY_TYPE_ENCODE_IPHDR 3 + int rule_index; + int proxy_type; + u_char proto; + u_short proxy_port; + u_short server_port; + + struct in_addr server_addr; + + struct in_addr src_addr; + struct in_addr src_mask; + + struct in_addr dst_addr; + struct in_addr dst_mask; + + struct proxy_entry *next; + struct proxy_entry *last; +}; + + + +/* + File scope variables +*/ + + + +/* Local (static) functions: + + IpMask() -- Utility function for creating IP + masks from integer (1-32) specification. + IpAddr() -- Utility function for converting string + to IP address + IpPort() -- Utility function for converting string + to port number + RuleAdd() -- Adds an element to the rule list. + RuleDelete() -- Removes an element from the rule list. + RuleNumberDelete() -- Removes all elements from the rule list + having a certain rule number. + ProxyEncodeTcpStream() -- Adds [DEST x.x.x.x xxxx] to the beginning + of a TCP stream. 
+ ProxyEncodeIpHeader() -- Adds an IP option indicating the true + destination of a proxied IP packet +*/ + +static int IpMask(int, struct in_addr *); +static int IpAddr(char *, struct in_addr *); +static int IpPort(char *, int, int *); +static void RuleAdd(struct libalias *la, struct proxy_entry *); +static void RuleDelete(struct proxy_entry *); +static int RuleNumberDelete(struct libalias *la, int); +static void ProxyEncodeTcpStream(struct alias_link *, struct ip *, int); +static void ProxyEncodeIpHeader(struct ip *, int); + +static int +IpMask(int nbits, struct in_addr *mask) +{ + int i; + u_int imask; + + if (nbits < 0 || nbits > 32) + return (-1); + + imask = 0; + for (i = 0; i < nbits; i++) + imask = (imask >> 1) + 0x80000000; + mask->s_addr = htonl(imask); + + return (0); +} + +static int +IpAddr(char *s, struct in_addr *addr) +{ + if (inet_aton(s, addr) == 0) + return (-1); + else + return (0); +} + +static int +IpPort(char *s, int proto, int *port) +{ + int n; + + n = sscanf(s, "%d", port); + if (n != 1) +#ifndef _KERNEL /* XXX: we accept only numeric ports in kernel */ + { + struct servent *se; + + if (proto == IPPROTO_TCP) + se = getservbyname(s, "tcp"); + else if (proto == IPPROTO_UDP) + se = getservbyname(s, "udp"); + else + return (-1); + + if (se == NULL) + return (-1); + + *port = (u_int) ntohs(se->s_port); + } +#else + return (-1); +#endif + return (0); +} + +void +RuleAdd(struct libalias *la, struct proxy_entry *entry) +{ + int rule_index; + struct proxy_entry *ptr; + struct proxy_entry *ptr_last; + + LIBALIAS_LOCK_ASSERT(la); + + if (la->proxyList == NULL) { + la->proxyList = entry; + entry->last = NULL; + entry->next = NULL; + return; + } + entry->la = la; + + rule_index = entry->rule_index; + ptr = la->proxyList; + ptr_last = NULL; + while (ptr != NULL) { + if (ptr->rule_index >= rule_index) { + if (ptr_last == NULL) { + entry->next = la->proxyList; + entry->last = NULL; + la->proxyList->last = entry; + la->proxyList = entry; + return; + } + ptr_last->next = entry; + ptr->last = entry; + entry->last = ptr->last; + entry->next = ptr; + return; + } + ptr_last = ptr; + ptr = ptr->next; + } + + ptr_last->next = entry; + entry->last = ptr_last; + entry->next = NULL; +} + +static void +RuleDelete(struct proxy_entry *entry) +{ + struct libalias *la; + + la = entry->la; + LIBALIAS_LOCK_ASSERT(la); + if (entry->last != NULL) + entry->last->next = entry->next; + else + la->proxyList = entry->next; + + if (entry->next != NULL) + entry->next->last = entry->last; + + free(entry); +} + +static int +RuleNumberDelete(struct libalias *la, int rule_index) +{ + int err; + struct proxy_entry *ptr; + + LIBALIAS_LOCK_ASSERT(la); + err = -1; + ptr = la->proxyList; + while (ptr != NULL) { + struct proxy_entry *ptr_next; + + ptr_next = ptr->next; + if (ptr->rule_index == rule_index) { + err = 0; + RuleDelete(ptr); + } + ptr = ptr_next; + } + + return (err); +} + +static void +ProxyEncodeTcpStream(struct alias_link *lnk, + struct ip *pip, + int maxpacketsize) +{ + int slen; + char buffer[40]; + struct tcphdr *tc; + +/* Compute pointer to tcp header */ + tc = (struct tcphdr *)ip_next(pip); + +/* Don't modify if once already modified */ + + if (GetAckModified(lnk)) + return; + +/* Translate destination address and port to string form */ + snprintf(buffer, sizeof(buffer) - 2, "[DEST %s %d]", + inet_ntoa(GetProxyAddress(lnk)), (u_int) ntohs(GetProxyPort(lnk))); + +/* Pad string out to a multiple of two in length */ + slen = strlen(buffer); + switch (slen % 2) { + case 0: + strcat(buffer, " \n"); + 
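+		/*
+		 * Two bytes are appended here so that slen stays even; the
+		 * length and checksum fix-ups below work in 16-bit units.
+		 */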
slen += 2; + break; + case 1: + strcat(buffer, "\n"); + slen += 1; + } + +/* Check for packet overflow */ + if ((int)(ntohs(pip->ip_len) + strlen(buffer)) > maxpacketsize) + return; + +/* Shift existing TCP data and insert destination string */ + { + int dlen; + int hlen; + char *p; + + hlen = (pip->ip_hl + tc->th_off) << 2; + dlen = ntohs(pip->ip_len) - hlen; + +/* Modify first packet that has data in it */ + + if (dlen == 0) + return; + + p = (char *)pip; + p += hlen; + + bcopy(p, p + slen, dlen); + memcpy(p, buffer, slen); + } + +/* Save information about modfied sequence number */ + { + int delta; + + SetAckModified(lnk); + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaSeqOut(tc->th_seq, lnk); + AddSeq(lnk, delta + slen, pip->ip_hl, pip->ip_len, tc->th_seq, + tc->th_off); + } + +/* Update IP header packet length and checksum */ + { + int accumulate; + + accumulate = pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + slen); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } + +/* Update TCP checksum, Use TcpChecksum since so many things have + already changed. */ + + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif +} + +static void +ProxyEncodeIpHeader(struct ip *pip, + int maxpacketsize) +{ +#define OPTION_LEN_BYTES 8 +#define OPTION_LEN_INT16 4 +#define OPTION_LEN_INT32 2 + u_char option[OPTION_LEN_BYTES]; + +#ifdef LIBALIAS_DEBUG + fprintf(stdout, " ip cksum 1 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 1 = %x\n", (u_int) TcpChecksum(pip)); +#endif + + (void)maxpacketsize; + +/* Check to see that there is room to add an IP option */ + if (pip->ip_hl > (0x0f - OPTION_LEN_INT32)) + return; + +/* Build option and copy into packet */ + { + u_char *ptr; + struct tcphdr *tc; + + ptr = (u_char *) pip; + ptr += 20; + memcpy(ptr + OPTION_LEN_BYTES, ptr, ntohs(pip->ip_len) - 20); + + option[0] = 0x64; /* class: 3 (reserved), option 4 */ + option[1] = OPTION_LEN_BYTES; + + memcpy(&option[2], (u_char *) & pip->ip_dst, 4); + + tc = (struct tcphdr *)ip_next(pip); + memcpy(&option[6], (u_char *) & tc->th_sport, 2); + + memcpy(ptr, option, 8); + } + +/* Update checksum, header length and packet length */ + { + int i; + int accumulate; + u_short *sptr; + + sptr = (u_short *) option; + accumulate = 0; + for (i = 0; i < OPTION_LEN_INT16; i++) + accumulate -= *(sptr++); + + sptr = (u_short *) pip; + accumulate += *sptr; + pip->ip_hl += OPTION_LEN_INT32; + accumulate -= *sptr; + + accumulate += pip->ip_len; + pip->ip_len = htons(ntohs(pip->ip_len) + OPTION_LEN_BYTES); + accumulate -= pip->ip_len; + + ADJUST_CHECKSUM(accumulate, pip->ip_sum); + } +#undef OPTION_LEN_BYTES +#undef OPTION_LEN_INT16 +#undef OPTION_LEN_INT32 +#ifdef LIBALIAS_DEBUG + fprintf(stdout, " ip cksum 2 = %x\n", (u_int) IpChecksum(pip)); + fprintf(stdout, "tcp cksum 2 = %x\n", (u_int) TcpChecksum(pip)); +#endif +} + + +/* Functions by other packet alias source files + + ProxyCheck() -- Checks whether an outgoing packet should + be proxied. + ProxyModify() -- Encodes the original destination address/port + for a packet which is to be redirected to + a proxy server. 
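+
+   ProxyCheck() returns the matched rule's proxy_type (or 0 when no
+   rule applies) and supplies the proxy server's address and port to
+   the caller.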
+*/
+
+int
+ProxyCheck(struct libalias *la, struct in_addr *proxy_server_addr,
+    u_short * proxy_server_port, struct in_addr src_addr,
+    struct in_addr dst_addr, u_short dst_port, u_char ip_p)
+{
+	struct proxy_entry *ptr;
+
+	LIBALIAS_LOCK_ASSERT(la);
+
+	ptr = la->proxyList;
+	while (ptr != NULL) {
+		u_short proxy_port;
+
+		proxy_port = ptr->proxy_port;
+		if ((dst_port == proxy_port || proxy_port == 0)
+		    && ip_p == ptr->proto
+		    && src_addr.s_addr != ptr->server_addr.s_addr) {
+			struct in_addr src_addr_masked;
+			struct in_addr dst_addr_masked;
+
+			src_addr_masked.s_addr = src_addr.s_addr & ptr->src_mask.s_addr;
+			dst_addr_masked.s_addr = dst_addr.s_addr & ptr->dst_mask.s_addr;
+
+			if ((src_addr_masked.s_addr == ptr->src_addr.s_addr)
+			    && (dst_addr_masked.s_addr == ptr->dst_addr.s_addr)) {
+				if ((*proxy_server_port = ptr->server_port) == 0)
+					*proxy_server_port = dst_port;
+				*proxy_server_addr = ptr->server_addr;
+				return (ptr->proxy_type);
+			}
+		}
+		ptr = ptr->next;
+	}
+
+	return (0);
+}
+
+void
+ProxyModify(struct libalias *la, struct alias_link *lnk,
+    struct ip *pip,
+    int maxpacketsize,
+    int proxy_type)
+{
+
+	LIBALIAS_LOCK_ASSERT(la);
+	(void)la;
+
+	switch (proxy_type) {
+	case PROXY_TYPE_ENCODE_IPHDR:
+		ProxyEncodeIpHeader(pip, maxpacketsize);
+		break;
+
+	case PROXY_TYPE_ENCODE_TCPSTREAM:
+		ProxyEncodeTcpStream(lnk, pip, maxpacketsize);
+		break;
+	}
+}
+
+
+/*
+   Public API functions
+*/
+
+int
+LibAliasProxyRule(struct libalias *la, const char *cmd)
+{
+/*
+ * This function takes command strings of the form:
+ *
+ *   server <addr>[:<port>]
+ *   [port <port>]
+ *   [rule n]
+ *   [proto tcp|udp]
+ *   [src <addr>[/n]]
+ *   [dst <addr>[/n]]
+ *   [type encode_tcp_stream|encode_ip_hdr|no_encode]
+ *
+ *   delete <rule number>
+ *
+ * Subfields can be in arbitrary order.  Port numbers and addresses
+ * must be in either numeric or symbolic form. An optional rule number
+ * is used to control the order in which rules are searched.  If two
+ * rules have the same number, then search order cannot be guaranteed,
+ * and the rules should be disjoint.  If no rule number is specified,
+ * then 0 is used, and group 0 rules are always checked before any
+ * others.
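+ *
+ * For example (illustrative values only): the string
+ * "server 10.0.0.9:3128 port 80 proto tcp" redirects outbound
+ * HTTP (destination port 80) to a proxy server at 10.0.0.9:3128.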
+ */ + int i, n, len, ret; + int cmd_len; + int token_count; + int state; + char *token; + char buffer[256]; + char str_port[sizeof(buffer)]; + char str_server_port[sizeof(buffer)]; + char *res = buffer; + + int rule_index; + int proto; + int proxy_type; + int proxy_port; + int server_port; + struct in_addr server_addr; + struct in_addr src_addr, src_mask; + struct in_addr dst_addr, dst_mask; + struct proxy_entry *proxy_entry; + + LIBALIAS_LOCK(la); + ret = 0; +/* Copy command line into a buffer */ + cmd += strspn(cmd, " \t"); + cmd_len = strlen(cmd); + if (cmd_len > (int)(sizeof(buffer) - 1)) { + ret = -1; + goto getout; + } + strcpy(buffer, cmd); + +/* Convert to lower case */ + len = strlen(buffer); + for (i = 0; i < len; i++) + buffer[i] = tolower((unsigned char)buffer[i]); + +/* Set default proxy type */ + +/* Set up default values */ + rule_index = 0; + proxy_type = PROXY_TYPE_ENCODE_NONE; + proto = IPPROTO_TCP; + proxy_port = 0; + server_addr.s_addr = 0; + server_port = 0; + src_addr.s_addr = 0; + IpMask(0, &src_mask); + dst_addr.s_addr = 0; + IpMask(0, &dst_mask); + + str_port[0] = 0; + str_server_port[0] = 0; + +/* Parse command string with state machine */ +#define STATE_READ_KEYWORD 0 +#define STATE_READ_TYPE 1 +#define STATE_READ_PORT 2 +#define STATE_READ_SERVER 3 +#define STATE_READ_RULE 4 +#define STATE_READ_DELETE 5 +#define STATE_READ_PROTO 6 +#define STATE_READ_SRC 7 +#define STATE_READ_DST 8 + state = STATE_READ_KEYWORD; + token = strsep(&res, " \t"); + token_count = 0; + while (token != NULL) { + token_count++; + switch (state) { + case STATE_READ_KEYWORD: + if (strcmp(token, "type") == 0) + state = STATE_READ_TYPE; + else if (strcmp(token, "port") == 0) + state = STATE_READ_PORT; + else if (strcmp(token, "server") == 0) + state = STATE_READ_SERVER; + else if (strcmp(token, "rule") == 0) + state = STATE_READ_RULE; + else if (strcmp(token, "delete") == 0) + state = STATE_READ_DELETE; + else if (strcmp(token, "proto") == 0) + state = STATE_READ_PROTO; + else if (strcmp(token, "src") == 0) + state = STATE_READ_SRC; + else if (strcmp(token, "dst") == 0) + state = STATE_READ_DST; + else { + ret = -1; + goto getout; + } + break; + + case STATE_READ_TYPE: + if (strcmp(token, "encode_ip_hdr") == 0) + proxy_type = PROXY_TYPE_ENCODE_IPHDR; + else if (strcmp(token, "encode_tcp_stream") == 0) + proxy_type = PROXY_TYPE_ENCODE_TCPSTREAM; + else if (strcmp(token, "no_encode") == 0) + proxy_type = PROXY_TYPE_ENCODE_NONE; + else { + ret = -1; + goto getout; + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_PORT: + strcpy(str_port, token); + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SERVER: + { + int err; + char *p; + char s[sizeof(buffer)]; + + p = token; + while (*p != ':' && *p != 0) + p++; + + if (*p != ':') { + err = IpAddr(token, &server_addr); + if (err) { + ret = -1; + goto getout; + } + } else { + *p = ' '; + + n = sscanf(token, "%s %s", s, str_server_port); + if (n != 2) { + ret = -1; + goto getout; + } + + err = IpAddr(s, &server_addr); + if (err) { + ret = -1; + goto getout; + } + } + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_RULE: + n = sscanf(token, "%d", &rule_index); + if (n != 1 || rule_index < 0) { + ret = -1; + goto getout; + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_DELETE: + { + int err; + int rule_to_delete; + + if (token_count != 2) { + ret = -1; + goto getout; + } + + n = sscanf(token, "%d", &rule_to_delete); + if (n != 1) { + ret = -1; + goto getout; + } + err = RuleNumberDelete(la, 
rule_to_delete); + if (err) + ret = -1; + ret = 0; + goto getout; + } + + case STATE_READ_PROTO: + if (strcmp(token, "tcp") == 0) + proto = IPPROTO_TCP; + else if (strcmp(token, "udp") == 0) + proto = IPPROTO_UDP; + else { + ret = -1; + goto getout; + } + state = STATE_READ_KEYWORD; + break; + + case STATE_READ_SRC: + case STATE_READ_DST: + { + int err; + char *p; + struct in_addr mask; + struct in_addr addr; + + p = token; + while (*p != '/' && *p != 0) + p++; + + if (*p != '/') { + IpMask(32, &mask); + err = IpAddr(token, &addr); + if (err) { + ret = -1; + goto getout; + } + } else { + int nbits; + char s[sizeof(buffer)]; + + *p = ' '; + n = sscanf(token, "%s %d", s, &nbits); + if (n != 2) { + ret = -1; + goto getout; + } + + err = IpAddr(s, &addr); + if (err) { + ret = -1; + goto getout; + } + + err = IpMask(nbits, &mask); + if (err) { + ret = -1; + goto getout; + } + } + + if (state == STATE_READ_SRC) { + src_addr = addr; + src_mask = mask; + } else { + dst_addr = addr; + dst_mask = mask; + } + } + state = STATE_READ_KEYWORD; + break; + + default: + ret = -1; + goto getout; + break; + } + + do { + token = strsep(&res, " \t"); + } while (token != NULL && !*token); + } +#undef STATE_READ_KEYWORD +#undef STATE_READ_TYPE +#undef STATE_READ_PORT +#undef STATE_READ_SERVER +#undef STATE_READ_RULE +#undef STATE_READ_DELETE +#undef STATE_READ_PROTO +#undef STATE_READ_SRC +#undef STATE_READ_DST + +/* Convert port strings to numbers. This needs to be done after + the string is parsed, because the prototype might not be designated + before the ports (which might be symbolic entries in /etc/services) */ + + if (strlen(str_port) != 0) { + int err; + + err = IpPort(str_port, proto, &proxy_port); + if (err) { + ret = -1; + goto getout; + } + } else { + proxy_port = 0; + } + + if (strlen(str_server_port) != 0) { + int err; + + err = IpPort(str_server_port, proto, &server_port); + if (err) { + ret = -1; + goto getout; + } + } else { + server_port = 0; + } + +/* Check that at least the server address has been defined */ + if (server_addr.s_addr == 0) { + ret = -1; + goto getout; + } + +/* Add to linked list */ + proxy_entry = malloc(sizeof(struct proxy_entry)); + if (proxy_entry == NULL) { + ret = -1; + goto getout; + } + + proxy_entry->proxy_type = proxy_type; + proxy_entry->rule_index = rule_index; + proxy_entry->proto = proto; + proxy_entry->proxy_port = htons(proxy_port); + proxy_entry->server_port = htons(server_port); + proxy_entry->server_addr = server_addr; + proxy_entry->src_addr.s_addr = src_addr.s_addr & src_mask.s_addr; + proxy_entry->dst_addr.s_addr = dst_addr.s_addr & dst_mask.s_addr; + proxy_entry->src_mask = src_mask; + proxy_entry->dst_mask = dst_mask; + + RuleAdd(la, proxy_entry); + +getout: + LIBALIAS_UNLOCK(la); + return (ret); +} diff --git a/freebsd/sys/netinet/libalias/alias_sctp.c b/freebsd/sys/netinet/libalias/alias_sctp.c new file mode 100644 index 00000000..cdec258c --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_sctp.c @@ -0,0 +1,2700 @@ +#include + +/*- + * Copyright (c) 2008 + * Swinburne University of Technology, Melbourne, Australia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Alias_sctp forms part of the libalias kernel module to handle + * Network Address Translation (NAT) for the SCTP protocol. + * + * This software was developed by David A. Hayes and Jason But + * + * The design is outlined in CAIA technical report number 080618A + * (D. Hayes and J. But, "Alias_sctp Version 0.1: SCTP NAT implementation in IPFW") + * + * Development is part of the CAIA SONATA project, + * proposed by Jason But and Grenville Armitage: + * http://caia.swin.edu.au/urp/sonata/ + * + * + * This project has been made possible in part by a grant from + * the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. + * + */ +/** @mainpage + * Alias_sctp is part of the SONATA (http://caia.swin.edu.au/urp/sonata) project + * to develop and release a BSD licensed implementation of a Network Address + * Translation (NAT) module that supports the Stream Control Transmission + * Protocol (SCTP). + * + * Traditional address and port number look ups are inadequate for SCTP's + * operation due to both processing requirements and issues with multi-homing. + * Alias_sctp integrates with FreeBSD's ipfw/libalias NAT system. + * + * Version 0.2 features include: + * - Support for global multi-homing + * - Support for ASCONF modification from Internet Draft + * (draft-stewart-behave-sctpnat-04, R. Stewart and M. Tuexen, "Stream control + * transmission protocol (SCTP) network address translation," Jul. 
2008) to + * provide support for multi-homed privately addressed hosts + * - Support for forwarding of T-flagged packets + * - Generation and delivery of AbortM/ErrorM packets upon detection of NAT + * collisions + * - Per-port forwarding rules + * - Dynamically controllable logging and statistics + * - Dynamic management of timers + * - Dynamic control of hash-table size + */ + +/* $FreeBSD$ */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif //#ifdef _KERNEL + +/* ---------------------------------------------------------------------- + * FUNCTION PROTOTYPES + * ---------------------------------------------------------------------- + */ +/* Packet Parsing Functions */ +static int sctp_PktParser(struct libalias *la, int direction, struct ip *pip, + struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc); +static int GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm, + uint32_t *l_vtag, uint32_t *g_vtag, int direction); +static int IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction); + +static void AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction); +static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr); +static void RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction); +static int IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction); + +/* State Machine Functions */ +static int ProcessSctpMsg(struct libalias *la, int direction, \ + struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc); + +static int ID_process(struct libalias *la, int direction,\ + struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); +static int INi_process(struct libalias *la, int direction,\ + struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); +static int INa_process(struct libalias *la, int direction,\ + struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); +static int UP_process(struct libalias *la, int direction,\ + struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); +static int CL_process(struct libalias *la, int direction,\ + struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm); +static void TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm,\ + struct sctp_nat_assoc *assoc, int sndrply, int direction); + +/* Hash Table Functions */ +static struct sctp_nat_assoc* +FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port); +static struct sctp_nat_assoc* +FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match); +static struct sctp_nat_assoc* +FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc); +static struct sctp_nat_assoc* +FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port); +static struct sctp_nat_assoc* +FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port); + +static int AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr); +static int AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc); +static void RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc); +static void freeGlobalAddressList(struct sctp_nat_assoc *assoc); + 
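
For orientation, the following is a minimal, hypothetical userland sketch (not part of this patch) of how a caller reaches the SCTP handlers prototyped above through libalias's public API. The helper names nat_setup() and nat_outgoing() are invented for illustration; LibAliasInit(), LibAliasSetAddress() and LibAliasOut() are the real public entry points, and LibAliasOut() should dispatch an SCTP packet to SctpAlias() with direction SN_TO_GLOBAL.

#include <sys/types.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include "alias.h"		/* public libalias API */

static struct libalias *la;

/* Create a NAT instance and set the public (alias) address. */
static void
nat_setup(const char *alias_ip)
{
	struct in_addr addr;

	la = LibAliasInit(NULL);
	inet_aton(alias_ip, &addr);
	LibAliasSetAddress(la, addr);
}

/* NAT one outbound packet in place; pkt points at the IP header. */
static int
nat_outgoing(char *pkt, int maxlen)
{
	/* For SCTP this is expected to take the
	 * SctpAlias(..., SN_TO_GLOBAL) path. */
	return (LibAliasOut(la, pkt, maxlen));
}
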
+/* Timer Queue Functions */ +static void sctp_AddTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc); +static void sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc); +static void sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp); +void sctp_CheckTimers(struct libalias *la); + + +/* Logging Functions */ +static void logsctperror(char* errormsg, uint32_t vtag, int error, int direction); +static void logsctpparse(int direction, struct sctp_nat_msg *sm); +static void logsctpassoc(struct sctp_nat_assoc *assoc, char *s); +static void logTimerQ(struct libalias *la); +static void logSctpGlobal(struct libalias *la); +static void logSctpLocal(struct libalias *la); +#ifdef _KERNEL +static void SctpAliasLog(const char *format, ...); +#endif + +/** @defgroup external External code changes and modifications + * + * Some changes have been made to files external to alias_sctp.(c|h). These + * changes are primarily due to code needing to call static functions within + * those files or to perform extra functionality that can only be performed + * within these files. + */ +/** @ingroup external + * @brief Log current statistics for the libalias instance + * + * This function is defined in alias_db.c, since it calls static functions in + * this file + * + * Calls the higher level ShowAliasStats() in alias_db.c which logs all current + * statistics about the libalias instance - including SCTP statistics + * + * @param la Pointer to the libalias instance + */ +void SctpShowAliasStats(struct libalias *la); + +#ifdef _KERNEL + +MALLOC_DEFINE(M_SCTPNAT, "sctpnat", "sctp nat dbs"); +/* Use kernel allocator. */ +#ifdef _SYS_MALLOC_HH_ +#define sn_malloc(x) malloc(x, M_SCTPNAT, M_NOWAIT|M_ZERO) +#define sn_calloc(n,x) sn_malloc(x * n) +#define sn_free(x) free(x, M_SCTPNAT) +#endif// #ifdef _SYS_MALLOC_HH_ + +#else //#ifdef _KERNEL +#define sn_malloc(x) malloc(x) +#define sn_calloc(n, x) calloc(n, x) +#define sn_free(x) free(x) + +#endif //#ifdef _KERNEL + +/** @defgroup packet_parser SCTP Packet Parsing + * + * Macros to: + * - Return pointers to the first and next SCTP chunks within an SCTP Packet + * - Define possible return values of the packet parsing process + * - SCTP message types for storing in the sctp_nat_msg structure @{ + */ + +#define SN_SCTP_FIRSTCHUNK(sctphead) (struct sctp_chunkhdr *)(((char *)sctphead) + sizeof(struct sctphdr)) +/**< Returns a pointer to the first chunk in an SCTP packet given a pointer to the SCTP header */ + +#define SN_SCTP_NEXTCHUNK(chunkhead) (struct sctp_chunkhdr *)(((char *)chunkhead) + SCTP_SIZE32(ntohs(chunkhead->chunk_length))) +/**< Returns a pointer to the next chunk in an SCTP packet given a pointer to the current chunk */ + +#define SN_SCTP_NEXTPARAM(param) (struct sctp_paramhdr *)(((char *)param) + SCTP_SIZE32(ntohs(param->param_length))) +/**< Returns a pointer to the next parameter in an SCTP packet given a pointer to the current parameter */ + +#define SN_MIN_CHUNK_SIZE 4 /**< Smallest possible SCTP chunk size in bytes */ +#define SN_MIN_PARAM_SIZE 4 /**< Smallest possible SCTP param size in bytes */ +#define SN_VTAG_PARAM_SIZE 12 /**< Size of SCTP ASCONF vtag param in bytes */ +#define SN_ASCONFACK_PARAM_SIZE 8 /**< Size of SCTP ASCONF ACK param in bytes */ + +/* Packet parsing return codes */ +#define SN_PARSE_OK 0 /**< Packet parsed for SCTP messages */ +#define SN_PARSE_ERROR_IPSHL 1 /**< Packet parsing error - IP and SCTP common header len */ +#define SN_PARSE_ERROR_AS_MALLOC 2 /**< Packet parsing error - 
assoc malloc */ +#define SN_PARSE_ERROR_CHHL 3 /**< Packet parsing error - Chunk header len */ +#define SN_PARSE_ERROR_DIR 4 /**< Packet parsing error - Direction */ +#define SN_PARSE_ERROR_VTAG 5 /**< Packet parsing error - Vtag */ +#define SN_PARSE_ERROR_CHUNK 6 /**< Packet parsing error - Chunk */ +#define SN_PARSE_ERROR_PORT 7 /**< Packet parsing error - Port=0 */ +#define SN_PARSE_ERROR_LOOKUP 8 /**< Packet parsing error - Lookup */ +#define SN_PARSE_ERROR_PARTIALLOOKUP 9 /**< Packet parsing error - partial lookup only found */ +#define SN_PARSE_ERROR_LOOKUP_ABORT 10 /**< Packet parsing error - Lookup - but abort packet */ + +/* Alias_sctp performs its processing based on a number of key messages */ +#define SN_SCTP_ABORT 0x0000 /**< a packet containing an ABORT chunk */ +#define SN_SCTP_INIT 0x0001 /**< a packet containing an INIT chunk */ +#define SN_SCTP_INITACK 0x0002 /**< a packet containing an INIT-ACK chunk */ +#define SN_SCTP_SHUTCOMP 0x0010 /**< a packet containing a SHUTDOWN-COMPLETE chunk */ +#define SN_SCTP_SHUTACK 0x0020 /**< a packet containing a SHUTDOWN-ACK chunk */ +#define SN_SCTP_ASCONF 0x0100 /**< a packet containing an ASCONF chunk */ +#define SN_SCTP_ASCONFACK 0x0200 /**< a packet containing an ASCONF-ACK chunk */ +#define SN_SCTP_OTHER 0xFFFF /**< a packet containing a chunk that is not of interest */ + +/** @} + * @defgroup state_machine SCTP NAT State Machine + * + * Defines the various states an association can be within the NAT @{ + */ +#define SN_ID 0x0000 /**< Idle state */ +#define SN_INi 0x0010 /**< Initialising, waiting for InitAck state */ +#define SN_INa 0x0020 /**< Initialising, waiting for AddIpAck state */ +#define SN_UP 0x0100 /**< Association in UP state */ +#define SN_CL 0x1000 /**< Closing state */ +#define SN_RM 0x2000 /**< Removing state */ + +/** @} + * @defgroup Logging Logging Functionality + * + * Define various log levels and a macro to call specified log functions only if + * the current log level (sysctl_log_level) matches the specified level @{ + */ +#define SN_LOG_LOW 0 +#define SN_LOG_EVENT 1 +#define SN_LOG_INFO 2 +#define SN_LOG_DETAIL 3 +#define SN_LOG_DEBUG 4 +#define SN_LOG_DEBUG_MAX 5 + +#define SN_LOG(level, action) if (sysctl_log_level >= level) { action; } /**< Perform log action ONLY if the current log level meets the specified log level */ + +/** @} + * @defgroup Hash Hash Table Macros and Functions + * + * Defines minimum/maximum/default values for the hash table size @{ + */ +#define SN_MIN_HASH_SIZE 101 /**< Minimum hash table size (set to stop users choosing stupid values) */ +#define SN_MAX_HASH_SIZE 1000001 /**< Maximum hash table size (NB must be less than max int) */ +#define SN_DEFAULT_HASH_SIZE 2003 /**< A reasonable default size for the hash tables */ + +#define SN_LOCAL_TBL 0x01 /**< assoc in local table */ +#define SN_GLOBAL_TBL 0x02 /**< assoc in global table */ +#define SN_BOTH_TBL 0x03 /**< assoc in both tables */ +#define SN_WAIT_TOLOCAL 0x10 /**< assoc waiting for TOLOCAL asconf ACK*/ +#define SN_WAIT_TOGLOBAL 0x20 /**< assoc waiting for TOLOCAL asconf ACK*/ +#define SN_NULL_TBL 0x00 /**< assoc in No table */ +#define SN_MAX_GLOBAL_ADDRESSES 100 /**< absolute maximum global address count*/ + +#define SN_ADD_OK 0 /**< Association added to the table */ +#define SN_ADD_CLASH 1 /**< Clash when trying to add the assoc. 
info to the table */ + +#define SN_TABLE_HASH(vtag, port, size) (((u_int) vtag + (u_int) port) % (u_int) size) /**< Calculate the hash table lookup position */ + +/** @} + * @defgroup Timer Timer Queue Macros and Functions + * + * Timer macros set minimum/maximum timeout values and calculate timer expiry + * times for the provided libalias instance @{ + */ +#define SN_MIN_TIMER 1 +#define SN_MAX_TIMER 600 +#define SN_TIMER_QUEUE_SIZE SN_MAX_TIMER+2 + +#define SN_I_T(la) (la->timeStamp + sysctl_init_timer) /**< INIT State expiration time in seconds */ +#define SN_U_T(la) (la->timeStamp + sysctl_up_timer) /**< UP State expiration time in seconds */ +#define SN_C_T(la) (la->timeStamp + sysctl_shutdown_timer) /**< CL State expiration time in seconds */ +#define SN_X_T(la) (la->timeStamp + sysctl_holddown_timer) /**< Wait after a shutdown complete in seconds */ + +/** @} + * @defgroup sysctl SysCtl Variable and callback function declarations + * + * Sysctl variables to modify NAT functionality in real-time along with associated functions + * to manage modifications to the sysctl variables @{ + */ + +/* Callbacks */ +int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS); +int sysctl_chg_timer(SYSCTL_HANDLER_ARGS); +int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS); +int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS); +int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS); +int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS); +int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS); +int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS); +int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS); + +/* Sysctl variables */ +/** @brief net.inet.ip.alias.sctp.log_level */ +static u_int sysctl_log_level = 0; /**< Stores the current level of logging */ +/** @brief net.inet.ip.alias.sctp.init_timer */ +static u_int sysctl_init_timer = 15; /**< Seconds to hold an association in the table waiting for an INIT-ACK or AddIP-ACK */ +/** @brief net.inet.ip.alias.sctp.up_timer */ +static u_int sysctl_up_timer = 300; /**< Seconds to hold an association in the table while no packets are transmitted */ +/** @brief net.inet.ip.alias.sctp.shutdown_timer */ +static u_int sysctl_shutdown_timer = 15; /**< Seconds to hold an association in the table waiting for a SHUTDOWN-COMPLETE */ +/** @brief net.inet.ip.alias.sctp.holddown_timer */ +static u_int sysctl_holddown_timer = 0; /**< Seconds to hold an association in the table after it has been shutdown (to allow for lost SHUTDOWN-COMPLETEs) */ +/** @brief net.inet.ip.alias.sctp.hashtable_size */ +static u_int sysctl_hashtable_size = SN_DEFAULT_HASH_SIZE; /**< Sets the hash table size for any NEW NAT instances (existing instances retain their existing Hash Table */ +/** @brief net.inet.ip.alias.sctp.error_on_ootb */ +static u_int sysctl_error_on_ootb = 1; /**< NAT response to receipt of OOTB packet + (0 - No response, 1 - NAT will send ErrorM only to local side, + 2 - NAT will send local ErrorM and global ErrorM if there was a partial association match + 3 - NAT will send ErrorM to both local and global) */ +/** @brief net.inet.ip.alias.sctp.accept_global_ootb_addip */ +static u_int sysctl_accept_global_ootb_addip = 0; /** 0 - enables tracking but limits the number of global IP addresses to this value) + If set to >=1 the NAT will track that many global IP addresses. 
This may reduce look up table conflicts, but increases processing */ + +#define SN_NO_ERROR_ON_OOTB 0 /**< Send no errorM on out of the blue packets */ +#define SN_LOCAL_ERROR_ON_OOTB 1 /**< Send only local errorM on out of the blue packets */ +#define SN_LOCALandPARTIAL_ERROR_ON_OOTB 2 /**< Send local errorM and global errorM for out of the blue packets only if partial match found */ +#define SN_ERROR_ON_OOTB 3 /**< Send errorM on out of the blue packets */ + +#ifdef SYSCTL_NODE + +SYSCTL_DECL(_net_inet); +SYSCTL_DECL(_net_inet_ip); +SYSCTL_DECL(_net_inet_ip_alias); + +SYSCTL_NODE(_net_inet_ip_alias, OID_AUTO, sctp, CTLFLAG_RW, NULL, "SCTP NAT"); + +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, log_level, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_log_level, 0, sysctl_chg_loglevel, "IU", + "Level of detail (0 - default, 1 - event, 2 - info, 3 - detail, 4 - debug, 5 - max debug)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, init_timer, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_init_timer, 0, sysctl_chg_timer, "IU", + "Timeout value (s) while waiting for (INIT-ACK|AddIP-ACK)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, up_timer, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_up_timer, 0, sysctl_chg_timer, "IU", + "Timeout value (s) to keep an association up with no traffic"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, shutdown_timer, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_shutdown_timer, 0, sysctl_chg_timer, "IU", + "Timeout value (s) while waiting for SHUTDOWN-COMPLETE"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, holddown_timer, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_holddown_timer, 0, sysctl_chg_timer, "IU", + "Hold association in table for this many seconds after receiving a SHUTDOWN-COMPLETE"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, hashtable_size, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_hashtable_size, 0, sysctl_chg_hashtable_size, "IU", + "Size of hash tables used for NAT lookups (100 < prime_number > 1000001)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, error_on_ootb, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_error_on_ootb, 0, sysctl_chg_error_on_ootb, "IU", + "ErrorM sent on receipt of ootb packet:\n\t0 - none,\n\t1 - to local only,\n\t2 - to local and global if a partial association match,\n\t3 - to local and global (DoS risk)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, accept_global_ootb_addip, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_accept_global_ootb_addip, 0, sysctl_chg_accept_global_ootb_addip, "IU", + "NAT response to receipt of global OOTB AddIP:\n\t0 - No response,\n\t1 - NAT will accept OOTB global AddIP messages for processing (Security risk)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, initialising_chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_initialising_chunk_proc_limit, 0, sysctl_chg_initialising_chunk_proc_limit, "IU", + "Number of chunks that should be processed if there is no current association found:\n\t > 0 (A high value is a DoS risk)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, chunk_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_chunk_proc_limit, 0, sysctl_chg_chunk_proc_limit, "IU", + "Number of chunks that should be processed to find key chunk:\n\t>= initialising_chunk_proc_limit (A high value is a DoS risk)"); +SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, param_proc_limit, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_param_proc_limit, 0, sysctl_chg_param_proc_limit, "IU", + "Number of parameters (in a chunk) that should be processed to find key parameters:\n\t> 1 (A high value is a DoS risk)"); 
+SYSCTL_PROC(_net_inet_ip_alias_sctp, OID_AUTO, track_global_addresses, CTLTYPE_UINT | CTLFLAG_RW, + &sysctl_track_global_addresses, 0, sysctl_chg_track_global_addresses, "IU", + "Configures the global address tracking option within the NAT:\n\t0 - Global tracking is disabled,\n\t> 0 - enables tracking but limits the number of global IP addresses to this value"); + +#endif /* SYSCTL_NODE */ + +/** @} + * @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.fw.sctp.log_level + * + * Updates the variable sysctl_log_level to the provided value and ensures + * it is in the valid range (SN_LOG_LOW -> SN_LOG_DEBUG) + */ +int sysctl_chg_loglevel(SYSCTL_HANDLER_ARGS) +{ + u_int level = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &level, 0, req); + if (error) return (error); + + sysctl_log_level = (level > SN_LOG_DEBUG_MAX)?(SN_LOG_DEBUG_MAX):(level); + sysctl_log_level = (level < SN_LOG_LOW)?(SN_LOG_LOW):(level); + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.fw.sctp.(init_timer|up_timer|shutdown_timer) + * + * Updates the timer-based sysctl variables. The new values are sanity-checked + * to make sure that they are within the range SN_MIN_TIMER-SN_MAX_TIMER. The + * holddown timer is allowed to be 0 + */ +int sysctl_chg_timer(SYSCTL_HANDLER_ARGS) +{ + u_int timer = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &timer, 0, req); + if (error) return (error); + + timer = (timer > SN_MAX_TIMER)?(SN_MAX_TIMER):(timer); + + if (((u_int *)arg1) != &sysctl_holddown_timer) + { + timer = (timer < SN_MIN_TIMER)?(SN_MIN_TIMER):(timer); + } + + *(u_int *)arg1 = timer; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.hashtable_size + * + * Updates the hashtable_size sysctl variable. The new value should be a prime + * number. We sanity check to ensure that the size is within the range + * SN_MIN_HASH_SIZE-SN_MAX_HASH_SIZE. We then check the provided number to see + * if it is prime. We approximate by checking that (2,3,5,7,11) are not factors, + * incrementing the user provided value until we find a suitable number. + */ +int sysctl_chg_hashtable_size(SYSCTL_HANDLER_ARGS) +{ + u_int size = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &size, 0, req); + if (error) return (error); + + size = (size < SN_MIN_HASH_SIZE)?(SN_MIN_HASH_SIZE):((size > SN_MAX_HASH_SIZE)?(SN_MAX_HASH_SIZE):(size)); + + size |= 0x00000001; /* make odd */ + + for(;(((size % 3) == 0) || ((size % 5) == 0) || ((size % 7) == 0) || ((size % 11) == 0)); size+=2); + sysctl_hashtable_size = size; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.error_on_ootb + * + * Updates the error_on_clash sysctl variable. + * If set to 0, no ErrorM will be sent if there is a look up table clash + * If set to 1, an ErrorM is sent only to the local side + * If set to 2, an ErrorM is sent to the local side and global side if there is + * a partial association match + * If set to 3, an ErrorM is sent to both local and global sides (DoS) risk. + */ +int sysctl_chg_error_on_ootb(SYSCTL_HANDLER_ARGS) +{ + u_int flag = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &flag, 0, req); + if (error) return (error); + + sysctl_error_on_ootb = (flag > SN_ERROR_ON_OOTB) ? 
SN_ERROR_ON_OOTB: flag; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.accept_global_ootb_addip + * + * If set to 1 the NAT will accept ootb global addip messages for processing (Security risk) + * Default is 0, only responding to local ootb AddIP messages + */ +int sysctl_chg_accept_global_ootb_addip(SYSCTL_HANDLER_ARGS) +{ + u_int flag = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &flag, 0, req); + if (error) return (error); + + sysctl_accept_global_ootb_addip = (flag == 1) ? 1: 0; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.initialising_chunk_proc_limit + * + * Updates the initialising_chunk_proc_limit sysctl variable. Number of chunks + * that should be processed if there is no current association found: > 0 (A + * high value is a DoS risk) + */ +int sysctl_chg_initialising_chunk_proc_limit(SYSCTL_HANDLER_ARGS) +{ + u_int proclimit = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &proclimit, 0, req); + if (error) return (error); + + sysctl_initialising_chunk_proc_limit = (proclimit < 1) ? 1: proclimit; + sysctl_chunk_proc_limit = + (sysctl_chunk_proc_limit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : sysctl_chunk_proc_limit; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.chunk_proc_limit + * + * Updates the chunk_proc_limit sysctl variable. + * Number of chunks that should be processed to find key chunk: + * >= initialising_chunk_proc_limit (A high value is a DoS risk) + */ +int sysctl_chg_chunk_proc_limit(SYSCTL_HANDLER_ARGS) +{ + u_int proclimit = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &proclimit, 0, req); + if (error) return (error); + + sysctl_chunk_proc_limit = + (proclimit < sysctl_initialising_chunk_proc_limit) ? sysctl_initialising_chunk_proc_limit : proclimit; + + return (0); +} + + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.param_proc_limit + * + * Updates the param_proc_limit sysctl variable. + * Number of parameters that should be processed to find key parameters: + * > 1 (A high value is a DoS risk) + */ +int sysctl_chg_param_proc_limit(SYSCTL_HANDLER_ARGS) +{ + u_int proclimit = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &proclimit, 0, req); + if (error) return (error); + + sysctl_param_proc_limit = + (proclimit < 2) ? 2 : proclimit; + + return (0); +} + +/** @ingroup sysctl + * @brief sysctl callback for changing net.inet.ip.alias.sctp.track_global_addresses + * + *Configures the global address tracking option within the NAT (0 - Global + *tracking is disabled, > 0 - enables tracking but limits the number of global + *IP addresses to this value) + */ +int sysctl_chg_track_global_addresses(SYSCTL_HANDLER_ARGS) +{ + u_int num_to_track = *(u_int *)arg1; + int error; + + error = sysctl_handle_int(oidp, &num_to_track, 0, req); + if (error) return (error); + + sysctl_track_global_addresses = (num_to_track > SN_MAX_GLOBAL_ADDRESSES) ? 
SN_MAX_GLOBAL_ADDRESSES : num_to_track; + + return (0); +} + + +/* ---------------------------------------------------------------------- + * CODE BEGINS HERE + * ---------------------------------------------------------------------- + */ +/** + * @brief Initialises the SCTP NAT Implementation + * + * Creates the look-up tables and the timer queue and initialises all state + * variables + * + * @param la Pointer to the relevant libalias instance + */ +void AliasSctpInit(struct libalias *la) +{ + /* Initialise association tables*/ + int i; + la->sctpNatTableSize = sysctl_hashtable_size; + SN_LOG(SN_LOG_EVENT, + SctpAliasLog("Initialising SCTP NAT Instance (hash_table_size:%d)\n", la->sctpNatTableSize)); + la->sctpTableLocal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableL)); + la->sctpTableGlobal = sn_calloc(la->sctpNatTableSize, sizeof(struct sctpNatTableG)); + la->sctpNatTimer.TimerQ = sn_calloc(SN_TIMER_QUEUE_SIZE, sizeof(struct sctpTimerQ)); + /* Initialise hash table */ + for (i = 0; i < la->sctpNatTableSize; i++) { + LIST_INIT(&la->sctpTableLocal[i]); + LIST_INIT(&la->sctpTableGlobal[i]); + } + + /* Initialise circular timer Q*/ + for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++) + LIST_INIT(&la->sctpNatTimer.TimerQ[i]); +#ifdef _KERNEL + la->sctpNatTimer.loc_time=time_uptime; /* la->timeStamp is not set yet */ +#else + la->sctpNatTimer.loc_time=la->timeStamp; +#endif + la->sctpNatTimer.cur_loc = 0; + la->sctpLinkCount = 0; +} + +/** + * @brief Cleans-up the SCTP NAT Implementation prior to unloading + * + * Removes all entries from the timer queue, freeing associations as it goes. + * We then free memory allocated to the look-up tables and the time queue + * + * NOTE: We do not need to traverse the look-up tables as each association + * will always have an entry in the timer queue, freeing this memory + * once will free all memory allocated to entries in the look-up tables + * + * @param la Pointer to the relevant libalias instance + */ +void AliasSctpTerm(struct libalias *la) +{ + struct sctp_nat_assoc *assoc1, *assoc2; + int i; + + LIBALIAS_LOCK_ASSERT(la); + SN_LOG(SN_LOG_EVENT, + SctpAliasLog("Removing SCTP NAT Instance\n")); + for (i = 0; i < SN_TIMER_QUEUE_SIZE; i++) { + assoc1 = LIST_FIRST(&la->sctpNatTimer.TimerQ[i]); + while (assoc1 != NULL) { + freeGlobalAddressList(assoc1); + assoc2 = LIST_NEXT(assoc1, timer_Q); + sn_free(assoc1); + assoc1 = assoc2; + } + } + + sn_free(la->sctpTableLocal); + sn_free(la->sctpTableGlobal); + sn_free(la->sctpNatTimer.TimerQ); +} + +/** + * @brief Handles SCTP packets passed from libalias + * + * This function needs to actually NAT/drop packets and possibly create and + * send AbortM or ErrorM packets in response. 
The process involves: + * - Validating the direction parameter passed by the caller + * - Checking and handling any expired timers for the NAT + * - Calling sctp_PktParser() to parse the packet + * - Call ProcessSctpMsg() to decide the appropriate outcome and to update + * the NAT tables + * - Based on the return code either: + * - NAT the packet + * - Construct and send an ErrorM|AbortM packet + * - Mark the association for removal from the tables + * - Potentially remove the association from all lookup tables + * - Return the appropriate result to libalias + * + * @param la Pointer to the relevant libalias instance + * @param pip Pointer to IP packet to process + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + * @return PKT_ALIAS_OK | PKT_ALIAS_IGNORE | PKT_ALIAS_ERROR + */ +int +SctpAlias(struct libalias *la, struct ip *pip, int direction) +{ + int rtnval; + struct sctp_nat_msg msg; + struct sctp_nat_assoc *assoc = NULL; + + if ((direction != SN_TO_LOCAL) && (direction != SN_TO_GLOBAL)) { + SctpAliasLog("ERROR: Invalid direction\n"); + return(PKT_ALIAS_ERROR); + } + + sctp_CheckTimers(la); /* Check timers */ + + /* Parse the packet */ + rtnval = sctp_PktParser(la, direction, pip, &msg, &assoc); //using *char (change to mbuf when get code from paolo) + switch (rtnval) { + case SN_PARSE_OK: + break; + case SN_PARSE_ERROR_CHHL: + /* Not an error if there is a chunk length parsing error and this is a fragmented packet */ + if (ntohs(pip->ip_off) & IP_MF) { + rtnval = SN_PARSE_OK; + break; + } + SN_LOG(SN_LOG_EVENT, + logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); + return(PKT_ALIAS_ERROR); + case SN_PARSE_ERROR_PARTIALLOOKUP: + if (sysctl_error_on_ootb > SN_LOCALandPARTIAL_ERROR_ON_OOTB) { + SN_LOG(SN_LOG_EVENT, + logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); + return(PKT_ALIAS_ERROR); + } + case SN_PARSE_ERROR_LOOKUP: + if (sysctl_error_on_ootb == SN_ERROR_ON_OOTB || + (sysctl_error_on_ootb == SN_LOCALandPARTIAL_ERROR_ON_OOTB && direction == SN_TO_LOCAL) || + (sysctl_error_on_ootb == SN_LOCAL_ERROR_ON_OOTB && direction == SN_TO_GLOBAL)) { + TxAbortErrorM(la, &msg, assoc, SN_REFLECT_ERROR, direction); /*NB assoc=NULL */ + return(PKT_ALIAS_RESPOND); + } + default: + SN_LOG(SN_LOG_EVENT, + logsctperror("SN_PARSE_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); + return(PKT_ALIAS_ERROR); + } + + SN_LOG(SN_LOG_DETAIL, + logsctpassoc(assoc, "*"); + logsctpparse(direction, &msg); + ); + + /* Process the SCTP message */ + rtnval = ProcessSctpMsg(la, direction, &msg, assoc); + + SN_LOG(SN_LOG_DEBUG_MAX, + logsctpassoc(assoc, "-"); + logSctpLocal(la); + logSctpGlobal(la); + ); + SN_LOG(SN_LOG_DEBUG, logTimerQ(la)); + + switch(rtnval){ + case SN_NAT_PKT: + switch(direction) { + case SN_TO_LOCAL: + DifferentialChecksum(&(msg.ip_hdr->ip_sum), + &(assoc->l_addr), &(msg.ip_hdr->ip_dst), 2); + msg.ip_hdr->ip_dst = assoc->l_addr; /* change dst address to local address*/ + break; + case SN_TO_GLOBAL: + DifferentialChecksum(&(msg.ip_hdr->ip_sum), + &(assoc->a_addr), &(msg.ip_hdr->ip_src), 2); + msg.ip_hdr->ip_src = assoc->a_addr; /* change src to alias addr*/ + break; + default: + rtnval = SN_DROP_PKT; /* shouldn't get here, but if it does drop packet */ + SN_LOG(SN_LOG_LOW, logsctperror("ERROR: Invalid direction", msg.sctp_hdr->v_tag, rtnval, direction)); + break; + } + break; + case SN_DROP_PKT: + SN_LOG(SN_LOG_DETAIL, logsctperror("SN_DROP_PKT", msg.sctp_hdr->v_tag, rtnval, direction)); + break; + case SN_REPLY_ABORT: + case SN_REPLY_ERROR: + case 
SN_SEND_ABORT: + TxAbortErrorM(la, &msg, assoc, rtnval, direction); + break; + default: + // big error, remove association and go to idle and write log messages + SN_LOG(SN_LOG_LOW, logsctperror("SN_PROCESSING_ERROR", msg.sctp_hdr->v_tag, rtnval, direction)); + assoc->state=SN_RM;/* Mark for removal*/ + break; + } + + /* Remove association if tagged for removal */ + if (assoc->state == SN_RM) { + if (assoc->TableRegister) { + sctp_RmTimeOut(la, assoc); + RmSctpAssoc(la, assoc); + } + LIBALIAS_LOCK_ASSERT(la); + freeGlobalAddressList(assoc); + sn_free(assoc); + } + switch(rtnval) { + case SN_NAT_PKT: + return(PKT_ALIAS_OK); + case SN_SEND_ABORT: + return(PKT_ALIAS_OK); + case SN_REPLY_ABORT: + case SN_REPLY_ERROR: + case SN_REFLECT_ERROR: + return(PKT_ALIAS_RESPOND); + case SN_DROP_PKT: + default: + return(PKT_ALIAS_ERROR); + } +} + +/** + * @brief Send an AbortM or ErrorM + * + * We construct the new SCTP packet to send in place of the existing packet we + * have been asked to NAT. This function can only be called if the original + * packet was successfully parsed as a valid SCTP packet. + * + * An AbortM (without cause) packet is the smallest SCTP packet available and as + * such there is always space in the existing packet buffer to fit the AbortM + * packet. An ErrorM packet is 4 bytes longer than the (the error cause is not + * optional). An ErrorM is sent in response to an AddIP when the Vtag/address + * combination, if added, will produce a conflict in the association look up + * tables. It may also be used for an unexpected packet - a packet with no + * matching association in the NAT table and we are requesting an AddIP so we + * can add it. The smallest valid SCTP packet while the association is in an + * up-state is a Heartbeat packet, which is big enough to be transformed to an + * ErrorM. + * + * We create a temporary character array to store the packet as we are constructing + * it. We then populate the array with appropriate values based on: + * - Packet type (AbortM | ErrorM) + * - Initial packet direction (SN_TO_LOCAL | SN_TO_GLOBAL) + * - NAT response (Send packet | Reply packet) + * + * Once complete, we copy the contents of the temporary packet over the original + * SCTP packet we were asked to NAT + * + * @param la Pointer to the relevant libalias instance + * @param sm Pointer to sctp message information + * @param assoc Pointer to current association details + * @param sndrply SN_SEND_ABORT | SN_REPLY_ABORT | SN_REPLY_ERROR + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + */ +static uint32_t +local_sctp_finalize_crc32(uint32_t crc32c) +{ + /* This routine is duplicated from SCTP + * we need to do that since it MAY be that SCTP + * is NOT compiled into the kernel. The CRC32C routines + * however are always available in libkern. + */ + uint32_t result; +#if BYTE_ORDER == BIG_ENDIAN + uint8_t byte0, byte1, byte2, byte3; + +#endif + /* Complement the result */ + result = ~crc32c; +#if BYTE_ORDER == BIG_ENDIAN + /* + * For BIG-ENDIAN.. aka Motorola byte order the result is in + * little-endian form. So we must manually swap the bytes. Then we + * can call htonl() which does nothing... + */ + byte0 = result & 0x000000ff; + byte1 = (result >> 8) & 0x000000ff; + byte2 = (result >> 16) & 0x000000ff; + byte3 = (result >> 24) & 0x000000ff; + crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3); +#else + /* + * For INTEL platforms the result comes out in network order. No + * htonl is required or the swap above. 
So we optimize out both the + * htonl and the manual swap above. + */ + crc32c = result; +#endif + return (crc32c); +} + +static void +TxAbortErrorM(struct libalias *la, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int sndrply, int direction) +{ + int sctp_size = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_error_cause); + int ip_size = sizeof(struct ip) + sctp_size; + int include_error_cause = 1; + char tmp_ip[ip_size]; + + if (ntohs(sm->ip_hdr->ip_len) < ip_size) { /* short packet, cannot send error cause */ + include_error_cause = 0; + ip_size = ip_size - sizeof(struct sctp_error_cause); + sctp_size = sctp_size - sizeof(struct sctp_error_cause); + } + /* Assign header pointers packet */ + struct ip* ip = (struct ip *) tmp_ip; + struct sctphdr* sctp_hdr = (struct sctphdr *) ((char *) ip + sizeof(*ip)); + struct sctp_chunkhdr* chunk_hdr = (struct sctp_chunkhdr *) ((char *) sctp_hdr + sizeof(*sctp_hdr)); + struct sctp_error_cause* error_cause = (struct sctp_error_cause *) ((char *) chunk_hdr + sizeof(*chunk_hdr)); + + /* construct ip header */ + ip->ip_v = sm->ip_hdr->ip_v; + ip->ip_hl = 5; /* 5*32 bit words */ + ip->ip_tos = 0; + ip->ip_len = htons(ip_size); + ip->ip_id = sm->ip_hdr->ip_id; + ip->ip_off = 0; + ip->ip_ttl = 255; + ip->ip_p = IPPROTO_SCTP; + /* + The definitions below should be removed when they make it into the SCTP stack + */ +#define SCTP_MIDDLEBOX_FLAG 0x02 +#define SCTP_NAT_TABLE_COLLISION 0x00b0 +#define SCTP_MISSING_NAT 0x00b1 + chunk_hdr->chunk_type = (sndrply & SN_TX_ABORT) ? SCTP_ABORT_ASSOCIATION : SCTP_OPERATION_ERROR; + chunk_hdr->chunk_flags = SCTP_MIDDLEBOX_FLAG; + if (include_error_cause) { + error_cause->code = htons((sndrply & SN_REFLECT_ERROR) ? SCTP_MISSING_NAT : SCTP_NAT_TABLE_COLLISION); + error_cause->length = htons(sizeof(struct sctp_error_cause)); + chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr) + sizeof(struct sctp_error_cause)); + } else { + chunk_hdr->chunk_length = htons(sizeof(*chunk_hdr)); + } + + /* set specific values */ + switch(sndrply) { + case SN_REFLECT_ERROR: + chunk_hdr->chunk_flags |= SCTP_HAD_NO_TCB; /* set Tbit */ + sctp_hdr->v_tag = sm->sctp_hdr->v_tag; + break; + case SN_REPLY_ERROR: + sctp_hdr->v_tag = (direction == SN_TO_LOCAL) ? assoc->g_vtag : assoc->l_vtag ; + break; + case SN_SEND_ABORT: + sctp_hdr->v_tag = sm->sctp_hdr->v_tag; + break; + case SN_REPLY_ABORT: + sctp_hdr->v_tag = sm->sctpchnk.Init->initiate_tag; + break; + } + + /* Set send/reply values */ + if (sndrply == SN_SEND_ABORT) { /*pass through NAT */ + ip->ip_src = (direction == SN_TO_LOCAL) ? sm->ip_hdr->ip_src : assoc->a_addr; + ip->ip_dst = (direction == SN_TO_LOCAL) ? assoc->l_addr : sm->ip_hdr->ip_dst; + sctp_hdr->src_port = sm->sctp_hdr->src_port; + sctp_hdr->dest_port = sm->sctp_hdr->dest_port; + } else { /* reply and reflect */ + ip->ip_src = sm->ip_hdr->ip_dst; + ip->ip_dst = sm->ip_hdr->ip_src; + sctp_hdr->src_port = sm->sctp_hdr->dest_port; + sctp_hdr->dest_port = sm->sctp_hdr->src_port; + } + + /* Calculate IP header checksum */ + ip->ip_sum = in_cksum_hdr(ip); + + /* calculate SCTP header CRC32 */ + sctp_hdr->checksum = 0; + sctp_hdr->checksum = local_sctp_finalize_crc32(calculate_crc32c(0xffffffff, (unsigned char *) sctp_hdr, sctp_size)); + + memcpy(sm->ip_hdr, ip, ip_size); + + SN_LOG(SN_LOG_EVENT,SctpAliasLog("%s %s 0x%x (->%s:%u vtag=0x%x crc=0x%x)\n", + ((sndrply == SN_SEND_ABORT) ? "Sending" : "Replying"), + ((sndrply & SN_TX_ERROR) ? "ErrorM" : "AbortM"), + (include_error_cause ? 
ntohs(error_cause->code) : 0), + inet_ntoa(ip->ip_dst),ntohs(sctp_hdr->dest_port), + ntohl(sctp_hdr->v_tag), ntohl(sctp_hdr->checksum))); +} + +/* ---------------------------------------------------------------------- + * PACKET PARSER CODE + * ---------------------------------------------------------------------- + */ +/** @addtogroup packet_parser + * + * These functions parse the SCTP packet and fill a sctp_nat_msg structure + * with the parsed contents. + */ +/** @ingroup packet_parser + * @brief Parses SCTP packets for the key SCTP chunk that will be processed + * + * This module parses SCTP packets for the key SCTP chunk that will be processed + * The module completes the sctp_nat_msg structure and either retrieves the + * relevant (existing) stored association from the Hash Tables or creates a new + * association entity with state SN_ID + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param pip + * @param sm Pointer to sctp message information + * @param passoc Pointer to the association this SCTP Message belongs to + * + * @return SN_PARSE_OK | SN_PARSE_ERROR_* + */ +static int +sctp_PktParser(struct libalias *la, int direction, struct ip *pip, + struct sctp_nat_msg *sm, struct sctp_nat_assoc **passoc) +//sctp_PktParser(int direction, struct mbuf *ipak, int ip_hdr_len,struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc) +{ + struct sctphdr *sctp_hdr; + struct sctp_chunkhdr *chunk_hdr; + struct sctp_paramhdr *param_hdr; + struct in_addr ipv4addr; + int bytes_left; /* bytes left in ip packet */ + int chunk_length; + int chunk_count; + int partial_match = 0; + // mbuf *mp; + // int mlen; + + // mlen = SCTP_HEADER_LEN(i_pak); + // mp = SCTP_HEADER_TO_CHAIN(i_pak); /* does nothing in bsd since header and chain not separate */ + + /* + * Note, that if the VTag is zero, it must be an INIT + * Also, I am only interested in the content of INIT and ADDIP chunks + */ + + // no mbuf stuff from Paolo yet so ... 
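+	/*
+	 * Parse straight out of the IP packet buffer: record the header
+	 * pointers below, then walk the chunk list until a chunk of interest
+	 * is found or one of the sysctl-imposed processing limits is reached.
+	 */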
+ sm->ip_hdr = pip; + /* remove ip header length from the bytes_left */ + bytes_left = ntohs(pip->ip_len) - (pip->ip_hl << 2); + + /* Check SCTP header length and move to first chunk */ + if (bytes_left < sizeof(struct sctphdr)) { + sm->sctp_hdr = NULL; + return(SN_PARSE_ERROR_IPSHL); /* packet not long enough*/ + } + + sm->sctp_hdr = sctp_hdr = (struct sctphdr *) ip_next(pip); + bytes_left -= sizeof(struct sctphdr); + + /* Check for valid ports (zero valued ports would find partially initialised associations */ + if (sctp_hdr->src_port == 0 || sctp_hdr->dest_port == 0) + return(SN_PARSE_ERROR_PORT); + + /* Check length of first chunk */ + if (bytes_left < SN_MIN_CHUNK_SIZE) /* malformed chunk - could cause endless loop*/ + return(SN_PARSE_ERROR_CHHL); /* packet not long enough for this chunk */ + + /* First chunk */ + chunk_hdr = SN_SCTP_FIRSTCHUNK(sctp_hdr); + + chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length)); + if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left)) /* malformed chunk - could cause endless loop*/ + return(SN_PARSE_ERROR_CHHL); + + if ((chunk_hdr->chunk_flags & SCTP_HAD_NO_TCB) && + ((chunk_hdr->chunk_type == SCTP_ABORT_ASSOCIATION) || + (chunk_hdr->chunk_type == SCTP_SHUTDOWN_COMPLETE))) { + /* T-Bit set */ + if (direction == SN_TO_LOCAL) + *passoc = FindSctpGlobalT(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port); + else + *passoc = FindSctpLocalT(la, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->dest_port, sctp_hdr->src_port); + } else { + /* Proper v_tag settings */ + if (direction == SN_TO_LOCAL) + *passoc = FindSctpGlobal(la, pip->ip_src, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match); + else + *passoc = FindSctpLocal(la, pip->ip_src, pip->ip_dst, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port); + } + + chunk_count = 1; + /* Real packet parsing occurs below */ + sm->msg = SN_SCTP_OTHER;/* Initialise to largest value*/ + sm->chunk_length = 0; /* only care about length for key chunks */ + while (IS_SCTP_CONTROL(chunk_hdr)) { + switch(chunk_hdr->chunk_type) { + case SCTP_INITIATION: + if (chunk_length < sizeof(struct sctp_init_chunk)) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + sm->msg = SN_SCTP_INIT; + sm->sctpchnk.Init = (struct sctp_init *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr)); + sm->chunk_length = chunk_length; + /* if no existing association, create a new one */ + if (*passoc == NULL) { + if (sctp_hdr->v_tag == 0){ //Init requires vtag=0 + *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc)); + if (*passoc == NULL) {/* out of resources */ + return(SN_PARSE_ERROR_AS_MALLOC); + } + /* Initialise association - malloc initialises memory to zeros */ + (*passoc)->state = SN_ID; + LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */ + (*passoc)->TableRegister = SN_NULL_TBL; + return(SN_PARSE_OK); + } + return(SN_PARSE_ERROR_VTAG); + } + return(SN_PARSE_ERROR_LOOKUP); + case SCTP_INITIATION_ACK: + if (chunk_length < sizeof(struct sctp_init_ack_chunk)) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + sm->msg = SN_SCTP_INITACK; + sm->sctpchnk.InitAck = (struct sctp_init_ack *) ((char *) chunk_hdr + sizeof(struct sctp_chunkhdr)); + sm->chunk_length = chunk_length; + return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK)); + case SCTP_ABORT_ASSOCIATION: /* access only minimum sized chunk */ + sm->msg = SN_SCTP_ABORT; + sm->chunk_length = chunk_length; + return ((*passoc == 
NULL)?(SN_PARSE_ERROR_LOOKUP_ABORT):(SN_PARSE_OK)); + case SCTP_SHUTDOWN_ACK: + if (chunk_length < sizeof(struct sctp_shutdown_ack_chunk)) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + if (sm->msg > SN_SCTP_SHUTACK) { + sm->msg = SN_SCTP_SHUTACK; + sm->chunk_length = chunk_length; + } + break; + case SCTP_SHUTDOWN_COMPLETE: /* minimum sized chunk */ + if (sm->msg > SN_SCTP_SHUTCOMP) { + sm->msg = SN_SCTP_SHUTCOMP; + sm->chunk_length = chunk_length; + } + return ((*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK)); + case SCTP_ASCONF: + if (sm->msg > SN_SCTP_ASCONF) { + if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv4addr_param))) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + //leave parameter searching to later, if required + param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr + sizeof(struct sctp_asconf_chunk)); /*compulsory IP parameter*/ + if (ntohs(param_hdr->param_type) == SCTP_IPV4_ADDRESS) { + if ((*passoc == NULL) && (direction == SN_TO_LOCAL)) { /* AddIP with no association */ + /* try look up with the ASCONF packet's alternative address */ + ipv4addr.s_addr = ((struct sctp_ipv4addr_param *) param_hdr)->addr; + *passoc = FindSctpGlobal(la, ipv4addr, sctp_hdr->v_tag, sctp_hdr->src_port, sctp_hdr->dest_port, &partial_match); + } + param_hdr = (struct sctp_paramhdr *) + ((char *) param_hdr + sizeof(struct sctp_ipv4addr_param)); /*asconf's compulsory address parameter */ + sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv4addr_param); /* rest of chunk */ + } else { + if (chunk_length < (sizeof(struct sctp_asconf_chunk) + sizeof(struct sctp_ipv6addr_param))) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + param_hdr = (struct sctp_paramhdr *) + ((char *) param_hdr + sizeof(struct sctp_ipv6addr_param)); /*asconf's compulsory address parameter */ + sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_chunk) - sizeof(struct sctp_ipv6addr_param); /* rest of chunk */ + } + sm->msg = SN_SCTP_ASCONF; + sm->sctpchnk.Asconf = param_hdr; + + if (*passoc == NULL) { /* AddIP with no association */ + *passoc = (struct sctp_nat_assoc *) sn_malloc(sizeof(struct sctp_nat_assoc)); + if (*passoc == NULL) {/* out of resources */ + return(SN_PARSE_ERROR_AS_MALLOC); + } + /* Initialise association - malloc initialises memory to zeros */ + (*passoc)->state = SN_ID; + LIST_INIT(&((*passoc)->Gaddr)); /* always initialise to avoid memory problems */ + (*passoc)->TableRegister = SN_NULL_TBL; + return(SN_PARSE_OK); + } + } + break; + case SCTP_ASCONF_ACK: + if (sm->msg > SN_SCTP_ASCONFACK) { + if (chunk_length < sizeof(struct sctp_asconf_ack_chunk)) /* malformed chunk*/ + return(SN_PARSE_ERROR_CHHL); + //leave parameter searching to later, if required + param_hdr = (struct sctp_paramhdr *) ((char *) chunk_hdr + + sizeof(struct sctp_asconf_ack_chunk)); + sm->msg = SN_SCTP_ASCONFACK; + sm->sctpchnk.Asconf = param_hdr; + sm->chunk_length = chunk_length - sizeof(struct sctp_asconf_ack_chunk); + } + break; + default: + break; /* do nothing*/ + } + + /* if no association is found exit - we need to find an Init or AddIP within sysctl_initialising_chunk_proc_limit */ + if ((*passoc == NULL) && (chunk_count >= sysctl_initialising_chunk_proc_limit)) + return(SN_PARSE_ERROR_LOOKUP); + + /* finished with this chunk, on to the next chunk*/ + bytes_left-= chunk_length; + + /* Is this the end of the packet ? 
*/ + if (bytes_left == 0) + return (*passoc == NULL)?(SN_PARSE_ERROR_LOOKUP):(SN_PARSE_OK); + + /* Are there enough bytes in packet to at least retrieve length of next chunk ? */ + if (bytes_left < SN_MIN_CHUNK_SIZE) + return(SN_PARSE_ERROR_CHHL); + + chunk_hdr = SN_SCTP_NEXTCHUNK(chunk_hdr); + + /* Is the chunk long enough not to cause an endless loop, and are there enough bytes in the packet to read the chunk ? */ + chunk_length = SCTP_SIZE32(ntohs(chunk_hdr->chunk_length)); + if ((chunk_length < SN_MIN_CHUNK_SIZE) || (chunk_length > bytes_left)) + return(SN_PARSE_ERROR_CHHL); + if(++chunk_count > sysctl_chunk_proc_limit) + return(SN_PARSE_OK); /* limit for processing chunks, take what we get */ + } + + if (*passoc == NULL) + return (partial_match)?(SN_PARSE_ERROR_PARTIALLOOKUP):(SN_PARSE_ERROR_LOOKUP); + else + return(SN_PARSE_OK); +} + +/** @ingroup packet_parser + * @brief Extract Vtags from Asconf Chunk + * + * GetAsconfVtags scans an Asconf Chunk for the vtags parameter, and then + * extracts the vtags. + * + * GetAsconfVtags is not called from within sctp_PktParser. It is called only + * from within ID_process when an AddIP has been received. + * + * @param la Pointer to the relevant libalias instance + * @param sm Pointer to sctp message information + * @param l_vtag Pointer to the local vtag in the association this SCTP Message belongs to + * @param g_vtag Pointer to the global vtag in the association this SCTP Message belongs to + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + * @return 1 - success | 0 - fail + */ +static int +GetAsconfVtags(struct libalias *la, struct sctp_nat_msg *sm, uint32_t *l_vtag, uint32_t *g_vtag, int direction) +{ + /* To be removed when information is in the sctp headers */ +#define SCTP_VTAG_PARAM 0xC007 + struct sctp_vtag_param { + struct sctp_paramhdr ph;/* type=SCTP_VTAG_PARAM */ + uint32_t local_vtag; + uint32_t remote_vtag; + } __attribute__((packed)); + + struct sctp_vtag_param *vtag_param; + struct sctp_paramhdr *param; + int bytes_left; + int param_size; + int param_count; + + param_count = 1; + param = sm->sctpchnk.Asconf; + param_size = SCTP_SIZE32(ntohs(param->param_length)); + bytes_left = sm->chunk_length; + /* step through Asconf parameters */ + while((bytes_left >= param_size) && (bytes_left >= SN_VTAG_PARAM_SIZE)) { + if (ntohs(param->param_type) == SCTP_VTAG_PARAM) { + vtag_param = (struct sctp_vtag_param *) param; + switch(direction) { + /* The Internet draft is a little ambiguous as to the order of these vtags. + We think it is this way around. If we are wrong, the order will need + to be changed. */ + case SN_TO_GLOBAL: + *g_vtag = vtag_param->local_vtag; + *l_vtag = vtag_param->remote_vtag; + break; + case SN_TO_LOCAL: + *g_vtag = vtag_param->remote_vtag; + *l_vtag = vtag_param->local_vtag; + break; + } + return(1); /* found */ + } + + bytes_left -= param_size; + if (bytes_left < SN_MIN_PARAM_SIZE) return(0); + + param = SN_SCTP_NEXTPARAM(param); + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (++param_count > sysctl_param_proc_limit) { + SN_LOG(SN_LOG_EVENT, + logsctperror("Parameter parse limit exceeded (GetAsconfVtags)", + sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); + return(0); /* not found, limit exceeded */ + } + } + return(0); /* not found */ +} + +/** @ingroup packet_parser + * @brief AddGlobalIPAddresses from Init, InitAck, or AddIP packets + * + * AddGlobalIPAddresses scans an SCTP chunk (in sm) for Global IP addresses, and + * adds them.
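+ *
+ * Only packets travelling SN_TO_LOCAL can carry new global addresses; for
+ * SN_TO_GLOBAL traffic only the packet's destination address is recorded and
+ * the parameter scan is skipped.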
+ * + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + */ +static void +AddGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction) +{ + struct sctp_ipv4addr_param *ipv4_param; + struct sctp_paramhdr *param = NULL; + struct sctp_GlobalAddress *G_Addr; + struct in_addr g_addr = {0}; + int bytes_left = 0; + int param_size; + int param_count, addr_param_count = 0; + + switch(direction) { + case SN_TO_GLOBAL: /* does not contain global addresses */ + g_addr = sm->ip_hdr->ip_dst; + bytes_left = 0; /* force exit */ + break; + case SN_TO_LOCAL: + g_addr = sm->ip_hdr->ip_src; + param_count = 1; + switch(sm->msg) { + case SN_SCTP_INIT: + bytes_left = sm->chunk_length - sizeof(struct sctp_init_chunk); + param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.Init + sizeof(struct sctp_init)); + break; + case SN_SCTP_INITACK: + bytes_left = sm->chunk_length - sizeof(struct sctp_init_ack_chunk); + param = (struct sctp_paramhdr *)((char *)sm->sctpchnk.InitAck + sizeof(struct sctp_init_ack)); + break; + case SN_SCTP_ASCONF: + bytes_left = sm->chunk_length; + param = sm->sctpchnk.Asconf; + break; + } + } + if (bytes_left >= SN_MIN_PARAM_SIZE) + param_size = SCTP_SIZE32(ntohs(param->param_length)); + else + param_size = bytes_left+1; /* force skip loop */ + + if ((assoc->state == SN_ID) && ((sm->msg == SN_SCTP_INIT) || (bytes_left < SN_MIN_PARAM_SIZE))) {/* add pkt address */ + G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress)); + if (G_Addr == NULL) {/* out of resources */ + SN_LOG(SN_LOG_EVENT, + logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking", + sm->sctp_hdr->v_tag, 0, direction)); + assoc->num_Gaddr = 0; /* don't track any more for this assoc*/ + sysctl_track_global_addresses=0; + return; + } + G_Addr->g_addr = g_addr; + if (!Add_Global_Address_to_List(assoc, G_Addr)) + SN_LOG(SN_LOG_EVENT, + logsctperror("AddGlobalIPAddress: Address already in list", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + } + + /* step through parameters */ + while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) { + if (assoc->num_Gaddr >= sysctl_track_global_addresses) { + SN_LOG(SN_LOG_EVENT, + logsctperror("AddGlobalIPAddress: Maximum Number of addresses reached", + sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); + return; + } + switch(ntohs(param->param_type)) { + case SCTP_ADD_IP_ADDRESS: + /* skip to address parameter - leave param_size so bytes left will be calculated properly*/ + param = (struct sctp_paramhdr *) &((struct sctp_asconf_addrv4_param *) param)->addrp; + case SCTP_IPV4_ADDRESS: + ipv4_param = (struct sctp_ipv4addr_param *) param; + /* add addresses to association */ + G_Addr = (struct sctp_GlobalAddress *) sn_malloc(sizeof(struct sctp_GlobalAddress)); + if (G_Addr == NULL) {/* out of resources */ + SN_LOG(SN_LOG_EVENT, + logsctperror("AddGlobalIPAddress: No resources for adding global address - revert to no tracking", + sm->sctp_hdr->v_tag, 0, direction)); + assoc->num_Gaddr = 0; /* don't track any more for this assoc*/ + sysctl_track_global_addresses=0; + return; + } + /* add address */ + addr_param_count++; + if ((sm->msg == SN_SCTP_ASCONF) && (ipv4_param->addr == INADDR_ANY)) { /* use packet address */ + G_Addr->g_addr = g_addr; + if (!Add_Global_Address_to_List(assoc, G_Addr)) + SN_LOG(SN_LOG_EVENT, + 
logsctperror("AddGlobalIPAddress: Address already in list", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + return; /*shouldn't be any other addresses if the zero address is given*/ + } else { + G_Addr->g_addr.s_addr = ipv4_param->addr; + if (!Add_Global_Address_to_List(assoc, G_Addr)) + SN_LOG(SN_LOG_EVENT, + logsctperror("AddGlobalIPAddress: Address already in list", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + } + } + + bytes_left -= param_size; + if (bytes_left < SN_MIN_PARAM_SIZE) + break; + + param = SN_SCTP_NEXTPARAM(param); + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (++param_count > sysctl_param_proc_limit) { + SN_LOG(SN_LOG_EVENT, + logsctperror("Parameter parse limit exceeded (AddGlobalIPAddress)", + sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); + break; /* limit exceeded*/ + } + } + if (addr_param_count == 0) { + SN_LOG(SN_LOG_DETAIL, + logsctperror("AddGlobalIPAddress: no address parameters to add", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + } +} + +/** + * @brief Add_Global_Address_to_List + * + * Adds a global IP address to an associations address list, if it is not + * already there. The first address added us usually the packet's address, and + * is most likely to be used, so it is added at the beginning. Subsequent + * addresses are added after this one. + * + * @param assoc Pointer to the association this SCTP Message belongs to + * @param G_addr Pointer to the global address to add + * + * @return 1 - success | 0 - fail + */ +static int Add_Global_Address_to_List(struct sctp_nat_assoc *assoc, struct sctp_GlobalAddress *G_addr) +{ + struct sctp_GlobalAddress *iter_G_Addr = NULL, *first_G_Addr = NULL; + first_G_Addr = LIST_FIRST(&(assoc->Gaddr)); + if (first_G_Addr == NULL) { + LIST_INSERT_HEAD(&(assoc->Gaddr), G_addr, list_Gaddr); /* add new address to beginning of list*/ + } else { + LIST_FOREACH(iter_G_Addr, &(assoc->Gaddr), list_Gaddr) { + if (G_addr->g_addr.s_addr == iter_G_Addr->g_addr.s_addr) + return(0); /* already exists, so don't add */ + } + LIST_INSERT_AFTER(first_G_Addr, G_addr, list_Gaddr); /* add address to end of list*/ + } + assoc->num_Gaddr++; + return(1); /* success */ +} + +/** @ingroup packet_parser + * @brief RmGlobalIPAddresses from DelIP packets + * + * RmGlobalIPAddresses scans an ASCONF chunk for DelIP parameters to remove the + * given Global IP addresses from the association. It will not delete the + * the address if it is a list of one address. 
+ * + * + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + */ +static void +RmGlobalIPAddresses(struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc, int direction) +{ + struct sctp_asconf_addrv4_param *asconf_ipv4_param; + struct sctp_paramhdr *param; + struct sctp_GlobalAddress *G_Addr, *G_Addr_tmp; + struct in_addr g_addr; + int bytes_left; + int param_size; + int param_count; + + if(direction == SN_TO_GLOBAL) + g_addr = sm->ip_hdr->ip_dst; + else + g_addr = sm->ip_hdr->ip_src; + + bytes_left = sm->chunk_length; + param_count = 1; + param = sm->sctpchnk.Asconf; + if (bytes_left >= SN_MIN_PARAM_SIZE) { + param_size = SCTP_SIZE32(ntohs(param->param_length)); + } else { + SN_LOG(SN_LOG_EVENT, + logsctperror("RmGlobalIPAddress: truncated packet - cannot remove IP addresses", + sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); + return; + } + + /* step through Asconf parameters */ + while((bytes_left >= param_size) && (bytes_left >= sizeof(struct sctp_ipv4addr_param))) { + if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS) { + asconf_ipv4_param = (struct sctp_asconf_addrv4_param *) param; + if (asconf_ipv4_param->addrp.addr == INADDR_ANY) { /* remove all bar pkt address */ + LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) { + if(G_Addr->g_addr.s_addr != sm->ip_hdr->ip_src.s_addr) { + if (assoc->num_Gaddr > 1) { /* only delete if more than one */ + LIST_REMOVE(G_Addr, list_Gaddr); + sn_free(G_Addr); + assoc->num_Gaddr--; + } else { + SN_LOG(SN_LOG_EVENT, + logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + } + } + } + return; /*shouldn't be any other addresses if the zero address is given*/ + } else { + LIST_FOREACH_SAFE(G_Addr, &(assoc->Gaddr), list_Gaddr, G_Addr_tmp) { + if(G_Addr->g_addr.s_addr == asconf_ipv4_param->addrp.addr) { + if (assoc->num_Gaddr > 1) { /* only delete if more than one */ + LIST_REMOVE(G_Addr, list_Gaddr); + sn_free(G_Addr); + assoc->num_Gaddr--; + break; /* Since add only adds new addresses, there should be no double entries */ + } else { + SN_LOG(SN_LOG_EVENT, + logsctperror("RmGlobalIPAddress: Request to remove last IP address (didn't)", + sm->sctp_hdr->v_tag, assoc->num_Gaddr, direction)); + } + } + } + } + } + bytes_left -= param_size; + if (bytes_left == 0) return; + else if (bytes_left < SN_MIN_PARAM_SIZE) { + SN_LOG(SN_LOG_EVENT, + logsctperror("RmGlobalIPAddress: truncated packet - may not have removed all IP addresses", + sm->sctp_hdr->v_tag, sysctl_track_global_addresses, direction)); + return; + } + + param = SN_SCTP_NEXTPARAM(param); + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (++param_count > sysctl_param_proc_limit) { + SN_LOG(SN_LOG_EVENT, + logsctperror("Parameter parse limit exceeded (RmGlobalIPAddress)", + sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); + return; /* limit exceeded*/ + } + } +} + +/** @ingroup packet_parser + * @brief Check that ASCONF was successful + * + * Each ASCONF configuration parameter carries a correlation ID which should be + * matched with an ASCONFack. This is difficult for a NAT, since every + * association could potentially have a number of outstanding ASCONF + * configuration parameters, which should only be activated on receipt of the + * ACK. 
+ * + * Currently we only look for an ACK when the NAT is setting up a new + * association (i.e. AddIP for a connection that the NAT does not know about + * because the original Init went through a public interface or another NAT). + * Since there is currently no connection on this path, there should be no other + * ASCONF configuration parameters outstanding, so we presume that if there is + * an ACK that it is responding to the AddIP and activate the new association. + * + * @param la Pointer to the relevant libalias instance + * @param sm Pointer to sctp message information + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + * @return 1 - success | 0 - fail + */ +static int +IsASCONFack(struct libalias *la, struct sctp_nat_msg *sm, int direction) +{ + struct sctp_paramhdr *param; + int bytes_left; + int param_size; + int param_count; + + param_count = 1; + param = sm->sctpchnk.Asconf; + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (param_size == 8) + return(1); /*success - default acknowledgement of everything */ + + bytes_left = sm->chunk_length; + if (bytes_left < param_size) + return(0); /* not found */ + /* step through Asconf parameters */ + while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) { + if (ntohs(param->param_type) == SCTP_SUCCESS_REPORT) + return(1); /* success - but can't match correlation IDs - should only be one */ + /* check others just in case */ + bytes_left -= param_size; + if (bytes_left >= SN_MIN_PARAM_SIZE) { + param = SN_SCTP_NEXTPARAM(param); + } else { + return(0); + } + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (bytes_left < param_size) return(0); + + if (++param_count > sysctl_param_proc_limit) { + SN_LOG(SN_LOG_EVENT, + logsctperror("Parameter parse limit exceeded (IsASCONFack)", + sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); + return(0); /* not found, limit exceeded */ + } + } + return(0); /* not success */ +} + +/** @ingroup packet_parser + * @brief Check to see if ASCONF contains an Add IP or Del IP parameter + * + * IsADDorDEL scans an ASCONF packet to see if it contains an AddIP or DelIP + * parameter. + * + * @param la Pointer to the relevant libalias instance + * @param sm Pointer to sctp message information + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * + * @return SCTP_ADD_IP_ADDRESS | SCTP_DEL_IP_ADDRESS | 0 - fail + */ +static int +IsADDorDEL(struct libalias *la, struct sctp_nat_msg *sm, int direction) +{ + struct sctp_paramhdr *param; + int bytes_left; + int param_size; + int param_count; + + param_count = 1; + param = sm->sctpchnk.Asconf; + param_size = SCTP_SIZE32(ntohs(param->param_length)); + + bytes_left = sm->chunk_length; + if (bytes_left < param_size) + return(0); /* not found */ + /* step through Asconf parameters */ + while(bytes_left >= SN_ASCONFACK_PARAM_SIZE) { + if (ntohs(param->param_type) == SCTP_ADD_IP_ADDRESS) + return(SCTP_ADD_IP_ADDRESS); + else if (ntohs(param->param_type) == SCTP_DEL_IP_ADDRESS) + return(SCTP_DEL_IP_ADDRESS); + /* check others just in case */ + bytes_left -= param_size; + if (bytes_left >= SN_MIN_PARAM_SIZE) { + param = SN_SCTP_NEXTPARAM(param); + } else { + return(0); /*Neither found */ + } + param_size = SCTP_SIZE32(ntohs(param->param_length)); + if (bytes_left < param_size) return(0); + + if (++param_count > sysctl_param_proc_limit) { + SN_LOG(SN_LOG_EVENT, + logsctperror("Parameter parse limit exceeded (IsADDorDEL)", + sm->sctp_hdr->v_tag, sysctl_param_proc_limit, direction)); + return(0); /* not found, limit exceeded */ + } + } + return(0); /*Neither found */ +} + 
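+/*
+ * GetAsconfVtags(), IsASCONFack() and IsADDorDEL() above all walk the ASCONF
+ * parameter list with the same bounded loop. The helper below is a condensed,
+ * hypothetical sketch of that idiom - it is not part of this file, and the
+ * name sn_find_param is invented for illustration. Unlike the originals it
+ * also rejects parameters whose declared length is malformed before using it:
+ * it reports whether a parameter of the wanted type occurs within the first
+ * sysctl_param_proc_limit parameters.
+ */
+static int
+sn_find_param(struct sctp_paramhdr *param, int bytes_left, uint16_t want)
+{
+	int param_count = 1;
+	int param_size;
+
+	while (bytes_left >= SN_MIN_PARAM_SIZE) {
+		param_size = SCTP_SIZE32(ntohs(param->param_length));
+		/* guard against malformed lengths that could loop forever */
+		if ((param_size < SN_MIN_PARAM_SIZE) || (param_size > bytes_left))
+			return(0);
+		if (ntohs(param->param_type) == want)
+			return(1); /* found */
+		if (++param_count > sysctl_param_proc_limit)
+			return(0); /* processing limit exceeded */
+		bytes_left -= param_size;
+		param = SN_SCTP_NEXTPARAM(param);
+	}
+	return(0); /* not found */
+}
+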
+/* ---------------------------------------------------------------------- + * STATE MACHINE CODE + * ---------------------------------------------------------------------- + */ +/** @addtogroup state_machine + * + * The SCTP NAT State Machine functions will: + * - Process an already parsed packet + * - Use the existing NAT Hash Tables + * - Determine the next state for the association + * - Update the NAT Hash Tables and Timer Queues + * - Return the appropriate action to take with the packet + */ +/** @ingroup state_machine + * @brief Process SCTP message + * + * This function is the base state machine. It calls the processing engine for + * each state. + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_DROP_PKT | SN_NAT_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR | SN_PROCESSING_ERROR + */ +static int +ProcessSctpMsg(struct libalias *la, int direction, struct sctp_nat_msg *sm, struct sctp_nat_assoc *assoc) +{ + int rtnval; + + switch (assoc->state) { + case SN_ID: /* Idle */ + rtnval = ID_process(la, direction, assoc, sm); + if (rtnval != SN_NAT_PKT) { + assoc->state = SN_RM;/* Mark for removal*/ + } + return(rtnval); + case SN_INi: /* Initialising - Init */ + return(INi_process(la, direction, assoc, sm)); + case SN_INa: /* Initialising - AddIP */ + return(INa_process(la, direction, assoc, sm)); + case SN_UP: /* Association UP */ + return(UP_process(la, direction, assoc, sm)); + case SN_CL: /* Association Closing */ + return(CL_process(la, direction, assoc, sm)); + } + return(SN_PROCESSING_ERROR); +} + +/** @ingroup state_machine + * @brief Process SCTP message while in the Idle state + * + * This function looks for an Incoming INIT or AddIP message. + * + * All other SCTP messages are invalid when in SN_ID, and are dropped. + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT | SN_REPLY_ERROR + */ +static int +ID_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) +{ + switch(sm->msg) { + case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk with ADDIP */ + if (!sysctl_accept_global_ootb_addip && (direction == SN_TO_LOCAL)) + return(SN_DROP_PKT); + /* if this Asconf packet does not contain the Vtag parameters it is of no use in Idle state */ + if (!GetAsconfVtags(la, sm, &(assoc->l_vtag), &(assoc->g_vtag), direction)) + return(SN_DROP_PKT); + case SN_SCTP_INIT: /* a packet containing an INIT chunk or an ASCONF AddIP */ + if (sysctl_track_global_addresses) + AddGlobalIPAddresses(sm, assoc, direction); + switch(direction){ + case SN_TO_GLOBAL: + assoc->l_addr = sm->ip_hdr->ip_src; + assoc->a_addr = FindAliasAddress(la, assoc->l_addr); + assoc->l_port = sm->sctp_hdr->src_port; + assoc->g_port = sm->sctp_hdr->dest_port; + if(sm->msg == SN_SCTP_INIT) + assoc->g_vtag = sm->sctpchnk.Init->initiate_tag; + if (AddSctpAssocGlobal(la, assoc)) /* DB clash *///**** need to add dst address + return((sm->msg == SN_SCTP_INIT) ? 
SN_REPLY_ABORT : SN_REPLY_ERROR); + if(sm->msg == SN_SCTP_ASCONF) { + if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_dst)) /* DB clash */ + return(SN_REPLY_ERROR); + assoc->TableRegister |= SN_WAIT_TOLOCAL; /* wait for tolocal ack */ + } + break; + case SN_TO_LOCAL: + assoc->l_addr = FindSctpRedirectAddress(la, sm); + assoc->a_addr = sm->ip_hdr->ip_dst; + assoc->l_port = sm->sctp_hdr->dest_port; + assoc->g_port = sm->sctp_hdr->src_port; + if(sm->msg == SN_SCTP_INIT) + assoc->l_vtag = sm->sctpchnk.Init->initiate_tag; + if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) /* DB clash */ + return((sm->msg == SN_SCTP_INIT) ? SN_REPLY_ABORT : SN_REPLY_ERROR); + if(sm->msg == SN_SCTP_ASCONF) { + if (AddSctpAssocGlobal(la, assoc)) /* DB clash */ //**** need to add src address + return(SN_REPLY_ERROR); + assoc->TableRegister |= SN_WAIT_TOGLOBAL; /* wait for toglobal ack */ + } + break; + } + assoc->state = (sm->msg == SN_SCTP_INIT) ? SN_INi : SN_INa; + assoc->exp = SN_I_T(la); + sctp_AddTimeOut(la,assoc); + return(SN_NAT_PKT); + default: /* Any other type of SCTP message is not valid in Idle */ + return(SN_DROP_PKT); + } +return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ +} + +/** @ingroup state_machine + * @brief Process SCTP message while waiting for an INIT-ACK message + * + * Only an INIT-ACK, resent INIT, or an ABORT SCTP packet are valid in this + * state, all other packets are dropped. + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_NAT_PKT | SN_DROP_PKT | SN_REPLY_ABORT + */ +static int +INi_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) +{ + switch(sm->msg) { + case SN_SCTP_INIT: /* a packet containing a retransmitted INIT chunk */ + sctp_ResetTimeOut(la, assoc, SN_I_T(la)); + return(SN_NAT_PKT); + case SN_SCTP_INITACK: /* a packet containing an INIT-ACK chunk */ + switch(direction){ + case SN_TO_LOCAL: + if (assoc->num_Gaddr) /*If tracking global addresses for this association */ + AddGlobalIPAddresses(sm, assoc, direction); + assoc->l_vtag = sm->sctpchnk.Init->initiate_tag; + if (AddSctpAssocLocal(la, assoc, sm->ip_hdr->ip_src)) { /* DB clash */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_SEND_ABORT); + } + break; + case SN_TO_GLOBAL: + assoc->l_addr = sm->ip_hdr->ip_src; // Only if not set in Init! 
* + assoc->g_vtag = sm->sctpchnk.Init->initiate_tag; + if (AddSctpAssocGlobal(la, assoc)) { /* DB clash */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_SEND_ABORT); + } + break; + } + assoc->state = SN_UP;/* association established for NAT */ + sctp_ResetTimeOut(la,assoc, SN_U_T(la)); + return(SN_NAT_PKT); + case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + default: + return(SN_DROP_PKT); + } + return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ +} + +/** @ingroup state_machine + * @brief Process SCTP message while waiting for an AddIp-ACK message + * + * Only an AddIP-ACK, resent AddIP, or an ABORT message are valid, all other + * SCTP packets are dropped + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_NAT_PKT | SN_DROP_PKT + */ +static int +INa_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) +{ + switch(sm->msg) { + case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/ + sctp_ResetTimeOut(la,assoc, SN_I_T(la)); + return(SN_NAT_PKT); + case SN_SCTP_ASCONFACK: /* a packet containing an ASCONF chunk with a ADDIP-ACK */ + switch(direction){ + case SN_TO_LOCAL: + if (!(assoc->TableRegister & SN_WAIT_TOLOCAL)) /* wrong direction */ + return(SN_DROP_PKT); + break; + case SN_TO_GLOBAL: + if (!(assoc->TableRegister & SN_WAIT_TOGLOBAL)) /* wrong direction */ + return(SN_DROP_PKT); + } + if (IsASCONFack(la,sm,direction)) { + assoc->TableRegister &= SN_BOTH_TBL; /* remove wait flags */ + assoc->state = SN_UP; /* association established for NAT */ + sctp_ResetTimeOut(la,assoc, SN_U_T(la)); + return(SN_NAT_PKT); + } else { + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + } + case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + default: + return(SN_DROP_PKT); + } + return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ +} + +/** @ingroup state_machine + * @brief Process SCTP messages while association is UP redirecting packets + * + * While in the SN_UP state, all packets for the particular association + * are passed. Only a SHUT-ACK or an ABORT will cause a change of state. 
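+ *
+ * If global addresses are being tracked for this association, ASCONF
+ * AddIP/DelIP parameters arriving from the global side also update the
+ * tracked global address list (see the IsADDorDEL() dispatch below).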
+ * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_NAT_PKT | SN_DROP_PKT + */ +static int +UP_process(struct libalias *la, int direction, struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) +{ + switch(sm->msg) { + case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */ + assoc->state = SN_CL; + sctp_ResetTimeOut(la,assoc, SN_C_T(la)); + return(SN_NAT_PKT); + case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + case SN_SCTP_ASCONF: /* a packet containing an ASCONF chunk*/ + if ((direction == SN_TO_LOCAL) && assoc->num_Gaddr) /*If tracking global addresses for this association & from global side */ + switch(IsADDorDEL(la,sm,direction)) { + case SCTP_ADD_IP_ADDRESS: + AddGlobalIPAddresses(sm, assoc, direction); + break; + case SCTP_DEL_IP_ADDRESS: + RmGlobalIPAddresses(sm, assoc, direction); + break; + } /* fall through to default */ + default: + sctp_ResetTimeOut(la,assoc, SN_U_T(la)); + return(SN_NAT_PKT); /* forward packet */ + } + return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ +} + +/** @ingroup state_machine + * @brief Process SCTP message while association is in the process of closing + * + * This function waits for a SHUT-COMP to close the association. Depending on + * the setting of sysctl_holddown_timer it may not remove the association + * immediately, but leave it up until SN_X_T(la). Only SHUT-COMP, SHUT-ACK, and + * ABORT packets are permitted in this state. All other packets are dropped. + * + * @param la Pointer to the relevant libalias instance + * @param direction SN_TO_LOCAL | SN_TO_GLOBAL + * @param sm Pointer to sctp message information + * @param assoc Pointer to the association this SCTP Message belongs to + * + * @return SN_NAT_PKT | SN_DROP_PKT + */ +static int +CL_process(struct libalias *la, int direction,struct sctp_nat_assoc *assoc, struct sctp_nat_msg *sm) +{ + switch(sm->msg) { + case SN_SCTP_SHUTCOMP: /* a packet containing a SHUTDOWN-COMPLETE chunk */ + assoc->state = SN_CL; /* Stay in Close state until timeout */ + if (sysctl_holddown_timer > 0) + sctp_ResetTimeOut(la, assoc, SN_X_T(la));/* allow to stay open for Tbit packets*/ + else + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + case SN_SCTP_SHUTACK: /* a packet containing a SHUTDOWN-ACK chunk */ + assoc->state = SN_CL; /* Stay in Close state until timeout */ + sctp_ResetTimeOut(la, assoc, SN_C_T(la)); + return(SN_NAT_PKT); + case SN_SCTP_ABORT: /* a packet containing an ABORT chunk */ + assoc->state = SN_RM;/* Mark for removal*/ + return(SN_NAT_PKT); + default: + return(SN_DROP_PKT); + } + return(SN_DROP_PKT);/* shouldn't get here very bad: log, drop and hope for the best */ +} + +/* ---------------------------------------------------------------------- + * HASH TABLE CODE + * ---------------------------------------------------------------------- + */ +/** @addtogroup Hash + * + * The Hash functions facilitate searching the NAT Hash Tables for associations + * as well as adding/removing associations from the table(s).
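+ *
+ * Both look-up tables hash an association on a <vtag:port> pair via
+ * SN_TABLE_HASH(), so a search costs one hash plus a walk of the bucket's
+ * collision chain.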
+ */ +/** @ingroup Hash + * @brief Find the SCTP association given the local address, port and vtag + * + * Searches the local look-up table for the association entry matching the + * provided local tuple + * + * @param la Pointer to the relevant libalias instance + * @param l_addr local address + * @param g_addr global address + * @param l_vtag local Vtag + * @param l_port local Port + * @param g_port global Port + * + * @return pointer to association or NULL + */ +static struct sctp_nat_assoc* +FindSctpLocal(struct libalias *la, struct in_addr l_addr, struct in_addr g_addr, uint32_t l_vtag, uint16_t l_port, uint16_t g_port) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + struct sctp_GlobalAddress *G_Addr = NULL; + + if (l_vtag != 0) { /* an init packet, vtag==0 */ + i = SN_TABLE_HASH(l_vtag, l_port, la->sctpNatTableSize); + LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { + if ((assoc->l_vtag == l_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port)\ + && (assoc->l_addr.s_addr == l_addr.s_addr)) { + if (assoc->num_Gaddr) { + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + if(G_Addr->g_addr.s_addr == g_addr.s_addr) + return(assoc); + } + } else { + return(assoc); + } + } + } + } + return(NULL); +} + +/** @ingroup Hash + * @brief Check for Global Clash + * + * Searches the global look-up table for the association entry matching the + * provided global <(addresses):ports:vtag> tuple + * + * @param la Pointer to the relevant libalias instance + * @param Cassoc association being checked for a clash + * + * @return pointer to association or NULL + */ +static struct sctp_nat_assoc* +FindSctpGlobalClash(struct libalias *la, struct sctp_nat_assoc *Cassoc) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + struct sctp_GlobalAddress *G_Addr = NULL; + struct sctp_GlobalAddress *G_AddrC = NULL; + + if (Cassoc->g_vtag != 0) { /* an init packet, vtag==0 */ + i = SN_TABLE_HASH(Cassoc->g_vtag, Cassoc->g_port, la->sctpNatTableSize); + LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { + if ((assoc->g_vtag == Cassoc->g_vtag) && (assoc->g_port == Cassoc->g_port) && (assoc->l_port == Cassoc->l_port)) { + if (assoc->num_Gaddr) { + LIST_FOREACH(G_AddrC, &(Cassoc->Gaddr), list_Gaddr) { + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + if(G_Addr->g_addr.s_addr == G_AddrC->g_addr.s_addr) + return(assoc); + } + } + } else { + return(assoc); + } + } + } + } + return(NULL); +} + +/** @ingroup Hash + * @brief Find the SCTP association given the global port and vtag + * + * Searches the global look-up table for the association entry matching the + * provided global tuple + * + * If all but the global address match it sets partial_match to 1 to indicate a + * partial match. If the NAT is tracking global IP addresses for this + * association, the NAT may respond with an ERRORM to request the missing + * address to be added. 
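+ *
+ * Note that partial_match is set as soon as the vtag and both ports match,
+ * even if a full match (including the global address) is subsequently found;
+ * callers only consult it when NULL is returned.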
+ * + * @param la Pointer to the relevant libalias instance + * @param g_addr global address + * @param g_vtag global vtag + * @param g_port global port + * @param l_port local port + * + * @return pointer to association or NULL + */ +static struct sctp_nat_assoc* +FindSctpGlobal(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t g_port, uint16_t l_port, int *partial_match) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + struct sctp_GlobalAddress *G_Addr = NULL; + + *partial_match = 0; + if (g_vtag != 0) { /* an init packet, vtag==0 */ + i = SN_TABLE_HASH(g_vtag, g_port, la->sctpNatTableSize); + LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { + if ((assoc->g_vtag == g_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) { + *partial_match = 1; + if (assoc->num_Gaddr) { + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + if(G_Addr->g_addr.s_addr == g_addr.s_addr) + return(assoc); + } + } else { + return(assoc); + } + } + } + } + return(NULL); +} + +/** @ingroup Hash + * @brief Find the SCTP association for a T-Flag message (given the global port and local vtag) + * + * Searches the local look-up table for a unique association entry matching the + * provided global port and local vtag information + * + * @param la Pointer to the relevant libalias instance + * @param g_addr global address + * @param l_vtag local Vtag + * @param g_port global Port + * @param l_port local Port + * + * @return pointer to association or NULL + */ +static struct sctp_nat_assoc* +FindSctpLocalT(struct libalias *la, struct in_addr g_addr, uint32_t l_vtag, uint16_t g_port, uint16_t l_port) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL, *lastmatch = NULL; + struct sctp_GlobalAddress *G_Addr = NULL; + int cnt = 0; + + if (l_vtag != 0) { /* an init packet, vtag==0 */ + i = SN_TABLE_HASH(l_vtag, g_port, la->sctpNatTableSize); + LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { + if ((assoc->g_vtag == l_vtag) && (assoc->g_port == g_port) && (assoc->l_port == l_port)) { + if (assoc->num_Gaddr) { + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + if(G_Addr->g_addr.s_addr == g_addr.s_addr) + return(assoc); /* full match */ + } + } else { + if (++cnt > 1) return(NULL); + lastmatch = assoc; + } + } + } + } + /* If there is more than one match we do not know which local address to send to */ + return( cnt ? 
lastmatch : NULL ); +} + +/** @ingroup Hash + * @brief Find the SCTP association for a T-Flag message (given the local port and global vtag) + * + * Searches the global look-up table for a unique association entry matching the + * provided local port and global vtag information + * + * @param la Pointer to the relevant libalias instance + * @param g_addr global address + * @param g_vtag global vtag + * @param l_port local port + * @param g_port global port + * + * @return pointer to association or NULL + */ +static struct sctp_nat_assoc* +FindSctpGlobalT(struct libalias *la, struct in_addr g_addr, uint32_t g_vtag, uint16_t l_port, uint16_t g_port) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + struct sctp_GlobalAddress *G_Addr = NULL; + + if (g_vtag != 0) { /* an init packet, vtag==0 */ + i = SN_TABLE_HASH(g_vtag, l_port, la->sctpNatTableSize); + LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { + if ((assoc->l_vtag == g_vtag) && (assoc->l_port == l_port) && (assoc->g_port == g_port)) { + if (assoc->num_Gaddr) { + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + if(G_Addr->g_addr.s_addr == g_addr.s_addr) + return(assoc); + } + } else { + return(assoc); + } + } + } + } + return(NULL); +} + +/** @ingroup Hash + * @brief Add the sctp association information to the local look up table + * + * Searches the local look-up table for an existing association with the same + * details. If a match exists and is ONLY in the local look-up table then this + * is a repeated INIT packet, we need to remove this association from the + * look-up table and add the new association + * + * The new association is added to the head of the list and state is updated + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + * @param g_addr global address + * + * @return SN_ADD_OK | SN_ADD_CLASH + */ +static int +AddSctpAssocLocal(struct libalias *la, struct sctp_nat_assoc *assoc, struct in_addr g_addr) +{ + struct sctp_nat_assoc *found; + + LIBALIAS_LOCK_ASSERT(la); + found = FindSctpLocal(la, assoc->l_addr, g_addr, assoc->l_vtag, assoc->l_port, assoc->g_port); + /* + * Note that if a different global address initiated this Init, + * ie it wasn't resent as presumed: + * - the local receiver if receiving it for the first time will establish + * an association with the new global host + * - if receiving an init from a different global address after sending a + * lost initack it will send an initack to the new global host, the first + * association attempt will then be blocked if retried. + */ + if (found != NULL) { + if ((found->TableRegister == SN_LOCAL_TBL) && (found->g_port == assoc->g_port)) { /* resent message */ + RmSctpAssoc(la, found); + sctp_RmTimeOut(la, found); + freeGlobalAddressList(found); + sn_free(found); + } else + return(SN_ADD_CLASH); + } + + LIST_INSERT_HEAD(&la->sctpTableLocal[SN_TABLE_HASH(assoc->l_vtag, assoc->l_port, la->sctpNatTableSize)], + assoc, list_L); + assoc->TableRegister |= SN_LOCAL_TBL; + la->sctpLinkCount++; //increment link count + + if (assoc->TableRegister == SN_BOTH_TBL) { + /* libalias log -- controlled by libalias */ + if (la->packetAliasMode & PKT_ALIAS_LOG) + SctpShowAliasStats(la); + + SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^")); + } + + return(SN_ADD_OK); +} + +/** @ingroup Hash + * @brief Add the sctp association information to the global look up table + * + * Searches the global look-up table for an existing association with the same + * details. 
If a match exists and is ONLY in the global look-up table then this + * is a repeated INIT packet; we need to remove this association from the + * look-up table and add the new association. + * + * The new association is added to the head of the list and state is updated + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + * + * @return SN_ADD_OK | SN_ADD_CLASH + */ +static int +AddSctpAssocGlobal(struct libalias *la, struct sctp_nat_assoc *assoc) +{ + struct sctp_nat_assoc *found; + + LIBALIAS_LOCK_ASSERT(la); + found = FindSctpGlobalClash(la, assoc); + if (found != NULL) { + if ((found->TableRegister == SN_GLOBAL_TBL) && \ + (found->l_addr.s_addr == assoc->l_addr.s_addr) && (found->l_port == assoc->l_port)) { /* resent message */ + RmSctpAssoc(la, found); + sctp_RmTimeOut(la, found); + freeGlobalAddressList(found); + sn_free(found); + } else + return(SN_ADD_CLASH); + } + + LIST_INSERT_HEAD(&la->sctpTableGlobal[SN_TABLE_HASH(assoc->g_vtag, assoc->g_port, la->sctpNatTableSize)], + assoc, list_G); + assoc->TableRegister |= SN_GLOBAL_TBL; + la->sctpLinkCount++; //increment link count + + if (assoc->TableRegister == SN_BOTH_TBL) { + /* libalias log -- controlled by libalias */ + if (la->packetAliasMode & PKT_ALIAS_LOG) + SctpShowAliasStats(la); + + SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "^")); + } + + return(SN_ADD_OK); +} + +/** @ingroup Hash + * @brief Remove the sctp association information from the look up table + * + * For each of the two (local/global) look-up tables, remove the association + * from that table IF it has been registered in that table. + * + * NOTE: The calling code is responsible for freeing memory allocated to the + * association structure itself + * + * NOTE: The association is NOT removed from the timer queue + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + */ +static void +RmSctpAssoc(struct libalias *la, struct sctp_nat_assoc *assoc) +{ + // struct sctp_nat_assoc *found; + if (assoc == NULL) { + /* very bad, log and die*/ + SN_LOG(SN_LOG_LOW, + logsctperror("ERROR: alias_sctp:RmSctpAssoc(NULL)\n", 0, 0, SN_TO_NODIR)); + return; + } + /* log if association is fully up and now closing */ + if (assoc->TableRegister == SN_BOTH_TBL) { + SN_LOG(SN_LOG_INFO, logsctpassoc(assoc, "$")); + } + LIBALIAS_LOCK_ASSERT(la); + if (assoc->TableRegister & SN_LOCAL_TBL) { + assoc->TableRegister ^= SN_LOCAL_TBL; + la->sctpLinkCount--; //decrement link count + LIST_REMOVE(assoc, list_L); + } + + if (assoc->TableRegister & SN_GLOBAL_TBL) { + assoc->TableRegister ^= SN_GLOBAL_TBL; + la->sctpLinkCount--; //decrement link count + LIST_REMOVE(assoc, list_G); + } + // sn_free(assoc); //Don't remove now, remove if needed later + /* libalias logging -- controlled by libalias log definition */ + if (la->packetAliasMode & PKT_ALIAS_LOG) + SctpShowAliasStats(la); +} + +/** + * @ingroup Hash + * @brief free the Global Address List memory + * + * freeGlobalAddressList deletes all global IP addresses in an association's + * global IP address list.
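+ * The list entries are freed, but the list head itself is not re-initialised;
+ * callers typically free the owning association immediately afterwards.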
+ * + * @param assoc + */ +static void freeGlobalAddressList(struct sctp_nat_assoc *assoc) +{ + struct sctp_GlobalAddress *gaddr1=NULL,*gaddr2=NULL; + /*free global address list*/ + gaddr1 = LIST_FIRST(&(assoc->Gaddr)); + while (gaddr1 != NULL) { + gaddr2 = LIST_NEXT(gaddr1, list_Gaddr); + sn_free(gaddr1); + gaddr1 = gaddr2; + } +} +/* ---------------------------------------------------------------------- + * TIMER QUEUE CODE + * ---------------------------------------------------------------------- + */ +/** @addtogroup Timer + * + * The timer queue management functions are designed to operate efficiently with + * a minimum of interaction with the queues. + * + * Once a timeout is set in the queue it will not be altered in the queue unless + * it has to be changed to a shorter time (usually only for aborts and closing). + * When a queue position falls due, the real expiry time of each entry at that + * position is checked; if it has not yet passed, the entry is requeued (O(1)) at + * its true, later position, otherwise it is processed as a timeout. This is + * especially important for normal packets sent during an association: while in + * the UP state, the timer queue is only altered every U_T (every few minutes) + * for a particular association. + */ +/** @ingroup Timer + * @brief Add an association timeout to the timer queue + * + * Determine the location in the queue to add the timeout and insert the + * association into the list at that queue position. + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + */ +static void +sctp_AddTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc) +{ + int add_loc; + LIBALIAS_LOCK_ASSERT(la); + add_loc = assoc->exp - la->sctpNatTimer.loc_time + la->sctpNatTimer.cur_loc; + if (add_loc >= SN_TIMER_QUEUE_SIZE) + add_loc -= SN_TIMER_QUEUE_SIZE; + LIST_INSERT_HEAD(&la->sctpNatTimer.TimerQ[add_loc], assoc, timer_Q); + assoc->exp_loc = add_loc; +} + +/** @ingroup Timer + * @brief Remove an association from timer queue + * + * This is an O(1) operation to remove the association pointer from its + * current position in the timer queue + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + */ +static void +sctp_RmTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc) +{ + LIBALIAS_LOCK_ASSERT(la); + LIST_REMOVE(assoc, timer_Q);/* Note this is O(1) */ +} + + +/** @ingroup Timer + * @brief Reset timer in timer queue + * + * Reset the actual timeout for the specified association. If it is earlier than + * the existing timeout, then remove and re-install the association into the + * queue + * + * @param la Pointer to the relevant libalias instance + * @param assoc pointer to sctp association + * @param newexp New expiration time + */ +static void +sctp_ResetTimeOut(struct libalias *la, struct sctp_nat_assoc *assoc, int newexp) +{ + if (newexp < assoc->exp) { + sctp_RmTimeOut(la, assoc); + assoc->exp = newexp; + sctp_AddTimeOut(la, assoc); + } else { + assoc->exp = newexp; + } +} + +/** @ingroup Timer + * @brief Check timer Q against current time + * + * Loop through each entry in the timer queue since the last time we processed + * the timer queue until now (the current time). For each association in the + * event list, we remove it from that position in the timer queue and check if + * it has really expired.
If so we: + * - Log the timer expiry + * - Remove the association from the NAT tables + * - Release the memory used by the association + * + * If the timer hasn't really expired we place the association into its new + * correct position in the timer queue. + * + * @param la Pointer to the relevant libalias instance + */ +void +sctp_CheckTimers(struct libalias *la) +{ + struct sctp_nat_assoc *assoc; + + LIBALIAS_LOCK_ASSERT(la); + while(la->timeStamp >= la->sctpNatTimer.loc_time) { + while (!LIST_EMPTY(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc])) { + assoc = LIST_FIRST(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc]); + //SLIST_REMOVE_HEAD(&la->sctpNatTimer.TimerQ[la->sctpNatTimer.cur_loc], timer_Q); + LIST_REMOVE(assoc, timer_Q); + if (la->timeStamp >= assoc->exp) { /* state expired */ + SN_LOG(((assoc->state == SN_CL)?(SN_LOG_DEBUG):(SN_LOG_INFO)), + logsctperror("Timer Expired", assoc->g_vtag, assoc->state, SN_TO_NODIR)); + RmSctpAssoc(la, assoc); + freeGlobalAddressList(assoc); + sn_free(assoc); + } else {/* state not expired, reschedule timer*/ + sctp_AddTimeOut(la, assoc); + } + } + /* Goto next location in the timer queue*/ + ++la->sctpNatTimer.loc_time; + if (++la->sctpNatTimer.cur_loc >= SN_TIMER_QUEUE_SIZE) + la->sctpNatTimer.cur_loc = 0; + } +} + +/* ---------------------------------------------------------------------- + * LOGGING CODE + * ---------------------------------------------------------------------- + */ +/** @addtogroup Logging + * + * The logging functions provide logging of different items ranging from logging + * a simple message, through logging an association details to logging the + * current state of the NAT tables + */ +/** @ingroup Logging + * @brief Log sctp nat errors + * + * @param errormsg Error message to be logged + * @param vtag Current Vtag + * @param error Error number + * @param direction Direction of packet + */ +static void +logsctperror(char* errormsg, uint32_t vtag, int error, int direction) +{ + char dir; + switch(direction) { + case SN_TO_LOCAL: + dir = 'L'; + break; + case SN_TO_GLOBAL: + dir = 'G'; + break; + default: + dir = '*'; + break; + } + SctpAliasLog("->%c %s (vt=%u) %d\n", dir, errormsg, ntohl(vtag), error); +} + +/** @ingroup Logging + * @brief Log what the parser parsed + * + * @param direction Direction of packet + * @param sm Pointer to sctp message information + */ +static void +logsctpparse(int direction, struct sctp_nat_msg *sm) +{ + char *ploc, *pstate; + switch(direction) { + case SN_TO_LOCAL: + ploc = "TO_LOCAL -"; + break; + case SN_TO_GLOBAL: + ploc = "TO_GLOBAL -"; + break; + default: + ploc = ""; + } + switch(sm->msg) { + case SN_SCTP_INIT: + pstate = "Init"; + break; + case SN_SCTP_INITACK: + pstate = "InitAck"; + break; + case SN_SCTP_ABORT: + pstate = "Abort"; + break; + case SN_SCTP_SHUTACK: + pstate = "ShutAck"; + break; + case SN_SCTP_SHUTCOMP: + pstate = "ShutComp"; + break; + case SN_SCTP_ASCONF: + pstate = "Asconf"; + break; + case SN_SCTP_ASCONFACK: + pstate = "AsconfAck"; + break; + case SN_SCTP_OTHER: + pstate = "Other"; + break; + default: + pstate = "***ERROR***"; + break; + } + SctpAliasLog("Parsed: %s %s\n", ploc, pstate); +} + +/** @ingroup Logging + * @brief Log an SCTP association's details + * + * @param assoc pointer to sctp association + * @param s Character that indicates the state of processing for this packet + */ +static void logsctpassoc(struct sctp_nat_assoc *assoc, char* s) +{ + struct sctp_GlobalAddress *G_Addr = NULL; + char *sp; + switch(assoc->state) { + case SN_ID: + sp = 
"ID "; + break; + case SN_INi: + sp = "INi "; + break; + case SN_INa: + sp = "INa "; + break; + case SN_UP: + sp = "UP "; + break; + case SN_CL: + sp = "CL "; + break; + case SN_RM: + sp = "RM "; + break; + default: + sp = "***ERROR***"; + break; + } + SctpAliasLog("%sAssoc: %s exp=%u la=%s lv=%u lp=%u gv=%u gp=%u tbl=%d\n", + s, sp, assoc->exp, inet_ntoa(assoc->l_addr), ntohl(assoc->l_vtag), + ntohs(assoc->l_port), ntohl(assoc->g_vtag), ntohs(assoc->g_port), + assoc->TableRegister); + /* list global addresses */ + LIST_FOREACH(G_Addr, &(assoc->Gaddr), list_Gaddr) { + SctpAliasLog("\t\tga=%s\n",inet_ntoa(G_Addr->g_addr)); + } +} + +/** @ingroup Logging + * @brief Output Global table to log + * + * @param la Pointer to the relevant libalias instance + */ +static void logSctpGlobal(struct libalias *la) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + + SctpAliasLog("G->\n"); + for (i=0; i < la->sctpNatTableSize; i++) { + LIST_FOREACH(assoc, &la->sctpTableGlobal[i], list_G) { + logsctpassoc(assoc, " "); + } + } +} + +/** @ingroup Logging + * @brief Output Local table to log + * + * @param la Pointer to the relevant libalias instance + */ +static void logSctpLocal(struct libalias *la) +{ + u_int i; + struct sctp_nat_assoc *assoc = NULL; + + SctpAliasLog("L->\n"); + for (i=0; i < la->sctpNatTableSize; i++) { + LIST_FOREACH(assoc, &la->sctpTableLocal[i], list_L) { + logsctpassoc(assoc, " "); + } + } +} + +/** @ingroup Logging + * @brief Output timer queue to log + * + * @param la Pointer to the relevant libalias instance + */ +static void logTimerQ(struct libalias *la) +{ + static char buf[50]; + u_int i; + struct sctp_nat_assoc *assoc = NULL; + + SctpAliasLog("t->\n"); + for (i=0; i < SN_TIMER_QUEUE_SIZE; i++) { + LIST_FOREACH(assoc, &la->sctpNatTimer.TimerQ[i], timer_Q) { + snprintf(buf, 50, " l=%u ",i); + //SctpAliasLog(la->logDesc," l=%d ",i); + logsctpassoc(assoc, buf); + } + } +} + +/** @ingroup Logging + * @brief Sctp NAT logging function + * + * This function is based on a similar function in alias_db.c + * + * @param str/stream logging descriptor + * @param format printf type string + */ +#ifdef _KERNEL +static void +SctpAliasLog(const char *format, ...) +{ + char buffer[LIBALIAS_BUF_SIZE]; + va_list ap; + va_start(ap, format); + vsnprintf(buffer, LIBALIAS_BUF_SIZE, format, ap); + va_end(ap); + log(LOG_SECURITY | LOG_INFO, + "alias_sctp: %s", buffer); +} +#else +static void +SctpAliasLog(FILE *stream, const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + vfprintf(stream, format, ap); + va_end(ap); + fflush(stream); +} +#endif diff --git a/freebsd/sys/netinet/libalias/alias_sctp.h b/freebsd/sys/netinet/libalias/alias_sctp.h new file mode 100644 index 00000000..9ea21959 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_sctp.h @@ -0,0 +1,201 @@ +/*- + * Copyright (c) 2008 + * Swinburne University of Technology, Melbourne, Australia. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Alias_sctp forms part of the libalias kernel module to handle + * Network Address Translation (NAT) for the SCTP protocol. + * + * This software was developed by David A. Hayes + * with leadership and advice from Jason But + * + * The design is outlined in CAIA technical report number 080618A + * (D. Hayes and J. But, "Alias_sctp Version 0.1: SCTP NAT implementation in IPFW") + * + * Development is part of the CAIA SONATA project, + * proposed by Jason But and Grenville Armitage: + * http://caia.swin.edu.au/urp/sonata/ + * + * + * This project has been made possible in part by a grant from + * the Cisco University Research Program Fund at Community + * Foundation Silicon Valley. + * + */ + +/* $FreeBSD$ */ + +#ifndef _ALIAS_SCTP_HH_ +#define _ALIAS_SCTP_HH_ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#endif // #ifdef _KERNEL +#include + +#include +#include +#include + +#include +#include +#include + +/** + * These are defined in sctp_os_bsd.h, but it can't be included due to its local file + * inclusion, so I'm defining them here. + * + */ +#include +#include +/* The packed define for 64 bit platforms */ +#ifndef SCTP_PACKED +#define SCTP_PACKED __attribute__((packed)) +#endif //#ifndef SCTP_PACKED +#ifndef SCTP_UNUSED +#define SCTP_UNUSED __attribute__((unused)) +#endif //#ifndef SCTP_UNUSED + + +#include +//#include --might be needed later for mbuf stuff +#include + +#ifndef _KERNEL +#include +#include +#include +#endif //#ifdef _KERNEL + + +#define LINK_SCTP IPPROTO_SCTP + + +#define SN_TO_LOCAL 0 /**< packet traveling from global to local */ +#define SN_TO_GLOBAL 1 /**< packet traveling from local to global */ +#define SN_TO_NODIR 99 /**< used where direction is not important */ + +#define SN_NAT_PKT 0x0000 /**< Network Address Translate packet */ +#define SN_DROP_PKT 0x0001 /**< drop packet (don't forward it) */ +#define SN_PROCESSING_ERROR 0x0003 /**< Packet processing error */ +#define SN_REPLY_ABORT 0x0010 /**< Reply with ABORT to sender (don't forward it) */ +#define SN_SEND_ABORT 0x0020 /**< Send ABORT to destination */ +#define SN_TX_ABORT 0x0030 /**< mask for transmitting abort */ +#define SN_REFLECT_ERROR 0x0100 /**< Reply with ERROR to sender on OOTB packet Tbit set */ +#define SN_REPLY_ERROR 0x0200 /**< Reply with ERROR to sender on ASCONF clash */ +#define SN_TX_ERROR 0x0300 /**< mask for transmitting error */ + + +#define PKT_ALIAS_RESPOND 0x1000 /**< Signal to libalias that there is a response packet to send */ +/* + * Data structures + */ + +/** + * @brief sctp association information + * + * Structure that contains information about a particular sctp association + * currently under Network Address Translation. 
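The timer machinery documented earlier (sctp_CheckTimers() and the struct sctp_nat_timer defined later in this header) is a circular timer queue: bucket i holds the associations due when the queue cursor reaches slot i, and the cursor advances one slot per second of uptime. The following is a minimal user-space sketch of that idea, assuming nothing from libalias; every name in it (wheel, add_timeout, check_timers, WHEEL_SIZE) is a hypothetical stand-in.

#include <sys/queue.h>
#include <stdio.h>

#define WHEEL_SIZE 8			/* stand-in for SN_TIMER_QUEUE_SIZE */

struct entry {
	int exp;			/* absolute expiry time in ticks */
	LIST_ENTRY(entry) link;
};

static LIST_HEAD(bucket, entry) wheel[WHEEL_SIZE];
static int now, loc_time, cur_loc;	/* cursor state, as in sctp_nat_timer */

static void
add_timeout(struct entry *e)
{
	/* Entries due beyond the wheel park in the farthest bucket and
	 * are rescheduled when the cursor reaches them. */
	int d = e->exp - now;

	if (d > WHEEL_SIZE - 1)
		d = WHEEL_SIZE - 1;
	LIST_INSERT_HEAD(&wheel[(cur_loc + d) % WHEEL_SIZE], e, link);
}

static void
check_timers(void)
{
	struct entry *e;

	while (now >= loc_time) {
		while (!LIST_EMPTY(&wheel[cur_loc])) {
			e = LIST_FIRST(&wheel[cur_loc]);
			LIST_REMOVE(e, link);
			if (now >= e->exp)
				printf("expired at t=%d\n", now);
			else
				add_timeout(e);	/* not yet due: requeue */
		}
		++loc_time;
		if (++cur_loc >= WHEEL_SIZE)
			cur_loc = 0;
	}
}

int
main(void)
{
	static struct entry e = { .exp = 3 };
	int i;

	for (i = 0; i < WHEEL_SIZE; i++)
		LIST_INIT(&wheel[i]);
	add_timeout(&e);
	for (now = 0; now <= 3; now++)
		check_timers();
	return (0);
}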
+ * Information is stored in network byte order (as is libalias)*** + */ +struct sctp_nat_assoc { + uint32_t l_vtag; /**< local side verification tag */ + uint16_t l_port; /**< local side port number */ + uint32_t g_vtag; /**< global side verification tag */ + uint16_t g_port; /**< global side port number */ + struct in_addr l_addr; /**< local ip address */ + struct in_addr a_addr; /**< alias ip address */ + int state; /**< current state of NAT association */ + int TableRegister; /**< stores which look up tables association is registered in */ + int exp; /**< timer expiration in seconds from uptime */ + int exp_loc; /**< current location in timer_Q */ + int num_Gaddr; /**< number of global IP addresses in the list */ + LIST_HEAD(sctpGlobalAddresshead,sctp_GlobalAddress) Gaddr; /**< List of global addresses */ + LIST_ENTRY (sctp_nat_assoc) list_L; /**< Linked list of pointers for Local table*/ + LIST_ENTRY (sctp_nat_assoc) list_G; /**< Linked list of pointers for Global table */ + LIST_ENTRY (sctp_nat_assoc) timer_Q; /**< Linked list of pointers for timer Q */ +//Using libalias locking +}; + +struct sctp_GlobalAddress { + struct in_addr g_addr; + LIST_ENTRY (sctp_GlobalAddress) list_Gaddr; /**< Linked list of pointers for Global table */ +}; + +/** + * @brief SCTP chunk of interest + * + * The only chunks whose contents are of any interest are the INIT and ASCONF_AddIP + */ +union sctpChunkOfInt { + struct sctp_init *Init; /**< Pointer to Init Chunk */ + struct sctp_init_ack *InitAck; /**< Pointer to Init Chunk */ + struct sctp_paramhdr *Asconf; /**< Pointer to ASCONF chunk */ +}; + + +/** + * @brief SCTP message + * + * Structure containing the relevant information from the SCTP message + */ +struct sctp_nat_msg { + uint16_t msg; /**< one of the key messages defined above */ +#ifndef __rtems__ +#ifdef INET6 + // struct ip6_hdr *ip_hdr; /**< pointer to ip packet header */ /*no inet6 support yet*/ +#else + struct ip *ip_hdr; /**< pointer to ip packet header */ +#endif //#ifdef INET6 +#else //__rtems__ + struct ip *ip_hdr; /**< pointer to ip packet header */ +#endif //__rtems__ + struct sctphdr *sctp_hdr; /**< pointer to sctp common header */ + union sctpChunkOfInt sctpchnk; /**< union of pointers to the chunk of interest */ + int chunk_length; /**< length of chunk of interest */ +}; + + +/** + * @brief sctp nat timer queue structure + * + */ + +struct sctp_nat_timer { + int loc_time; /**< time in seconds for the current location in the queue */ + int cur_loc; /**< index of the current location in the circular queue */ + LIST_HEAD(sctpTimerQ,sctp_nat_assoc) *TimerQ; /**< List of associations at this position in the timer Q */ +}; + + + +#endif //#ifndef _ALIAS_SCTP_H diff --git a/freebsd/sys/netinet/libalias/alias_skinny.c b/freebsd/sys/netinet/libalias/alias_skinny.c new file mode 100644 index 00000000..4d311efe --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_skinny.c @@ -0,0 +1,449 @@ +#include + +/*- + * alias_skinny.c + * + * Copyright (c) 2002, 2003 MarcusCom, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Joe Marcus Clarke + * + * $FreeBSD$ + */ + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#endif + +static void +AliasHandleSkinny(struct libalias *, struct ip *, struct alias_link *); + +static int +fingerprint(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL) + return (-1); + if (la->skinnyPort != 0 && (ntohs(*ah->sport) == la->skinnyPort || + ntohs(*ah->dport) == la->skinnyPort)) + return (0); + return (-1); +} + +static int +protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + AliasHandleSkinny(la, pip, ah->lnk); + return (0); +} + +struct proto_handler handlers[] = { + { + .pri = 110, + .dir = IN|OUT, + .proto = TCP, + .fingerprint = &fingerprint, + .protohandler = &protohandler + }, + { EOH } +}; + +static int +mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + LibAliasAttachHandlers(handlers); + break; + case MOD_UNLOAD: + error = 0; + LibAliasDetachHandlers(handlers); + break; + default: + error = EINVAL; + } + return (error); +} + +#ifdef _KERNEL +static +#endif +moduledata_t alias_mod = { + "alias_skinny", mod_handler, NULL +}; + +#ifdef _KERNEL +DECLARE_MODULE(alias_skinny, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_VERSION(alias_skinny, 1); +MODULE_DEPEND(alias_skinny, libalias, 1, 1, 1); +#endif + +/* + * alias_skinny.c handles the translation for the Cisco Skinny Station + * protocol. Skinny typically uses TCP port 2000 to set up calls between + * a Cisco Call Manager and a Cisco IP phone. When a phone comes on line, + * it first needs to register with the Call Manager. To do this it sends + * a registration message. This message contains the IP address of the + * IP phone. This message must then be translated to reflect our global + * IP address. Along with the registration message (and usually in the + * same packet), the phone sends an IP port message. This message indicates + * the TCP port over which it will communicate. + * + * When a call is placed from the phone, the Call Manager will send an + * Open Receive Channel message to the phone to let the caller know someone + * has answered. The phone then sends back an Open Receive Channel + * Acknowledgement. 
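A compact way to keep the call flow described here straight is to pair each message with what the NAT must do to it. The table below is an illustrative summary only, not module code; the message IDs match the defines appearing just below, and the action notes paraphrase the handlers that follow.

#include <stdint.h>
#include <stdio.h>

/* Message IDs, matching the defines below. */
#define REG_MSG       0x00000001
#define IP_PORT_MSG   0x00000002
#define OPNRCVCH_ACK  0x00000022
#define START_MEDIATX 0x0000008a

/* Illustrative summary of what each message forces the NAT to touch. */
static const struct {
	uint32_t id;
	const char *name;
	const char *nat_action;
} skinny_msgs[] = {
	{ REG_MSG,       "RegisterMessage",        "rewrite ipAddr to alias address" },
	{ IP_PORT_MSG,   "IpPortMessage",          "rewrite 32-bit stationIpPort" },
	{ OPNRCVCH_ACK,  "OpenReceiveChannelAck",  "rewrite ipAddr and RTP port" },
	{ START_MEDIATX, "StartMediaTransmission", "carries remote party addr/port (translation TODO here)" },
};

int
main(void)
{
	size_t i;

	for (i = 0; i < sizeof(skinny_msgs) / sizeof(skinny_msgs[0]); i++)
		printf("0x%08x %-22s %s\n", (unsigned)skinny_msgs[i].id,
		    skinny_msgs[i].name, skinny_msgs[i].nat_action);
	return (0);
}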
In this packet, the phone sends its IP address again, + * and the UDP port over which the voice traffic should flow. These values + * need translation. Right after the Open Receive Channel Acknowledgement, + * the Call Manager sends a Start Media Transmission message indicating the + * call is connected. This message contains the IP address and UDP port + * number of the remote (called) party. Once this message is translated, the + * call can commence. The called party sends the first UDP packet to the + * calling phone at the pre-arranged UDP port in the Open Receive Channel + * Acknowledgement. + * + * Skinny is a Cisco-proprietary protocol and is a trademark of Cisco Systems, + * Inc. All rights reserved. +*/ + +/* #define LIBALIAS_DEBUG 1 */ + +/* Message types that need translating */ +#define REG_MSG 0x00000001 +#define IP_PORT_MSG 0x00000002 +#define OPNRCVCH_ACK 0x00000022 +#define START_MEDIATX 0x0000008a + +struct skinny_header { + u_int32_t len; + u_int32_t reserved; + u_int32_t msgId; +}; + +struct RegisterMessage { + u_int32_t msgId; + char devName [16]; + u_int32_t uid; + u_int32_t instance; + u_int32_t ipAddr; + u_char devType; + u_int32_t maxStreams; +}; + +struct IpPortMessage { + u_int32_t msgId; + u_int32_t stationIpPort; /* Note: Skinny uses 32-bit port + * numbers */ +}; + +struct OpenReceiveChannelAck { + u_int32_t msgId; + u_int32_t status; + u_int32_t ipAddr; + u_int32_t port; + u_int32_t passThruPartyID; +}; + +struct StartMediaTransmission { + u_int32_t msgId; + u_int32_t conferenceID; + u_int32_t passThruPartyID; + u_int32_t remoteIpAddr; + u_int32_t remotePort; + u_int32_t MSPacket; + u_int32_t payloadCap; + u_int32_t precedence; + u_int32_t silenceSuppression; + u_short maxFramesPerPacket; + u_int32_t G723BitRate; +}; + +typedef enum { + ClientToServer = 0, + ServerToClient = 1 +} ConvDirection; + + +static int +alias_skinny_reg_msg(struct RegisterMessage *reg_msg, struct ip *pip, + struct tcphdr *tc, struct alias_link *lnk, + ConvDirection direction) +{ + (void)direction; + + reg_msg->ipAddr = (u_int32_t) GetAliasAddress(lnk).s_addr; + + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + + return (0); +} + +static int +alias_skinny_startmedia(struct StartMediaTransmission *start_media, + struct ip *pip, struct tcphdr *tc, + struct alias_link *lnk, u_int32_t localIpAddr, + ConvDirection direction) +{ + struct in_addr dst, src; + + (void)pip; + (void)tc; + (void)lnk; + (void)direction; + + dst.s_addr = start_media->remoteIpAddr; + src.s_addr = localIpAddr; + + /* + * XXX I should probably handle in bound global translations as + * well.
+ */ + + return (0); +} + +static int +alias_skinny_port_msg(struct IpPortMessage *port_msg, struct ip *pip, + struct tcphdr *tc, struct alias_link *lnk, + ConvDirection direction) +{ + (void)direction; + + port_msg->stationIpPort = (u_int32_t) ntohs(GetAliasPort(lnk)); + + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + return (0); +} + +static int +alias_skinny_opnrcvch_ack(struct libalias *la, struct OpenReceiveChannelAck *opnrcvch_ack, + struct ip *pip, struct tcphdr *tc, + struct alias_link *lnk, u_int32_t * localIpAddr, + ConvDirection direction) +{ + struct in_addr null_addr; + struct alias_link *opnrcv_lnk; + u_int32_t localPort; + + (void)lnk; + (void)direction; + + *localIpAddr = (u_int32_t) opnrcvch_ack->ipAddr; + localPort = opnrcvch_ack->port; + + null_addr.s_addr = INADDR_ANY; + opnrcv_lnk = FindUdpTcpOut(la, pip->ip_src, null_addr, + htons((u_short) opnrcvch_ack->port), 0, + IPPROTO_UDP, 1); + opnrcvch_ack->ipAddr = (u_int32_t) GetAliasAddress(opnrcv_lnk).s_addr; + opnrcvch_ack->port = (u_int32_t) ntohs(GetAliasPort(opnrcv_lnk)); + + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + return (0); +} + +static void +AliasHandleSkinny(struct libalias *la, struct ip *pip, struct alias_link *lnk) +{ + size_t hlen, tlen, dlen; + struct tcphdr *tc; + u_int32_t msgId, t, len, lip; + struct skinny_header *sd; + size_t orig_len, skinny_hdr_len = sizeof(struct skinny_header); + ConvDirection direction; + + lip = -1; + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + sd = (struct skinny_header *)tcp_next(tc); + + /* + * XXX This direction is reserved for future use. I still need to + * handle the scenario where the call manager is on the inside, and + * the calling phone is on the global outside. + */ + if (ntohs(tc->th_dport) == la->skinnyPort) { + direction = ClientToServer; + } else if (ntohs(tc->th_sport) == la->skinnyPort) { + direction = ServerToClient; + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Invalid port number, not a Skinny packet\n"); +#endif + return; + } + + orig_len = dlen; + /* + * Skinny packets can contain many messages. We need to loop + * through the packet using len to determine message boundaries. + * This comes into play big time with port messages being in the + * same packet as register messages. Also, open receive channel + * acks are usually buried in a packet some 400 bytes long.
+ */ + while (dlen >= skinny_hdr_len) { + len = (sd->len); + msgId = (sd->msgId); + t = len; + + if (t > orig_len || t > dlen) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Not a skinny packet, invalid length \n"); +#endif + return; + } + switch (msgId) { + case REG_MSG: { + struct RegisterMessage *reg_mesg; + + if (len < (int)sizeof(struct RegisterMessage)) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Not a skinny packet, bad registration message\n"); +#endif + return; + } + reg_mesg = (struct RegisterMessage *)&sd->msgId; +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Received a register message"); +#endif + alias_skinny_reg_msg(reg_mesg, pip, tc, lnk, direction); + break; + } + case IP_PORT_MSG: { + struct IpPortMessage *port_mesg; + + if (len < (int)sizeof(struct IpPortMessage)) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Not a skinny packet, port message\n"); +#endif + return; + } +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Received ipport message\n"); +#endif + port_mesg = (struct IpPortMessage *)&sd->msgId; + alias_skinny_port_msg(port_mesg, pip, tc, lnk, direction); + break; + } + case OPNRCVCH_ACK: { + struct OpenReceiveChannelAck *opnrcvchn_ack; + + if (len < (int)sizeof(struct OpenReceiveChannelAck)) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Not a skinny packet, packet,OpnRcvChnAckMsg\n"); +#endif + return; + } +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Received open rcv channel msg\n"); +#endif + opnrcvchn_ack = (struct OpenReceiveChannelAck *)&sd->msgId; + alias_skinny_opnrcvch_ack(la, opnrcvchn_ack, pip, tc, lnk, &lip, direction); + break; + } + case START_MEDIATX: { + struct StartMediaTransmission *startmedia_tx; + + if (len < (int)sizeof(struct StartMediaTransmission)) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Not a skinny packet,StartMediaTx Message\n"); +#endif + return; + } + if (lip == -1) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: received a" + " packet,StartMediaTx Message before" + " packet,OpnRcvChnAckMsg\n"); +#endif + return; + } + +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/Skinny: Received start media trans msg\n"); +#endif + startmedia_tx = (struct StartMediaTransmission *)&sd->msgId; + alias_skinny_startmedia(startmedia_tx, pip, tc, lnk, lip, direction); + break; + } + default: + break; + } + /* Place the pointer at the next message in the packet. */ + dlen -= len + (skinny_hdr_len - sizeof(msgId)); + sd = (struct skinny_header *)(((char *)&sd->msgId) + len); + } +} diff --git a/freebsd/sys/netinet/libalias/alias_smedia.c b/freebsd/sys/netinet/libalias/alias_smedia.c new file mode 100644 index 00000000..3d558a94 --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_smedia.c @@ -0,0 +1,551 @@ +#include + +/* + * alias_smedia.c + * + * Copyright (c) 2000 Whistle Communications, Inc. + * All rights reserved. + * + * Subject to the following obligations and disclaimer of warranty, use and + * redistribution of this software, in source or object code forms, with or + * without modifications are expressly permitted by Whistle Communications; + * provided, however, that: + * 1. Any and all reproductions of the source or object code must include the + * copyright notice above and the following disclaimer of warranties; and + * 2. No rights are granted, in any manner or form, to use Whistle + * Communications, Inc.
trademarks, including the mark "WHISTLE + * COMMUNICATIONS" on advertising, endorsements, or otherwise except as + * such appears in the above copyright notice or in the software. + * + * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND + * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO + * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, + * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. + * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY + * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS + * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. + * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES + * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING + * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * Copyright (c) 2000 Junichi SATOH + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Authors: Erik Salander + * Junichi SATOH + * + */ + +#include +__FBSDID("$FreeBSD$"); + +/* + Alias_smedia.c is meant to contain the aliasing code for streaming media + protocols. It performs special processing for RTSP sessions under TCP. + Specifically, when a SETUP request is sent by a client, or a 200 reply + is sent by a server, it is intercepted and modified. The address is + changed to the gateway machine and an aliasing port is used. + + More specifically, the "client_port" configuration parameter is + parsed for SETUP requests. The "server_port" configuration parameter is + parsed for 200 replies emanating from a server. This is intended to handle + the unicast case.
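Before the redirection caveats that follow, a toy illustration of the rewrite just described: locate client_port in a Transport header and substitute an aliased range. alias_rtsp_out() further below does the real work, including the sequence-number bookkeeping for the length change; the names and the fixed alias port here are hypothetical.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char line[] = "Transport: RTP/AVP;unicast;client_port=5000-5001";
	char out[128];
	unsigned lo, hi;
	char *p = strstr(line, "client_port=");

	if (p != NULL && sscanf(p, "client_port=%u-%u", &lo, &hi) == 2) {
		/* Hypothetical fixed alias base; the module asks
		 * FindNewPortGroup() for a real contiguous range. */
		unsigned alias_lo = 40000, alias_hi = alias_lo + (hi - lo);

		snprintf(out, sizeof(out), "%.*sclient_port=%u-%u",
		    (int)(p - line), line, alias_lo, alias_hi);
		printf("%s\n", out);	/* rewritten line may change length */
	}
	return (0);
}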
+ + RTSP also allows a redirection of a stream to another client by using the + "destination" configuration parameter. The destination config parm would + indicate a different IP address. This function is NOT supported by the + RTSP translation code below. + + The RTSP multicast functions without any address translation intervention. + + For this routine to work, the SETUP/200 must fit entirely + into a single TCP packet. This is typically the case, but exceptions + can easily be envisioned under the actual specifications. + + Probably the most troubling aspect of the approach taken here is + that the new SETUP/200 will typically be a different length, and + this causes a certain amount of bookkeeping to keep track of the + changes of sequence and acknowledgment numbers, since the client + machine is totally unaware of the modification to the TCP stream. + + Initial version: May, 2000 (eds) +*/ + +#ifdef _KERNEL +#include +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#else +#include +#include +#endif + +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define TFTP_PORT_NUMBER 69 + +static void +AliasHandleRtspOut(struct libalias *, struct ip *, struct alias_link *, + int maxpacketsize); +static int +fingerprint(struct libalias *la, struct alias_data *ah) +{ + + if (ah->dport != NULL && ah->aport != NULL && ah->sport != NULL && + ntohs(*ah->dport) == TFTP_PORT_NUMBER) + return (0); + if (ah->dport == NULL || ah->sport == NULL || ah->lnk == NULL || + ah->maxpktsize == 0) + return (-1); + if (ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(*ah->sport) == RTSP_CONTROL_PORT_NUMBER_1 + || ntohs(*ah->dport) == RTSP_CONTROL_PORT_NUMBER_2 + || ntohs(*ah->sport) == RTSP_CONTROL_PORT_NUMBER_2) + return (0); + return (-1); +} + +static int +protohandler(struct libalias *la, struct ip *pip, struct alias_data *ah) +{ + + if (ntohs(*ah->dport) == TFTP_PORT_NUMBER) + FindRtspOut(la, pip->ip_src, pip->ip_dst, + *ah->sport, *ah->aport, IPPROTO_UDP); + else AliasHandleRtspOut(la, pip, ah->lnk, ah->maxpktsize); + return (0); +} + +struct proto_handler handlers[] = { + { + .pri = 100, + .dir = OUT, + .proto = TCP|UDP, + .fingerprint = &fingerprint, + .protohandler = &protohandler + }, + { EOH } +}; + +static int +mod_handler(module_t mod, int type, void *data) +{ + int error; + + switch (type) { + case MOD_LOAD: + error = 0; + LibAliasAttachHandlers(handlers); + break; + case MOD_UNLOAD: + error = 0; + LibAliasDetachHandlers(handlers); + break; + default: + error = EINVAL; + } + return (error); +} + +#ifdef _KERNEL +static +#endif +moduledata_t alias_mod = { + "alias_smedia", mod_handler, NULL +}; + +#ifdef _KERNEL +DECLARE_MODULE(alias_smedia, alias_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND); +MODULE_VERSION(alias_smedia, 1); +MODULE_DEPEND(alias_smedia, libalias, 1, 1, 1); +#endif + +#define RTSP_CONTROL_PORT_NUMBER_1 554 +#define RTSP_CONTROL_PORT_NUMBER_2 7070 +#define RTSP_PORT_GROUP 2 + +#define ISDIGIT(a) (((a) >= '0') && ((a) <= '9')) + +static int +search_string(char *data, int dlen, const char *search_str) +{ + int i, j, k; + int search_str_len; + + search_str_len = strlen(search_str); + for (i = 0; i < dlen - search_str_len; i++) { + for (j = i, k = 0; j < dlen - search_str_len; j++, k++) { + if (data[j] != search_str[k] && + data[j] != search_str[k] - ('a' - 'A')) { + break; + } + if (k == search_str_len - 1) { + return (j + 1); + } 
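/*
 * The second comparison above also accepts search_str[k] - ('a' - 'A'),
 * the upper-case form of a lower-case pattern character, which makes the
 * scan case-insensitive for patterns such as "transport:" matching
 * "Transport:".  On success the returned index points one character past
 * the end of the match, so callers can index straight to the value.
 */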
+ } + } + return (-1); +} + +static int +alias_rtsp_out(struct libalias *la, struct ip *pip, + struct alias_link *lnk, + char *data, + const char *port_str) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + int i, j, pos, state, port_dlen, new_dlen, delta; + u_short p[2], new_len; + u_short sport, eport, base_port; + u_short salias = 0, ealias = 0, base_alias = 0; + const char *transport_str = "transport:"; + char newdata[2048], *port_data, *port_newdata, stemp[80]; + int links_created = 0, pkt_updated = 0; + struct alias_link *rtsp_lnk = NULL; + struct in_addr null_addr; + + /* Calculate data length of TCP packet */ + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + /* Find keyword, "Transport: " */ + pos = search_string(data, dlen, transport_str); + if (pos < 0) { + return (-1); + } + port_data = data + pos; + port_dlen = dlen - pos; + + memcpy(newdata, data, pos); + port_newdata = newdata + pos; + + while (port_dlen > (int)strlen(port_str)) { + /* Find keyword, appropriate port string */ + pos = search_string(port_data, port_dlen, port_str); + if (pos < 0) { + break; + } + memcpy(port_newdata, port_data, pos + 1); + port_newdata += (pos + 1); + + p[0] = p[1] = 0; + sport = eport = 0; + state = 0; + for (i = pos; i < port_dlen; i++) { + switch (state) { + case 0: + if (port_data[i] == '=') { + state++; + } + break; + case 1: + if (ISDIGIT(port_data[i])) { + p[0] = p[0] * 10 + port_data[i] - '0'; + } else { + if (port_data[i] == ';') { + state = 3; + } + if (port_data[i] == '-') { + state++; + } + } + break; + case 2: + if (ISDIGIT(port_data[i])) { + p[1] = p[1] * 10 + port_data[i] - '0'; + } else { + state++; + } + break; + case 3: + base_port = p[0]; + sport = htons(p[0]); + eport = htons(p[1]); + + if (!links_created) { + + links_created = 1; + /* + * Find an even numbered port + * number base that satisfies the + * contiguous number of ports we + * need + */ + null_addr.s_addr = 0; + if (0 == (salias = FindNewPortGroup(la, null_addr, + FindAliasAddress(la, pip->ip_src), + sport, 0, + RTSP_PORT_GROUP, + IPPROTO_UDP, 1))) { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot find contiguous RTSP data ports\n"); +#endif + } else { + + base_alias = ntohs(salias); + for (j = 0; j < RTSP_PORT_GROUP; j++) { + /* + * Establish link + * to port found in + * RTSP packet + */ + rtsp_lnk = FindRtspOut(la, GetOriginalAddress(lnk), null_addr, + htons(base_port + j), htons(base_alias + j), + IPPROTO_UDP); + if (rtsp_lnk != NULL) { +#ifndef NO_FW_PUNCH + /* + * Punch + * hole in + * firewall + */ + PunchFWHole(rtsp_lnk); +#endif + } else { +#ifdef LIBALIAS_DEBUG + fprintf(stderr, + "PacketAlias/RTSP: Cannot allocate RTSP data ports\n"); +#endif + break; + } + } + } + ealias = htons(base_alias + (RTSP_PORT_GROUP - 1)); + } + if (salias && rtsp_lnk) { + + pkt_updated = 1; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(salias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + + if (eport != 0) { + *port_newdata = '-'; + port_newdata++; + + /* Copy into IP packet */ + sprintf(stemp, "%d", ntohs(ealias)); + memcpy(port_newdata, stemp, strlen(stemp)); + port_newdata += strlen(stemp); + } + *port_newdata = ';'; + port_newdata++; + } + state++; + break; + } + if (state > 3) { + break; + } + } + port_data += i; + port_dlen -= i; + } + + if (!pkt_updated) + return (-1); + + memcpy(port_newdata, port_data, port_dlen); + port_newdata += port_dlen; + *port_newdata = 
'\0'; + + /* Create new packet */ + new_dlen = port_newdata - newdata; + memcpy(data, newdata, new_dlen); + + SetAckModified(lnk); + tc = (struct tcphdr *)ip_next(pip); + delta = GetDeltaSeqOut(tc->th_seq, lnk); + AddSeq(lnk, delta + new_dlen - dlen, pip->ip_hl, pip->ip_len, + tc->th_seq, tc->th_off); + + new_len = htons(hlen + new_dlen); + DifferentialChecksum(&pip->ip_sum, + &new_len, + &pip->ip_len, + 1); + pip->ip_len = new_len; + + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + return (0); +} + +/* Support the protocol used by early versions of RealPlayer */ + +static int +alias_pna_out(struct libalias *la, struct ip *pip, + struct alias_link *lnk, + char *data, + int dlen) +{ + struct alias_link *pna_links; + u_short msg_id, msg_len; + char *work; + u_short alias_port, port; + struct tcphdr *tc; + + work = data; + work += 5; + while (work + 4 < data + dlen) { + memcpy(&msg_id, work, 2); + work += 2; + memcpy(&msg_len, work, 2); + work += 2; + if (ntohs(msg_id) == 0) { + /* end of options */ + return (0); + } + if ((ntohs(msg_id) == 1) || (ntohs(msg_id) == 7)) { + memcpy(&port, work, 2); + pna_links = FindUdpTcpOut(la, pip->ip_src, GetDestAddress(lnk), + port, 0, IPPROTO_UDP, 1); + if (pna_links != NULL) { +#ifndef NO_FW_PUNCH + /* Punch hole in firewall */ + PunchFWHole(pna_links); +#endif + tc = (struct tcphdr *)ip_next(pip); + alias_port = GetAliasPort(pna_links); + memcpy(work, &alias_port, 2); + + /* Compute TCP checksum for revised packet */ + tc->th_sum = 0; +#ifdef _KERNEL + tc->th_x2 = 1; +#else + tc->th_sum = TcpChecksum(pip); +#endif + } + } + work += ntohs(msg_len); + } + + return (0); +} + +static void +AliasHandleRtspOut(struct libalias *la, struct ip *pip, struct alias_link *lnk, int maxpacketsize) +{ + int hlen, tlen, dlen; + struct tcphdr *tc; + char *data; + const char *setup = "SETUP", *pna = "PNA", *str200 = "200"; + const char *okstr = "OK", *client_port_str = "client_port"; + const char *server_port_str = "server_port"; + int i, parseOk; + + (void)maxpacketsize; + + tc = (struct tcphdr *)ip_next(pip); + hlen = (pip->ip_hl + tc->th_off) << 2; + tlen = ntohs(pip->ip_len); + dlen = tlen - hlen; + + data = (char *)pip; + data += hlen; + + /* When aliasing a client, check for the SETUP request */ + if ((ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_1) || + (ntohs(tc->th_dport) == RTSP_CONTROL_PORT_NUMBER_2)) { + + if (dlen >= (int)strlen(setup)) { + if (memcmp(data, setup, strlen(setup)) == 0) { + alias_rtsp_out(la, pip, lnk, data, client_port_str); + return; + } + } + if (dlen >= (int)strlen(pna)) { + if (memcmp(data, pna, strlen(pna)) == 0) { + alias_pna_out(la, pip, lnk, data, dlen); + } + } + } else { + + /* + * When aliasing a server, check for the 200 reply + * Accomodate varying number of blanks between 200 & OK + */ + + if (dlen >= (int)strlen(str200)) { + + for (parseOk = 0, i = 0; + i <= dlen - (int)strlen(str200); + i++) { + if (memcmp(&data[i], str200, strlen(str200)) == 0) { + parseOk = 1; + break; + } + } + if (parseOk) { + + i += strlen(str200); /* skip string found */ + while (data[i] == ' ') /* skip blank(s) */ + i++; + + if ((dlen - i) >= (int)strlen(okstr)) { + + if (memcmp(&data[i], okstr, strlen(okstr)) == 0) + alias_rtsp_out(la, pip, lnk, data, server_port_str); + + } + } + } + } +} diff --git a/freebsd/sys/netinet/libalias/alias_util.c b/freebsd/sys/netinet/libalias/alias_util.c new file mode 100644 index 00000000..1e0c95ae --- /dev/null +++ b/freebsd/sys/netinet/libalias/alias_util.c @@ -0,0 
+1,178 @@ +#include + +/*- + * Copyright (c) 2001 Charles Mott + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + + +/* + Alias_util.c contains general utilities used by other functions + in the packet aliasing module. At the moment, there are functions + for computing IP header and TCP packet checksums. + + The checksum routines are based upon example code in a Unix networking + text written by Stevens (sorry, I can't remember the title -- but + at least this is a good author). + + Initial Version: August, 1996 (cjm) + + Version 1.7: January 9, 1997 + Added differential checksum update function. +*/ + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#else +#include +#include +#endif + +/* + * Note: the checksum routines assume that the actual checksum word has + * been zeroed out. 
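The note above is easiest to see with a runnable fragment. This is a self-contained sketch of the same one's-complement folding used by LibAliasInternetChecksum() below; cksum16 is a hypothetical name, and the odd-byte handling, like the original's, assumes a little-endian host.

#include <stdint.h>
#include <stdio.h>

/* Minimal RFC 1071-style one's-complement checksum over a buffer. */
static uint16_t
cksum16(const void *buf, int nbytes)
{
	const uint16_t *p = buf;
	uint32_t sum = 0;

	while (nbytes > 1) {
		sum += *p++;
		nbytes -= 2;
	}
	if (nbytes == 1)
		sum += *(const uint8_t *)p;	/* pad the odd byte with zero */
	sum = (sum >> 16) + (sum & 0xffff);	/* fold carries */
	sum += (sum >> 16);
	return ((uint16_t)~sum);
}

int
main(void)
{
	uint16_t words[4] = { 0x1122, 0x3344, 0x5566, 0 };

	words[3] = cksum16(words, sizeof(words));	/* fill the checksum word */
	/* Re-summing with the checksum in place must yield zero. */
	printf("verify: 0x%04x\n", cksum16(words, sizeof(words)));
	return (0);
}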
If the checksum word is filled with the proper value, + * then these routines will give a result of zero (useful for testing + * purposes); + */ +u_short +LibAliasInternetChecksum(struct libalias *la __unused, u_short * ptr, + int nbytes) +{ + int sum, oddbyte; + + LIBALIAS_LOCK(la); + sum = 0; + while (nbytes > 1) { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) { + oddbyte = 0; + ((u_char *) & oddbyte)[0] = *(u_char *) ptr; + ((u_char *) & oddbyte)[1] = 0; + sum += oddbyte; + } + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + LIBALIAS_UNLOCK(la); + return (~sum); +} + +#ifndef _KERNEL +u_short +IpChecksum(struct ip *pip) +{ + return (LibAliasInternetChecksum(NULL, (u_short *) pip, + (pip->ip_hl << 2))); + +} + +u_short +TcpChecksum(struct ip *pip) +{ + u_short *ptr; + struct tcphdr *tc; + int nhdr, ntcp, nbytes; + int sum, oddbyte; + + nhdr = pip->ip_hl << 2; + ntcp = ntohs(pip->ip_len) - nhdr; + + tc = (struct tcphdr *)ip_next(pip); + ptr = (u_short *) tc; + +/* Add up TCP header and data */ + nbytes = ntcp; + sum = 0; + while (nbytes > 1) { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) { + oddbyte = 0; + ((u_char *) & oddbyte)[0] = *(u_char *) ptr; + ((u_char *) & oddbyte)[1] = 0; + sum += oddbyte; + } +/* "Pseudo-header" data */ + ptr = (void *)&pip->ip_dst; + sum += *ptr++; + sum += *ptr; + ptr = (void *)&pip->ip_src; + sum += *ptr++; + sum += *ptr; + sum += htons((u_short) ntcp); + sum += htons((u_short) pip->ip_p); + +/* Roll over carry bits */ + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + +/* Return checksum */ + return ((u_short) ~ sum); +} +#endif /* not _KERNEL */ + +void +DifferentialChecksum(u_short * cksum, void *newp, void *oldp, int n) +{ + int i; + int accumulate; + u_short *new = newp; + u_short *old = oldp; + + accumulate = *cksum; + for (i = 0; i < n; i++) { + accumulate -= *new++; + accumulate += *old++; + } + + if (accumulate < 0) { + accumulate = -accumulate; + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) ~ accumulate; + } else { + accumulate = (accumulate >> 16) + (accumulate & 0xffff); + accumulate += accumulate >> 16; + *cksum = (u_short) accumulate; + } +} diff --git a/freebsd/sys/netinet/pim.h b/freebsd/sys/netinet/pim.h new file mode 100644 index 00000000..2f887cc2 --- /dev/null +++ b/freebsd/sys/netinet/pim.h @@ -0,0 +1,119 @@ +/*- + * Copyright (c) 1996-2000 + * University of Southern California/Information Sciences Institute. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
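DifferentialChecksum() above applies an RFC 1141-style incremental update so a small header edit does not force a full recompute. Below is a self-contained check of that rule against a full recompute for a single changed word; full_cksum and diff_cksum are hypothetical names mirroring the logic above.

#include <stdint.h>
#include <stdio.h>

static uint16_t
full_cksum(const uint16_t *w, int nwords)
{
	uint32_t sum = 0;
	int i;

	for (i = 0; i < nwords; i++)
		sum += w[i];
	sum = (sum >> 16) + (sum & 0xffff);
	sum += (sum >> 16);
	return ((uint16_t)~sum);
}

/* Adjust an existing checksum for one word changing from oldw to neww. */
static uint16_t
diff_cksum(uint16_t cksum, uint16_t neww, uint16_t oldw)
{
	int32_t acc = cksum;

	acc -= neww;
	acc += oldw;
	if (acc < 0) {
		acc = -acc;
		acc = (acc >> 16) + (acc & 0xffff);
		acc += acc >> 16;
		return ((uint16_t)~acc);
	}
	acc = (acc >> 16) + (acc & 0xffff);
	acc += acc >> 16;
	return ((uint16_t)acc);
}

int
main(void)
{
	uint16_t w[4] = { 0x1122, 0x3344, 0, 0x5566 };
	uint16_t c;

	w[2] = full_cksum(w, 4);		/* fill checksum word */
	c = diff_cksum(w[2], 0xEEEE, w[0]);	/* update for w[0] -> 0xEEEE */
	w[0] = 0xEEEE;
	w[2] = 0;
	printf("diff=0x%04x full=0x%04x\n", c, full_cksum(w, 4));
	return (0);
}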
IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_PIM_HH_ +#define _NETINET_PIM_HH_ + +/* + * Protocol Independent Multicast (PIM) definitions. + * RFC 2362, June 1998. + * + * Written by Ahmed Helmy, USC/SGI, July 1996. + * Modified by George Edmond Eddy (Rusty), ISI, February 1998. + * Modified by Pavlin Radoslavov, USC/ISI, May 1998, October 2000. + */ + +#include + +#ifndef _PIM_VT +#ifndef BYTE_ORDER +# error BYTE_ORDER is not defined! +#endif +#if (BYTE_ORDER != BIG_ENDIAN) && (BYTE_ORDER != LITTLE_ENDIAN) +# error BYTE_ORDER must be defined to either BIG_ENDIAN or LITTLE_ENDIAN +#endif +#endif /* ! _PIM_VT */ + +/* + * PIM packet header + */ +struct pim { +#ifdef _PIM_VT + uint8_t pim_vt; /* PIM version and message type */ +#else /* ! _PIM_VT */ +#if BYTE_ORDER == BIG_ENDIAN + u_int pim_vers:4, /* PIM protocol version */ + pim_type:4; /* PIM message type */ +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + u_int pim_type:4, /* PIM message type */ + pim_vers:4; /* PIM protocol version */ +#endif +#endif /* ! _PIM_VT */ + uint8_t pim_reserved; /* Reserved */ + uint16_t pim_cksum; /* IP-style checksum */ +}; +/* KAME-related name backward compatibility */ +#define pim_ver pim_vers +#define pim_rsv pim_reserved + +#ifdef _PIM_VT +#define PIM_MAKE_VT(v, t) (0xff & (((v) << 4) | (0x0f & (t)))) +#define PIM_VT_V(x) (((x) >> 4) & 0x0f) +#define PIM_VT_T(x) ((x) & 0x0f) +#endif /* _PIM_VT */ + +#define PIM_VERSION 2 +#define PIM_MINLEN 8 /* PIM message min. length */ +#define PIM_REG_MINLEN (PIM_MINLEN+20) /* PIM Register hdr + inner IPv4 hdr */ +#define PIM6_REG_MINLEN (PIM_MINLEN+40) /* PIM Register hdr + inner IPv6 hdr */ + +/* + * PIM message types + */ +#define PIM_HELLO 0x0 /* PIM-SM and PIM-DM */ +#define PIM_REGISTER 0x1 /* PIM-SM only */ +#define PIM_REGISTER_STOP 0x2 /* PIM-SM only */ +#define PIM_JOIN_PRUNE 0x3 /* PIM-SM and PIM-DM */ +#define PIM_BOOTSTRAP 0x4 /* PIM-SM only */ +#define PIM_ASSERT 0x5 /* PIM-SM and PIM-DM */ +#define PIM_GRAFT 0x6 /* PIM-DM only */ +#define PIM_GRAFT_ACK 0x7 /* PIM-DM only */ +#define PIM_CAND_RP_ADV 0x8 /* PIM-SM only */ +#define PIM_ALL_DF_ELECTION 0xa /* Bidir-PIM-SM only */ + +/* + * PIM-Register message flags + */ +#define PIM_BORDER_REGISTER 0x80000000U /* The Border bit (host-order) */ +#define PIM_NULL_REGISTER 0x40000000U /* The Null-Register bit (host-order)*/ + +/* + * All-PIM-Routers IPv4 and IPv6 multicast addresses + */ +#define INADDR_ALLPIM_ROUTERS_GROUP (uint32_t)0xe000000dU /* 224.0.0.13 */ +#define IN6ADDR_LINKLOCAL_ALLPIM_ROUTERS "ff02::d" +#define IN6ADDR_LINKLOCAL_ALLPIM_ROUTERS_INIT \ + {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0d }}} + +#endif /* _NETINET_PIM_HH_ */ diff --git a/freebsd/sys/netinet/pim_var.h b/freebsd/sys/netinet/pim_var.h new file mode 100644 index 00000000..9d80bbb2 --- /dev/null +++ b/freebsd/sys/netinet/pim_var.h @@ -0,0 +1,84 @@ +/*- + * Copyright (c) 1998-2000 + * University of Southern California/Information Sciences Institute. 
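When _PIM_VT is defined, the version/type octet is built and split with the accessors above. A tiny self-contained check, with the three macros copied verbatim from pim.h:

#include <stdio.h>
#include <stdint.h>

#define PIM_MAKE_VT(v, t) (0xff & (((v) << 4) | (0x0f & (t))))
#define PIM_VT_V(x) (((x) >> 4) & 0x0f)
#define PIM_VT_T(x) ((x) & 0x0f)

#define PIM_VERSION 2
#define PIM_HELLO 0x0

int
main(void)
{
	uint8_t vt = PIM_MAKE_VT(PIM_VERSION, PIM_HELLO);

	printf("pim_vt=0x%02x version=%u type=%u\n",
	    vt, (unsigned)PIM_VT_V(vt), (unsigned)PIM_VT_T(vt));
	return (0);
}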
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_PIM_VAR_HH_ +#define _NETINET_PIM_VAR_HH_ + +/* + * Protocol Independent Multicast (PIM), + * kernel variables and implementation-specific definitions. + * + * Written by George Edmond Eddy (Rusty), ISI, February 1998. + * Modified by Pavlin Radoslavov, USC/ISI, May 1998, Aug 1999, October 2000. + * Modified by Hitoshi Asaeda, WIDE, August 1998. + */ + +/* + * PIM statistics kept in the kernel + */ +struct pimstat { + u_quad_t pims_rcv_total_msgs; /* total PIM messages received */ + u_quad_t pims_rcv_total_bytes; /* total PIM bytes received */ + u_quad_t pims_rcv_tooshort; /* rcvd with too few bytes */ + u_quad_t pims_rcv_badsum; /* rcvd with bad checksum */ + u_quad_t pims_rcv_badversion; /* rcvd bad PIM version */ + u_quad_t pims_rcv_registers_msgs; /* rcvd regs. msgs (data only) */ + u_quad_t pims_rcv_registers_bytes; /* rcvd regs. bytes (data only) */ + u_quad_t pims_rcv_registers_wrongiif; /* rcvd regs. on wrong iif */ + u_quad_t pims_rcv_badregisters; /* rcvd invalid registers */ + u_quad_t pims_snd_registers_msgs; /* sent regs. msgs (data only) */ + u_quad_t pims_snd_registers_bytes; /* sent regs. bytes (data only) */ +}; + +#ifdef _KERNEL +#define PIMSTAT_ADD(name, val) V_pimstat.name += (val) +#define PIMSTAT_INC(name) PIMSTAT_ADD(name, 1) +#endif + +/* + * Names for PIM sysctl objects + */ +#define PIMCTL_STATS 1 /* statistics (read-only) */ +#define PIMCTL_MAXID 2 + +#define PIMCTL_NAMES { \ + { 0, 0 }, \ + { "stats", CTLTYPE_STRUCT }, \ +} + +#ifdef _KERNEL + +void pim_input(struct mbuf *, int); +SYSCTL_DECL(_net_inet_pim); +#endif + +#endif /* _NETINET_PIM_VAR_HH_ */ diff --git a/freebsd/sys/netinet/raw_ip.c b/freebsd/sys/netinet/raw_ip.c new file mode 100644 index 00000000..fb90880f --- /dev/null +++ b/freebsd/sys/netinet/raw_ip.c @@ -0,0 +1,1116 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. + * All rights reserved. 
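The PIMSTAT_ADD()/PIMSTAT_INC() macros above resolve counter names at compile time against the (virtualized) statistics instance. A user-space sketch of the same pattern, with a plain struct standing in for the kernel's V_pimstat:

#include <stdio.h>

struct pimstat {
	unsigned long pims_rcv_total_msgs;
	unsigned long pims_rcv_badsum;
};
static struct pimstat pimstat;		/* stand-in for V_pimstat */

/* Same shape as the kernel macros above. */
#define PIMSTAT_ADD(name, val)	(pimstat.name += (val))
#define PIMSTAT_INC(name)	PIMSTAT_ADD(name, 1)

int
main(void)
{
	PIMSTAT_INC(pims_rcv_total_msgs);
	PIMSTAT_ADD(pims_rcv_badsum, 2);
	printf("msgs=%lu badsum=%lu\n",
	    pimstat.pims_rcv_total_msgs, pimstat.pims_rcv_badsum);
	return (0);
}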
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)raw_ip.c 8.7 (Berkeley) 5/15/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef IPSEC +#include +#endif /*IPSEC*/ + +#include + +VNET_DEFINE(struct inpcbhead, ripcb); +VNET_DEFINE(struct inpcbinfo, ripcbinfo); + +#define V_ripcb VNET(ripcb) +#define V_ripcbinfo VNET(ripcbinfo) + +/* + * Control and data hooks for ipfw, dummynet, divert and so on. + * The data hooks are not used here but it is convenient + * to keep them all in one place. + */ +VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL; +VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL; + +int (*ip_dn_ctl_ptr)(struct sockopt *); +int (*ip_dn_io_ptr)(struct mbuf **, int, struct ip_fw_args *); +void (*ip_divert_ptr)(struct mbuf *, int); +int (*ng_ipfw_input_p)(struct mbuf **, int, + struct ip_fw_args *, int); + +/* + * Hooks for multicast routing. They all default to NULL, so leave them not + * initialized and rely on BSS being set to 0. + */ + +/* + * The socket used to communicate with the multicast routing daemon. + */ +VNET_DEFINE(struct socket *, ip_mrouter); + +/* + * The various mrouter and rsvp functions. 
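The function pointers above follow the optional-module hook pattern: they default to NULL (zeroed BSS) and the socket-option code below calls them only when a module has installed an implementation, otherwise returning EOPNOTSUPP. A minimal sketch of the pattern, with hypothetical names:

#include <stdio.h>
#include <errno.h>

/* Hook defaults to NULL via static storage, like the pointers above. */
static int (*demo_mrouter_set)(int);

static int
demo_set_option(int opt)
{
	/* Same shape as: ip_mrouter_set ? ip_mrouter_set(so, sopt) : EOPNOTSUPP */
	return (demo_mrouter_set != NULL ? demo_mrouter_set(opt) : EOPNOTSUPP);
}

static int
demo_module_impl(int opt)
{
	printf("module handled option %d\n", opt);
	return (0);
}

int
main(void)
{
	printf("before load: %d\n", demo_set_option(1));	/* EOPNOTSUPP */
	demo_mrouter_set = demo_module_impl;	/* "module load" installs the hook */
	printf("after load: %d\n", demo_set_option(1));
	return (0);
}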
+ */ +int (*ip_mrouter_set)(struct socket *, struct sockopt *); +int (*ip_mrouter_get)(struct socket *, struct sockopt *); +int (*ip_mrouter_done)(void); +int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, + struct ip_moptions *); +int (*mrt_ioctl)(u_long, caddr_t, int); +int (*legal_vif_num)(int); +u_long (*ip_mcast_src)(int); + +void (*rsvp_input_p)(struct mbuf *m, int off); +int (*ip_rsvp_vif)(struct socket *, struct sockopt *); +void (*ip_rsvp_force_done)(struct socket *); + +/* + * Hash functions + */ + +#define INP_PCBHASH_RAW_SIZE 256 +#define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \ + (((proto) + (laddr) + (faddr)) % (mask) + 1) + +static void +rip_inshash(struct inpcb *inp) +{ + struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + struct inpcbhead *pcbhash; + int hash; + + INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_WLOCK_ASSERT(inp); + + if (inp->inp_ip_p != 0 && + inp->inp_laddr.s_addr != INADDR_ANY && + inp->inp_faddr.s_addr != INADDR_ANY) { + hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr, + inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); + } else + hash = 0; + pcbhash = &pcbinfo->ipi_hashbase[hash]; + LIST_INSERT_HEAD(pcbhash, inp, inp_hash); +} + +static void +rip_delhash(struct inpcb *inp) +{ + + INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + INP_WLOCK_ASSERT(inp); + + LIST_REMOVE(inp, inp_hash); +} + +/* + * Raw interface to IP protocol. + */ + +/* + * Initialize raw connection block q. + */ +static void +rip_zone_change(void *tag) +{ + + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); +} + +static int +rip_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = mem; + + INP_LOCK_INIT(inp, "inp", "rawinp"); + return (0); +} + +void +rip_init(void) +{ + + INP_INFO_LOCK_INIT(&V_ripcbinfo, "rip"); + LIST_INIT(&V_ripcb); +#ifdef VIMAGE + V_ripcbinfo.ipi_vnet = curvnet; +#endif + V_ripcbinfo.ipi_listhead = &V_ripcb; + V_ripcbinfo.ipi_hashbase = + hashinit(INP_PCBHASH_RAW_SIZE, M_PCB, &V_ripcbinfo.ipi_hashmask); + V_ripcbinfo.ipi_porthashbase = + hashinit(1, M_PCB, &V_ripcbinfo.ipi_porthashmask); + V_ripcbinfo.ipi_zone = uma_zcreate("ripcb", sizeof(struct inpcb), + NULL, NULL, rip_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_ripcbinfo.ipi_zone, maxsockets); + EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, + EVENTHANDLER_PRI_ANY); +} + +#ifdef VIMAGE +void +rip_destroy(void) +{ + + hashdestroy(V_ripcbinfo.ipi_hashbase, M_PCB, + V_ripcbinfo.ipi_hashmask); + hashdestroy(V_ripcbinfo.ipi_porthashbase, M_PCB, + V_ripcbinfo.ipi_porthashmask); +} +#endif + +static int +rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, + struct sockaddr_in *ripsrc) +{ + int policyfail = 0; + + INP_RLOCK_ASSERT(last); + +#ifdef IPSEC + /* check AH/ESP integrity. */ + if (ipsec4_in_reject(n, last)) { + policyfail = 1; + } +#endif /* IPSEC */ +#ifdef MAC + if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) + policyfail = 1; +#endif + /* Check the minimum TTL for socket. 
*/ + if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) + policyfail = 1; + if (!policyfail) { + struct mbuf *opts = NULL; + struct socket *so; + + so = last->inp_socket; + if ((last->inp_flags & INP_CONTROLOPTS) || + (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) + ip_savecontrol(last, &opts, ip, n); + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, + (struct sockaddr *)ripsrc, n, opts) == 0) { + /* should notify about lost packet */ + m_freem(n); + if (opts) + m_freem(opts); + SOCKBUF_UNLOCK(&so->so_rcv); + } else + sorwakeup_locked(so); + } else + m_freem(n); + return (policyfail); +} + +/* + * Setup generic address and protocol structures for raw_input routine, then + * pass them along with mbuf chain. + */ +void +rip_input(struct mbuf *m, int off) +{ + struct ifnet *ifp; + struct ip *ip = mtod(m, struct ip *); + int proto = ip->ip_p; + struct inpcb *inp, *last; + struct sockaddr_in ripsrc; + int hash; + + bzero(&ripsrc, sizeof(ripsrc)); + ripsrc.sin_len = sizeof(ripsrc); + ripsrc.sin_family = AF_INET; + ripsrc.sin_addr = ip->ip_src; + last = NULL; + + ifp = m->m_pkthdr.rcvif; + + hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, + ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); + INP_INFO_RLOCK(&V_ripcbinfo); + LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { + if (inp->inp_ip_p != proto) + continue; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (jailed_without_vnet(inp->inp_cred)) { + /* + * XXX: If faddr was bound to multicast group, + * jailed raw socket will drop datagram. + */ + if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + continue; + } + if (last != NULL) { + struct mbuf *n; + + n = m_copy(m, 0, (int)M_COPYALL); + if (n != NULL) + (void) rip_append(last, ip, n, &ripsrc); + /* XXX count dropped packet */ + INP_RUNLOCK(last); + } + INP_RLOCK(inp); + last = inp; + } + LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { + if (inp->inp_ip_p && inp->inp_ip_p != proto) + continue; +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (!in_nullhost(inp->inp_laddr) && + !in_hosteq(inp->inp_laddr, ip->ip_dst)) + continue; + if (!in_nullhost(inp->inp_faddr) && + !in_hosteq(inp->inp_faddr, ip->ip_src)) + continue; + if (jailed_without_vnet(inp->inp_cred)) { + /* + * Allow raw socket in jail to receive multicast; + * assume process had PRIV_NETINET_RAW at attach, + * and fall through into normal filter path if so. + */ + if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) + continue; + } + /* + * If this raw socket has multicast state, and we + * have received a multicast, check if this socket + * should receive it, as multicast filtering is now + * the responsibility of the transport layer. + */ + if (inp->inp_moptions != NULL && + IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + /* + * If the incoming datagram is for IGMP, allow it + * through unconditionally to the raw socket. + * + * In the case of IGMPv2, we may not have explicitly + * joined the group, and may have set IFF_ALLMULTI + * on the interface. imo_multi_filter() may discard + * control traffic we actually need to see. + * + * Userland multicast routing daemons should continue + * filter the control traffic appropriately. 
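The multicast filtering described above starts from IN_MULTICAST() on the host-order destination address; only non-IGMP traffic then goes through imo_multi_filter(). A small user-space illustration of that initial classification, with arbitrarily chosen addresses:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	const char *addrs[] = { "224.0.0.13", "192.168.0.1" };
	size_t i;

	for (i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++) {
		struct in_addr a;

		if (inet_pton(AF_INET, addrs[i], &a) != 1)
			continue;	/* skip unparsable input */
		printf("%-12s multicast=%d\n", addrs[i],
		    IN_MULTICAST(ntohl(a.s_addr)) != 0);
	}
	return (0);
}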
+ */ + int blocked; + + blocked = MCAST_PASS; + if (proto != IPPROTO_IGMP) { + struct sockaddr_in group; + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(inp->inp_moptions, + ifp, + (struct sockaddr *)&group, + (struct sockaddr *)&ripsrc); + } + + if (blocked != MCAST_PASS) { + IPSTAT_INC(ips_notmember); + continue; + } + } + if (last != NULL) { + struct mbuf *n; + + n = m_copy(m, 0, (int)M_COPYALL); + if (n != NULL) + (void) rip_append(last, ip, n, &ripsrc); + /* XXX count dropped packet */ + INP_RUNLOCK(last); + } + INP_RLOCK(inp); + last = inp; + } + INP_INFO_RUNLOCK(&V_ripcbinfo); + if (last != NULL) { + if (rip_append(last, ip, m, &ripsrc) != 0) + IPSTAT_INC(ips_delivered); + INP_RUNLOCK(last); + } else { + m_freem(m); + IPSTAT_INC(ips_noproto); + IPSTAT_DEC(ips_delivered); + } +} + +/* + * Generate IP header and pass packet to ip_output. Tack on options user may + * have setup with control call. + */ +int +rip_output(struct mbuf *m, struct socket *so, u_long dst) +{ + struct ip *ip; + int error; + struct inpcb *inp = sotoinpcb(so); + int flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | + IP_ALLOWBROADCAST; + + /* + * If the user handed us a complete IP packet, use it. Otherwise, + * allocate an mbuf for a header and fill it in. + */ + if ((inp->inp_flags & INP_HDRINCL) == 0) { + if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + M_PREPEND(m, sizeof(struct ip), M_DONTWAIT); + if (m == NULL) + return(ENOBUFS); + + INP_RLOCK(inp); + ip = mtod(m, struct ip *); + ip->ip_tos = inp->inp_ip_tos; + if (inp->inp_flags & INP_DONTFRAG) + ip->ip_off = IP_DF; + else + ip->ip_off = 0; + ip->ip_p = inp->inp_ip_p; + ip->ip_len = m->m_pkthdr.len; + ip->ip_src = inp->inp_laddr; + if (jailed(inp->inp_cred)) { + /* + * prison_local_ip4() would be good enough but would + * let a source of INADDR_ANY pass, which we do not + * want to see from jails. We do not go through the + * pain of in_pcbladdr() for raw sockets. + */ + if (ip->ip_src.s_addr == INADDR_ANY) + error = prison_get_ip4(inp->inp_cred, + &ip->ip_src); + else + error = prison_local_ip4(inp->inp_cred, + &ip->ip_src); + if (error != 0) { + INP_RUNLOCK(inp); + m_freem(m); + return (error); + } + } + ip->ip_dst.s_addr = dst; + ip->ip_ttl = inp->inp_ip_ttl; + } else { + if (m->m_pkthdr.len > IP_MAXPACKET) { + m_freem(m); + return(EMSGSIZE); + } + INP_RLOCK(inp); + ip = mtod(m, struct ip *); + error = prison_check_ip4(inp->inp_cred, &ip->ip_src); + if (error != 0) { + INP_RUNLOCK(inp); + m_freem(m); + return (error); + } + + /* + * Don't allow both user specified and setsockopt options, + * and don't allow packet length sizes that will crash. + */ + if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) + || (ip->ip_len > m->m_pkthdr.len) + || (ip->ip_len < (ip->ip_hl << 2))) { + INP_RUNLOCK(inp); + m_freem(m); + return (EINVAL); + } + if (ip->ip_id == 0) + ip->ip_id = ip_newid(); + + /* + * XXX prevent ip_output from overwriting header fields. + */ + flags |= IP_RAWOUTPUT; + IPSTAT_INC(ips_rawout); + } + + if (inp->inp_flags & INP_ONESBCAST) + flags |= IP_SENDONES; + +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif + + error = ip_output(m, inp->inp_options, NULL, flags, + inp->inp_moptions, inp); + INP_RUNLOCK(inp); + return (error); +} + +/* + * Raw IP socket option processing. 
+ * + * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could + * only be created by a privileged process, and as such, socket option + * operations to manage system properties on any raw socket were allowed to + * take place without explicit additional access control checks. However, + * raw sockets can now also be created in jail(), and therefore explicit + * checks are now required. Likewise, raw sockets can be used by a process + * after it gives up privilege, so some caution is required. For options + * passed down to the IP layer via ip_ctloutput(), checks are assumed to be + * performed in ip_ctloutput() and therefore no check occurs here. + * Unilaterally checking priv_check() here breaks normal IP socket option + * operations on raw sockets. + * + * When adding new socket options here, make sure to add access control + * checks here as necessary. + */ +int +rip_ctloutput(struct socket *so, struct sockopt *sopt) +{ + struct inpcb *inp = sotoinpcb(so); + int error, optval; + + if (sopt->sopt_level != IPPROTO_IP) { + if ((sopt->sopt_level == SOL_SOCKET) && + (sopt->sopt_name == SO_SETFIB)) { + inp->inp_inc.inc_fibnum = so->so_fibnum; + return (0); + } + return (EINVAL); + } + + error = 0; + switch (sopt->sopt_dir) { + case SOPT_GET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + optval = inp->inp_flags & INP_HDRINCL; + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case IP_FW3: /* generic ipfw v.3 functions */ + case IP_FW_ADD: /* ADD actually returns the body... */ + case IP_FW_GET: + case IP_FW_TABLE_GETSIZE: + case IP_FW_TABLE_LIST: + case IP_FW_NAT_GET_CONFIG: + case IP_FW_NAT_GET_LOG: + if (V_ip_fw_ctl_ptr != NULL) + error = V_ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + + case IP_DUMMYNET3: /* generic dummynet v.3 functions */ + case IP_DUMMYNET_GET: + if (ip_dn_ctl_ptr != NULL) + error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break ; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + case MRT_API_SUPPORT: + case MRT_API_CONFIG: + case MRT_ADD_BW_UPCALL: + case MRT_DEL_BW_UPCALL: + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error != 0) + return (error); + error = ip_mrouter_get ? 
ip_mrouter_get(so, sopt) : + EOPNOTSUPP; + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + + case SOPT_SET: + switch (sopt->sopt_name) { + case IP_HDRINCL: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval) + inp->inp_flags |= INP_HDRINCL; + else + inp->inp_flags &= ~INP_HDRINCL; + break; + + case IP_FW3: /* generic ipfw v.3 functions */ + case IP_FW_ADD: + case IP_FW_DEL: + case IP_FW_FLUSH: + case IP_FW_ZERO: + case IP_FW_RESETLOG: + case IP_FW_TABLE_ADD: + case IP_FW_TABLE_DEL: + case IP_FW_TABLE_FLUSH: + case IP_FW_NAT_CFG: + case IP_FW_NAT_DEL: + if (V_ip_fw_ctl_ptr != NULL) + error = V_ip_fw_ctl_ptr(sopt); + else + error = ENOPROTOOPT; + break; + + case IP_DUMMYNET3: /* generic dummynet v.3 functions */ + case IP_DUMMYNET_CONFIGURE: + case IP_DUMMYNET_DEL: + case IP_DUMMYNET_FLUSH: + if (ip_dn_ctl_ptr != NULL) + error = ip_dn_ctl_ptr(sopt); + else + error = ENOPROTOOPT ; + break ; + + case IP_RSVP_ON: + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error != 0) + return (error); + error = ip_rsvp_init(so); + break; + + case IP_RSVP_OFF: + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error != 0) + return (error); + error = ip_rsvp_done(); + break; + + case IP_RSVP_VIF_ON: + case IP_RSVP_VIF_OFF: + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error != 0) + return (error); + error = ip_rsvp_vif ? + ip_rsvp_vif(so, sopt) : EINVAL; + break; + + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_VERSION: + case MRT_ASSERT: + case MRT_API_SUPPORT: + case MRT_API_CONFIG: + case MRT_ADD_BW_UPCALL: + case MRT_DEL_BW_UPCALL: + error = priv_check(curthread, PRIV_NETINET_MROUTE); + if (error != 0) + return (error); + error = ip_mrouter_set ? ip_mrouter_set(so, sopt) : + EOPNOTSUPP; + break; + + default: + error = ip_ctloutput(so, sopt); + break; + } + break; + } + + return (error); +} + +/* + * This function exists solely to receive the PRC_IFDOWN messages which are + * sent by if_down(). It looks for an ifaddr whose ifa_addr is sa, and calls + * in_ifadown() to remove all routes corresponding to that address. It also + * receives the PRC_IFUP messages from if_up() and reinstalls the interface + * routes. + */ +void +rip_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct in_ifaddr *ia; + struct ifnet *ifp; + int err; + int flags; + + switch (cmd) { + case PRC_IFDOWN: + IN_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa + && (ia->ia_flags & IFA_ROUTE)) { + ifa_ref(&ia->ia_ifa); + IN_IFADDR_RUNLOCK(); + /* + * in_ifscrub kills the interface route. + */ + in_ifscrub(ia->ia_ifp, ia); + /* + * in_ifadown gets rid of all the rest of the + * routes. This is not quite the right thing + * to do, but at least if we are running a + * routing process they will come back. + */ + in_ifadown(&ia->ia_ifa, 0); + ifa_free(&ia->ia_ifa); + break; + } + } + if (ia == NULL) /* If ia matched, already unlocked. 
*/ + IN_IFADDR_RUNLOCK(); + break; + + case PRC_IFUP: + IN_IFADDR_RLOCK(); + TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { + if (ia->ia_ifa.ifa_addr == sa) + break; + } + if (ia == NULL || (ia->ia_flags & IFA_ROUTE)) { + IN_IFADDR_RUNLOCK(); + return; + } + ifa_ref(&ia->ia_ifa); + IN_IFADDR_RUNLOCK(); + flags = RTF_UP; + ifp = ia->ia_ifa.ifa_ifp; + + if ((ifp->if_flags & IFF_LOOPBACK) + || (ifp->if_flags & IFF_POINTOPOINT)) + flags |= RTF_HOST; + + err = rtinit(&ia->ia_ifa, RTM_ADD, flags); + if (err == 0) + ia->ia_flags |= IFA_ROUTE; + err = ifa_add_loopback_route((struct ifaddr *)ia, sa); + ifa_free(&ia->ia_ifa); + break; + } +} + +u_long rip_sendspace = 9216; +u_long rip_recvspace = 9216; + +SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, + &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); +SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, + &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams"); + +static int +rip_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp == NULL, ("rip_attach: inp != NULL")); + + error = priv_check(td, PRIV_NETINET_RAW); + if (error) + return (error); + if (proto >= IPPROTO_MAX || proto < 0) + return EPROTONOSUPPORT; + error = soreserve(so, rip_sendspace, rip_recvspace); + if (error) + return (error); + INP_INFO_WLOCK(&V_ripcbinfo); + error = in_pcballoc(so, &V_ripcbinfo); + if (error) { + INP_INFO_WUNLOCK(&V_ripcbinfo); + return (error); + } + inp = (struct inpcb *)so->so_pcb; + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_p = proto; + inp->inp_ip_ttl = V_ip_defttl; + rip_inshash(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); + INP_WUNLOCK(inp); + return (0); +} + +static void +rip_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_detach: inp == NULL")); + KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, + ("rip_detach: not closed")); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_delhash(inp); + if (so == V_ip_mrouter && ip_mrouter_done) + ip_mrouter_done(); + if (ip_rsvp_force_done) + ip_rsvp_force_done(so); + if (so == V_ip_rsvpd) + ip_rsvp_done(); + in_pcbdetach(inp); + in_pcbfree(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); +} + +static void +rip_dodisconnect(struct socket *so, struct inpcb *inp) +{ + + INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + INP_WLOCK_ASSERT(inp); + + rip_delhash(inp); + inp->inp_faddr.s_addr = INADDR_ANY; + rip_inshash(inp); + SOCK_LOCK(so); + so->so_state &= ~SS_ISCONNECTED; + SOCK_UNLOCK(so); +} + +static void +rip_abort(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_abort: inp == NULL")); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_dodisconnect(so, inp); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); +} + +static void +rip_close(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_close: inp == NULL")); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_dodisconnect(so, inp); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); +} + +static int +rip_disconnect(struct socket *so) +{ + struct inpcb *inp; + + if ((so->so_state & SS_ISCONNECTED) == 0) + return (ENOTCONN); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_disconnect: inp == NULL")); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_dodisconnect(so, inp); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); + return (0); +} + +static int +rip_bind(struct socket 
*so, struct sockaddr *nam, struct thread *td) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + struct inpcb *inp; + int error; + + if (nam->sa_len != sizeof(*addr)) + return (EINVAL); + + error = prison_check_ip4(td->td_ucred, &addr->sin_addr); + if (error != 0) + return (error); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_bind: inp == NULL")); + + if (TAILQ_EMPTY(&V_ifnet) || + (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) || + (addr->sin_addr.s_addr && + (inp->inp_flags & INP_BINDANY) == 0 && + ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) + return (EADDRNOTAVAIL); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_delhash(inp); + inp->inp_laddr = addr->sin_addr; + rip_inshash(inp); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); + return (0); +} + +static int +rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)nam; + struct inpcb *inp; + + if (nam->sa_len != sizeof(*addr)) + return (EINVAL); + if (TAILQ_EMPTY(&V_ifnet)) + return (EADDRNOTAVAIL); + if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) + return (EAFNOSUPPORT); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_connect: inp == NULL")); + + INP_INFO_WLOCK(&V_ripcbinfo); + INP_WLOCK(inp); + rip_delhash(inp); + inp->inp_faddr = addr->sin_addr; + rip_inshash(inp); + soisconnected(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_ripcbinfo); + return (0); +} + +static int +rip_shutdown(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_shutdown: inp == NULL")); + + INP_WLOCK(inp); + socantsendmore(so); + INP_WUNLOCK(inp); + return (0); +} + +static int +rip_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct thread *td) +{ + struct inpcb *inp; + u_long dst; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("rip_send: inp == NULL")); + + /* + * Note: 'dst' reads below are unlocked. + */ + if (so->so_state & SS_ISCONNECTED) { + if (nam) { + m_freem(m); + return (EISCONN); + } + dst = inp->inp_faddr.s_addr; /* Unlocked read. */ + } else { + if (nam == NULL) { + m_freem(m); + return (ENOTCONN); + } + dst = ((struct sockaddr_in *)nam)->sin_addr.s_addr; + } + return (rip_output(m, so, dst)); +} + +static int +rip_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = V_ripcbinfo.ipi_count; + n += imax(n / 8, 10); + req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); + return (0); + } + + if (req->newptr != 0) + return (EPERM); + + /* + * OK, now we're committed to doing something. 
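+ * + * (Editor's illustration, not part of the original source: a netstat-style + * consumer reads this list with a size probe followed by the real fetch, + * e.g. + * + *	size_t len = 0; + *	sysctlbyname("net.inet.raw.pcblist", NULL, &len, NULL, 0); + *	void *buf = malloc(len); + *	if (buf != NULL) + *		sysctlbyname("net.inet.raw.pcblist", buf, &len, NULL, 0); + * + * and then walks the returned xinpgen/xinpcb records, retrying if the + * generation count reported at the end differs from the one at the start.)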
+ */ + INP_INFO_RLOCK(&V_ripcbinfo); + gencnt = V_ripcbinfo.ipi_gencnt; + n = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return (error); + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return (ENOMEM); + + INP_INFO_RLOCK(&V_ripcbinfo); + for (inp = LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_WLOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { + in_pcbref(inp); + inp_list[i++] = inp; + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_ripcbinfo); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + + bzero(&xi, sizeof(xi)); + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + INP_RUNLOCK(inp); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } else + INP_RUNLOCK(inp); + } + INP_INFO_WLOCK(&V_ripcbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_WLOCK(inp); + if (!in_pcbrele(inp)) + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_ripcbinfo); + + if (!error) { + /* + * Give the user an updated idea of our state. If the + * generation differs from what we told her before, she knows + * that something happened while we were processing this + * request, and it might be necessary to retry. + */ + INP_INFO_RLOCK(&V_ripcbinfo); + xig.xig_gen = V_ripcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = V_ripcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_ripcbinfo); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, + rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); + +struct pr_usrreqs rip_usrreqs = { + .pru_abort = rip_abort, + .pru_attach = rip_attach, + .pru_bind = rip_bind, + .pru_connect = rip_connect, + .pru_control = in_control, + .pru_detach = rip_detach, + .pru_disconnect = rip_disconnect, + .pru_peeraddr = in_getpeeraddr, + .pru_send = rip_send, + .pru_shutdown = rip_shutdown, + .pru_sockaddr = in_getsockaddr, + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = rip_close, +}; diff --git a/freebsd/sys/netinet/sctp.h b/freebsd/sys/netinet/sctp.h new file mode 100644 index 00000000..bf188a23 --- /dev/null +++ b/freebsd/sys/netinet/sctp.h @@ -0,0 +1,549 @@ +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ +/* $KAME: sctp.h,v 1.18 2005/03/06 16:04:16 itojun Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef _NETINET_SCTP_HH_ +#define _NETINET_SCTP_HH_ + +#include <sys/types.h> + + +#define SCTP_PACKED __attribute__((packed)) + +/* + * SCTP protocol - RFC2960. + */ +struct sctphdr { + uint16_t src_port; /* source port */ + uint16_t dest_port; /* destination port */ + uint32_t v_tag; /* verification tag of packet */ + uint32_t checksum; /* Adler32 C-Sum */ + /* chunks follow... */ +} SCTP_PACKED; + +/* + * SCTP Chunks + */ +struct sctp_chunkhdr { + uint8_t chunk_type; /* chunk type */ + uint8_t chunk_flags; /* chunk flags */ + uint16_t chunk_length; /* chunk length */ + /* optional params follow */ +} SCTP_PACKED; + +/* + * SCTP chunk parameters + */ +struct sctp_paramhdr { + uint16_t param_type; /* parameter type */ + uint16_t param_length; /* parameter length */ +} SCTP_PACKED; + +/* + * user socket options: socket API defined + */ +/* + * read-write options + */ +#define SCTP_RTOINFO 0x00000001 +#define SCTP_ASSOCINFO 0x00000002 +#define SCTP_INITMSG 0x00000003 +#define SCTP_NODELAY 0x00000004 +#define SCTP_AUTOCLOSE 0x00000005 +#define SCTP_SET_PEER_PRIMARY_ADDR 0x00000006 +#define SCTP_PRIMARY_ADDR 0x00000007 +#define SCTP_ADAPTATION_LAYER 0x00000008 +/* same as above */ +#define SCTP_ADAPTION_LAYER 0x00000008 +#define SCTP_DISABLE_FRAGMENTS 0x00000009 +#define SCTP_PEER_ADDR_PARAMS 0x0000000a +#define SCTP_DEFAULT_SEND_PARAM 0x0000000b +/* ancillary data/notification interest options */ +#define SCTP_EVENTS 0x0000000c +/* Without this applied we will give V4 and V6 addresses on a V6 socket */ +#define SCTP_I_WANT_MAPPED_V4_ADDR 0x0000000d +#define SCTP_MAXSEG 0x0000000e +#define SCTP_DELAYED_SACK 0x0000000f +#define SCTP_FRAGMENT_INTERLEAVE 0x00000010 +#define SCTP_PARTIAL_DELIVERY_POINT 0x00000011 +/* authentication support */ +#define SCTP_AUTH_CHUNK 0x00000012 +#define SCTP_AUTH_KEY 0x00000013 +#define SCTP_HMAC_IDENT 0x00000014 +#define SCTP_AUTH_ACTIVE_KEY 0x00000015 +#define SCTP_AUTH_DELETE_KEY 0x00000016 +#define SCTP_USE_EXT_RCVINFO 0x00000017 +#define SCTP_AUTO_ASCONF 0x00000018 /* rw */ +#define SCTP_MAXBURST 0x00000019 /* rw */ +#define SCTP_MAX_BURST 0x00000019 /* rw */ +/* assoc level context */ +#define SCTP_CONTEXT 0x0000001a /* rw */ +/* explicit EOR signalling */ +#define SCTP_EXPLICIT_EOR 0x0000001b +#define SCTP_REUSE_PORT 0x0000001c /* rw */ +#define SCTP_AUTH_DEACTIVATE_KEY 0x0000001d + +/* + * read-only options + */ +#define SCTP_STATUS 0x00000100 +#define SCTP_GET_PEER_ADDR_INFO 0x00000101 +/* authentication support */ +#define SCTP_PEER_AUTH_CHUNKS 0x00000102 +#define SCTP_LOCAL_AUTH_CHUNKS 0x00000103 +#define SCTP_GET_ASSOC_NUMBER 0x00000104 /* ro */ +#define
SCTP_GET_ASSOC_ID_LIST 0x00000105 /* ro */ +#define SCTP_TIMEOUTS 0x00000106 + +/* + * user socket options: BSD implementation specific + */ +/* + * Blocking I/O is enabled on any TCP-type socket by default. For the UDP + * model, if this is turned on, then the socket buffer is shared for send + * resources amongst all associations. The default for the UDP model is as + * if SS_NBIO were set: all associations have a separate send limit, but + * they will never block; instead you get EAGAIN back if you try to send + * too much. If you want the blocking semantics, you set this option at the + * cost of sharing one socket send buffer amongst all associations. Peeled-off + * sockets turn this option off and block. But since both TCP and peeled-off + * sockets have only one assoc per socket, this is fine. It probably does NOT + * make sense to combine this option with SS_NBIO on a TCP model OR a + * peeled-off UDP model, but we do allow you to do so. You just use the + * normal syscall to toggle SS_NBIO the way you want. + * + * Blocking I/O is controlled by the SS_NBIO flag on the socket state's + * so_state field. + */ + +/* these should probably go into sockets API */ +#define SCTP_RESET_STREAMS 0x00001004 /* wo */ + + +/* here on down are more implementation specific */ +#define SCTP_SET_DEBUG_LEVEL 0x00001005 +#define SCTP_CLR_STAT_LOG 0x00001007 +/* CMT ON/OFF socket option */ +#define SCTP_CMT_ON_OFF 0x00001200 +#define SCTP_CMT_USE_DAC 0x00001201 +/* JRS - Pluggable Congestion Control Socket option */ +#define SCTP_PLUGGABLE_CC 0x00001202 + +/* read only */ +#define SCTP_GET_SNDBUF_USE 0x00001101 +#define SCTP_GET_STAT_LOG 0x00001103 +#define SCTP_PCB_STATUS 0x00001104 +#define SCTP_GET_NONCE_VALUES 0x00001105 + + +/* Special hook for dynamically setting the primary for all assocs; + * this is a write-only option that requires root privilege. + */ +#define SCTP_SET_DYNAMIC_PRIMARY 0x00002001 + +/* VRF (virtual router feature) and multi-VRF support + * options. VRFs provide partitions within a router + * that present the views of multiple routers. A + * standard host, without VRF support, is just + * a single VRF. If VRFs are supported then + * the transport must be VRF aware. This means + * that every socket call coming in must be directed + * within the endpoint to one of the VRFs it belongs + * to. The endpoint, before binding, may select + * the "default" VRF it is in by using a set socket + * option with SCTP_VRF_ID. This will also + * get propagated to the default VRF. Once the + * endpoint binds an address then it CANNOT add + * additional VRFs to become a Multi-VRF endpoint. + * + * Before binding, additional VRFs can be added with + * the SCTP_ADD_VRF_ID call or deleted with + * SCTP_DEL_VRF_ID. + * + * Associations are ALWAYS contained inside a single + * VRF. They cannot reside in two (or more) VRFs. Incoming + * packets, assuming the router is VRF aware, can always + * tell us what VRF they arrived on. A host not supporting + * any VRFs will find that the packets always arrived on the + * single VRF that the host has. + * + */ + +#define SCTP_VRF_ID 0x00003001 +#define SCTP_ADD_VRF_ID 0x00003002 +#define SCTP_GET_VRF_IDS 0x00003003 +#define SCTP_GET_ASOC_VRF 0x00003004 +#define SCTP_DEL_VRF_ID 0x00003005 + +/* + * If you enable packet logging you can get + * a poor man's Ethereal output in binary + * form.
Note this is a compile option to + * the kernel, SCTP_PACKET_LOGGING, and + * without it in your kernel you + * will get an EOPNOTSUPP error. + */ +#define SCTP_GET_PACKET_LOG 0x00004001 + +/* + * hidden implementation-specific options; these are NOT user visible (should + * move out of sctp.h) + */ +/* sctp_bindx() flags as hidden socket options */ +#define SCTP_BINDX_ADD_ADDR 0x00008001 +#define SCTP_BINDX_REM_ADDR 0x00008002 +/* Hidden socket option that gets the addresses */ +#define SCTP_GET_PEER_ADDRESSES 0x00008003 +#define SCTP_GET_LOCAL_ADDRESSES 0x00008004 +/* return the total count in bytes needed to hold all local addresses bound */ +#define SCTP_GET_LOCAL_ADDR_SIZE 0x00008005 +/* Return the total count in bytes needed to hold the remote address */ +#define SCTP_GET_REMOTE_ADDR_SIZE 0x00008006 +/* hidden option for connectx */ +#define SCTP_CONNECT_X 0x00008007 +/* hidden option for connectx_delayed, part of sendx */ +#define SCTP_CONNECT_X_DELAYED 0x00008008 +#define SCTP_CONNECT_X_COMPLETE 0x00008009 +/* hidden socket option based sctp_peeloff */ +#define SCTP_PEELOFF 0x0000800a +/* the real worker for sctp_getaddrlen() */ +#define SCTP_GET_ADDR_LEN 0x0000800b +/* temporary workaround for Apple listen() issue, no args used */ +#define SCTP_LISTEN_FIX 0x0000800c +/* Debug things that need to be purged */ +#define SCTP_SET_INITIAL_DBG_SEQ 0x00009f00 + +/* JRS - Supported congestion control modules for pluggable + * congestion control + */ +/* Standard TCP Congestion Control */ +#define SCTP_CC_RFC2581 0x00000000 +/* High Speed TCP Congestion Control (Floyd) */ +#define SCTP_CC_HSTCP 0x00000001 +/* HTCP Congestion Control */ +#define SCTP_CC_HTCP 0x00000002 + + +/* fragment interleave constants: + * the setting must be one of these or + * EINVAL is returned.
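+ * + * (Editor's illustration, not part of the original header: an application + * picks one of the three levels below with the SCTP_FRAGMENT_INTERLEAVE + * option defined above, e.g. + * + *	int level = SCTP_FRAG_LEVEL_1; + *	setsockopt(sd, IPPROTO_SCTP, SCTP_FRAGMENT_INTERLEAVE, + *	    &level, sizeof(level)); + * + * where sd is an SCTP socket; any other value fails with EINVAL.)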
+ */ +#define SCTP_FRAG_LEVEL_0 0x00000000 +#define SCTP_FRAG_LEVEL_1 0x00000001 +#define SCTP_FRAG_LEVEL_2 0x00000002 + +/* + * user state values + */ +#define SCTP_CLOSED 0x0000 +#define SCTP_BOUND 0x1000 +#define SCTP_LISTEN 0x2000 +#define SCTP_COOKIE_WAIT 0x0002 +#define SCTP_COOKIE_ECHOED 0x0004 +#define SCTP_ESTABLISHED 0x0008 +#define SCTP_SHUTDOWN_SENT 0x0010 +#define SCTP_SHUTDOWN_RECEIVED 0x0020 +#define SCTP_SHUTDOWN_ACK_SENT 0x0040 +#define SCTP_SHUTDOWN_PENDING 0x0080 + +/* + * SCTP operational error codes (user visible) + */ +#define SCTP_CAUSE_NO_ERROR 0x0000 +#define SCTP_CAUSE_INVALID_STREAM 0x0001 +#define SCTP_CAUSE_MISSING_PARAM 0x0002 +#define SCTP_CAUSE_STALE_COOKIE 0x0003 +#define SCTP_CAUSE_OUT_OF_RESC 0x0004 +#define SCTP_CAUSE_UNRESOLVABLE_ADDR 0x0005 +#define SCTP_CAUSE_UNRECOG_CHUNK 0x0006 +#define SCTP_CAUSE_INVALID_PARAM 0x0007 +#define SCTP_CAUSE_UNRECOG_PARAM 0x0008 +#define SCTP_CAUSE_NO_USER_DATA 0x0009 +#define SCTP_CAUSE_COOKIE_IN_SHUTDOWN 0x000a +#define SCTP_CAUSE_RESTART_W_NEWADDR 0x000b +#define SCTP_CAUSE_USER_INITIATED_ABT 0x000c +#define SCTP_CAUSE_PROTOCOL_VIOLATION 0x000d + +/* Error causes from RFC5061 */ +#define SCTP_CAUSE_DELETING_LAST_ADDR 0x00a0 +#define SCTP_CAUSE_RESOURCE_SHORTAGE 0x00a1 +#define SCTP_CAUSE_DELETING_SRC_ADDR 0x00a2 +#define SCTP_CAUSE_ILLEGAL_ASCONF_ACK 0x00a3 +#define SCTP_CAUSE_REQUEST_REFUSED 0x00a4 + +/* Error causes from nat-draft */ +#define SCTP_CAUSE_NAT_COLLIDING_STATE 0x00b0 +#define SCTP_CAUSE_NAT_MISSING_STATE 0x00b1 + +/* Error causes from RFC4895 */ +#define SCTP_CAUSE_UNSUPPORTED_HMACID 0x0105 + +/* + * error cause parameters (user visible) + */ +struct sctp_error_cause { + uint16_t code; + uint16_t length; + /* optional cause-specific info may follow */ +} SCTP_PACKED; + +struct sctp_error_invalid_stream { + struct sctp_error_cause cause; /* code=SCTP_ERROR_INVALID_STREAM */ + uint16_t stream_id; /* stream id of the DATA in error */ + uint16_t reserved; +} SCTP_PACKED; + +struct sctp_error_missing_param { + struct sctp_error_cause cause; /* code=SCTP_ERROR_MISSING_PARAM */ + uint32_t num_missing_params; /* number of missing parameters */ + /* uint16_t param_type's follow */ +} SCTP_PACKED; + +struct sctp_error_stale_cookie { + struct sctp_error_cause cause; /* code=SCTP_ERROR_STALE_COOKIE */ + uint32_t stale_time; /* time in usec of staleness */ +} SCTP_PACKED; + +struct sctp_error_out_of_resource { + struct sctp_error_cause cause; /* code=SCTP_ERROR_OUT_OF_RESOURCES */ +} SCTP_PACKED; + +struct sctp_error_unresolv_addr { + struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRESOLVABLE_ADDR */ + +} SCTP_PACKED; + +struct sctp_error_unrecognized_chunk { + struct sctp_error_cause cause; /* code=SCTP_ERROR_UNRECOG_CHUNK */ + struct sctp_chunkhdr ch;/* header from chunk in error */ +} SCTP_PACKED; + +/* + * Main SCTP chunk types we place these here so natd and f/w's in user land + * can find them. 
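+ * + * (Editor's illustration, not part of the original header: given a + * uint8_t *ptr at the first chunk, a userland inspector steps through the + * chunks of a packet with the sctp_chunkhdr layout above, e.g. + * + *	struct sctp_chunkhdr ch; + *	memcpy(&ch, ptr, sizeof(ch)); + *	uint16_t clen = ntohs(ch.chunk_length); + *	ptr += (clen + 3) & ~3;		advance past the 4-byte padding + * + * repeating until the datagram is exhausted, after validating clen against + * the remaining length each time.)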
+ */ +/************0x00 series ***********/ +#define SCTP_DATA 0x00 +#define SCTP_INITIATION 0x01 +#define SCTP_INITIATION_ACK 0x02 +#define SCTP_SELECTIVE_ACK 0x03 +#define SCTP_HEARTBEAT_REQUEST 0x04 +#define SCTP_HEARTBEAT_ACK 0x05 +#define SCTP_ABORT_ASSOCIATION 0x06 +#define SCTP_SHUTDOWN 0x07 +#define SCTP_SHUTDOWN_ACK 0x08 +#define SCTP_OPERATION_ERROR 0x09 +#define SCTP_COOKIE_ECHO 0x0a +#define SCTP_COOKIE_ACK 0x0b +#define SCTP_ECN_ECHO 0x0c +#define SCTP_ECN_CWR 0x0d +#define SCTP_SHUTDOWN_COMPLETE 0x0e +/* RFC4895 */ +#define SCTP_AUTHENTICATION 0x0f +/* EY nr_sack chunk id*/ +#define SCTP_NR_SELECTIVE_ACK 0x10 +/************0x40 series ***********/ +/************0x80 series ***********/ +/* RFC5061 */ +#define SCTP_ASCONF_ACK 0x80 +/* draft-ietf-stewart-pktdrpsctp */ +#define SCTP_PACKET_DROPPED 0x81 +/* draft-ietf-stewart-strreset-xxx */ +#define SCTP_STREAM_RESET 0x82 + +/* RFC4820 */ +#define SCTP_PAD_CHUNK 0x84 +/************0xc0 series ***********/ +/* RFC3758 */ +#define SCTP_FORWARD_CUM_TSN 0xc0 +/* RFC5061 */ +#define SCTP_ASCONF 0xc1 + + +/* ABORT and SHUTDOWN COMPLETE FLAG */ +#define SCTP_HAD_NO_TCB 0x01 + +/* Packet dropped flags */ +#define SCTP_FROM_MIDDLE_BOX SCTP_HAD_NO_TCB +#define SCTP_BADCRC 0x02 +#define SCTP_PACKET_TRUNCATED 0x04 + +#define SCTP_SAT_NETWORK_MIN 400 /* min ms for RTT to set satellite + * time */ +#define SCTP_SAT_NETWORK_BURST_INCR 2 /* how many times to multiply maxburst + * in sat */ + +/* Data Chunk Specific Flags */ +#define SCTP_DATA_FRAG_MASK 0x03 +#define SCTP_DATA_MIDDLE_FRAG 0x00 +#define SCTP_DATA_LAST_FRAG 0x01 +#define SCTP_DATA_FIRST_FRAG 0x02 +#define SCTP_DATA_NOT_FRAG 0x03 +#define SCTP_DATA_UNORDERED 0x04 +#define SCTP_DATA_SACK_IMMEDIATELY 0x08 +/* ECN Nonce: SACK Chunk Specific Flags */ +#define SCTP_SACK_NONCE_SUM 0x01 + +/* CMT DAC algorithm SACK flag */ +#define SCTP_SACK_CMT_DAC 0x80 + +/* + * PCB flags (in sctp_flags bitmask). + * Note the features and flags are meant + * for use by netstat.
+ */ +#define SCTP_PCB_FLAGS_UDPTYPE 0x00000001 +#define SCTP_PCB_FLAGS_TCPTYPE 0x00000002 +#define SCTP_PCB_FLAGS_BOUNDALL 0x00000004 +#define SCTP_PCB_FLAGS_ACCEPTING 0x00000008 +#define SCTP_PCB_FLAGS_UNBOUND 0x00000010 +#define SCTP_PCB_FLAGS_CLOSE_IP 0x00040000 +#define SCTP_PCB_FLAGS_WAS_CONNECTED 0x00080000 +#define SCTP_PCB_FLAGS_WAS_ABORTED 0x00100000 +/* TCP model support */ + +#define SCTP_PCB_FLAGS_CONNECTED 0x00200000 +#define SCTP_PCB_FLAGS_IN_TCPPOOL 0x00400000 +#define SCTP_PCB_FLAGS_DONT_WAKE 0x00800000 +#define SCTP_PCB_FLAGS_WAKEOUTPUT 0x01000000 +#define SCTP_PCB_FLAGS_WAKEINPUT 0x02000000 +#define SCTP_PCB_FLAGS_BOUND_V6 0x04000000 +#define SCTP_PCB_FLAGS_BLOCKING_IO 0x08000000 +#define SCTP_PCB_FLAGS_SOCKET_GONE 0x10000000 +#define SCTP_PCB_FLAGS_SOCKET_ALLGONE 0x20000000 +#define SCTP_PCB_FLAGS_SOCKET_CANT_READ 0x40000000 +/* flags to copy to new PCB */ +#define SCTP_PCB_COPY_FLAGS (SCTP_PCB_FLAGS_BOUNDALL|\ + SCTP_PCB_FLAGS_WAKEINPUT|\ + SCTP_PCB_FLAGS_BOUND_V6) + + +/* + * PCB Features (in sctp_features bitmask) + */ +#define SCTP_PCB_FLAGS_EXT_RCVINFO 0x00000002 +#define SCTP_PCB_FLAGS_DONOT_HEARTBEAT 0x00000004 +#define SCTP_PCB_FLAGS_FRAG_INTERLEAVE 0x00000008 +#define SCTP_PCB_FLAGS_INTERLEAVE_STRMS 0x00000010 +#define SCTP_PCB_FLAGS_DO_ASCONF 0x00000020 +#define SCTP_PCB_FLAGS_AUTO_ASCONF 0x00000040 +#define SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE 0x00000080 +/* socket options */ +#define SCTP_PCB_FLAGS_NODELAY 0x00000100 +#define SCTP_PCB_FLAGS_AUTOCLOSE 0x00000200 +#define SCTP_PCB_FLAGS_RECVDATAIOEVNT 0x00000400 +#define SCTP_PCB_FLAGS_RECVASSOCEVNT 0x00000800 +#define SCTP_PCB_FLAGS_RECVPADDREVNT 0x00001000 +#define SCTP_PCB_FLAGS_RECVPEERERR 0x00002000 +#define SCTP_PCB_FLAGS_RECVSENDFAILEVNT 0x00004000 +#define SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT 0x00008000 +#define SCTP_PCB_FLAGS_ADAPTATIONEVNT 0x00010000 +#define SCTP_PCB_FLAGS_PDAPIEVNT 0x00020000 +#define SCTP_PCB_FLAGS_AUTHEVNT 0x00040000 +#define SCTP_PCB_FLAGS_STREAM_RESETEVNT 0x00080000 +#define SCTP_PCB_FLAGS_NO_FRAGMENT 0x00100000 +#define SCTP_PCB_FLAGS_EXPLICIT_EOR 0x00400000 +#define SCTP_PCB_FLAGS_NEEDS_MAPPED_V4 0x00800000 +#define SCTP_PCB_FLAGS_MULTIPLE_ASCONFS 0x01000000 +#define SCTP_PCB_FLAGS_PORTREUSE 0x02000000 +#define SCTP_PCB_FLAGS_DRYEVNT 0x04000000 +/*- + * mobility_features parameters (by micchie). Note + * these features are applied against the + * sctp_mobility_features flags, not the sctp_features + * flags. + */ +#define SCTP_MOBILITY_BASE 0x00000001 +#define SCTP_MOBILITY_FASTHANDOFF 0x00000002 +#define SCTP_MOBILITY_PRIM_DELETED 0x00000004 + + +#define SCTP_SMALLEST_PMTU 512 /* smallest pmtu allowed when disabling PMTU + * discovery */ + +#include <netinet/sctp_uio.h> + +/* This dictates the size of the packet + * collection buffer. This only applies + * if SCTP_PACKET_LOGGING is enabled in + * your config. + */ +#define SCTP_PACKET_LOG_SIZE 65536 + +/* Maximum delays and such a user can set for options that + * take ms. + */ +#define SCTP_MAX_SACK_DELAY 500 /* per RFC4960 */ +#define SCTP_MAX_HB_INTERVAL 14400000 /* 4 hours in ms */ +#define SCTP_MAX_COOKIE_LIFE 3600000 /* 1 hour in ms */ + + +/* Types of logging/KTR tracing that can be enabled via the + * sysctl net.inet.sctp.sctp_logging. You must also enable + * SUBSYS tracing. + * Note that you must have the SCTP option in the kernel + * to enable these as well.
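+ * + * (Editor's illustration, not part of the original header: to enable, say, + * congestion-window monitoring together with flight-size logging one would + * set + * + *	sysctl net.inet.sctp.sctp_logging=0x22 + * + * i.e. SCTP_CWND_MONITOR_ENABLE | SCTP_FLIGHT_LOGGING_ENABLE, assuming a + * kernel built with the SCTP option as noted above.)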
+ */ +#define SCTP_BLK_LOGGING_ENABLE 0x00000001 +#define SCTP_CWND_MONITOR_ENABLE 0x00000002 +#define SCTP_CWND_LOGGING_ENABLE 0x00000004 +#define SCTP_EARLYFR_LOGGING_ENABLE 0x00000010 +#define SCTP_FLIGHT_LOGGING_ENABLE 0x00000020 +#define SCTP_FR_LOGGING_ENABLE 0x00000040 +#define SCTP_LOCK_LOGGING_ENABLE 0x00000080 +#define SCTP_MAP_LOGGING_ENABLE 0x00000100 +#define SCTP_MBCNT_LOGGING_ENABLE 0x00000200 +#define SCTP_MBUF_LOGGING_ENABLE 0x00000400 +#define SCTP_NAGLE_LOGGING_ENABLE 0x00000800 +#define SCTP_RECV_RWND_LOGGING_ENABLE 0x00001000 +#define SCTP_RTTVAR_LOGGING_ENABLE 0x00002000 +#define SCTP_SACK_LOGGING_ENABLE 0x00004000 +#define SCTP_SACK_RWND_LOGGING_ENABLE 0x00008000 +#define SCTP_SB_LOGGING_ENABLE 0x00010000 +#define SCTP_STR_LOGGING_ENABLE 0x00020000 +#define SCTP_WAKE_LOGGING_ENABLE 0x00040000 +#define SCTP_LOG_MAXBURST_ENABLE 0x00080000 +#define SCTP_LOG_RWND_ENABLE 0x00100000 +#define SCTP_LOG_SACK_ARRIVALS_ENABLE 0x00200000 +#define SCTP_LTRACE_CHUNK_ENABLE 0x00400000 +#define SCTP_LTRACE_ERROR_ENABLE 0x00800000 +#define SCTP_LAST_PACKET_TRACING 0x01000000 +#define SCTP_THRESHOLD_LOGGING 0x02000000 +#define SCTP_LOG_AT_SEND_2_SCTP 0x04000000 +#define SCTP_LOG_AT_SEND_2_OUTQ 0x08000000 +#define SCTP_LOG_TRY_ADVANCE 0x10000000 + + +#undef SCTP_PACKED + +#endif /* !_NETINET_SCTP_HH_ */ diff --git a/freebsd/sys/netinet/sctp_asconf.c b/freebsd/sys/netinet/sctp_asconf.c new file mode 100644 index 00000000..206cf600 --- /dev/null +++ b/freebsd/sys/netinet/sctp_asconf.c @@ -0,0 +1,3397 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_asconf.c,v 1.24 2005/03/06 16:04:16 itojun Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include <netinet/sctp_os.h> +#include <netinet/sctp_var.h> +#include <netinet/sctp_sysctl.h> +#include <netinet/sctp_pcb.h> +#include <netinet/sctp_header.h> +#include <netinet/sctputil.h> +#include <netinet/sctp_output.h> +#include <netinet/sctp_asconf.h> +#include <netinet/sctp_timer.h> + +/* + * debug flags: + * SCTP_DEBUG_ASCONF1: protocol info, general info and errors + * SCTP_DEBUG_ASCONF2: detailed info + */ +#ifdef SCTP_DEBUG +#endif /* SCTP_DEBUG */ + + +static void +sctp_asconf_get_source_ip(struct mbuf *m, struct sockaddr *sa) +{ + struct ip *iph; + struct sockaddr_in *sin; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + +#endif + + iph = mtod(m, struct ip *); + if (iph->ip_v == IPVERSION) { + /* IPv4 source */ + sin = (struct sockaddr_in *)sa; + bzero(sin, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_port = 0; + sin->sin_addr.s_addr = iph->ip_src.s_addr; + return; + } +#ifdef INET6 + else if (iph->ip_v == (IPV6_VERSION >> 4)) { + /* IPv6 source */ + struct ip6_hdr *ip6; + + sin6 = (struct sockaddr_in6 *)sa; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_port = 0; + ip6 = mtod(m, struct ip6_hdr *); + sin6->sin6_addr = ip6->ip6_src; + return; + } +#endif /* INET6 */ + else + return; +} + +/* + * draft-ietf-tsvwg-addip-sctp + * + * An ASCONF parameter queue exists per asoc which holds the pending address + * operations. Lists are updated upon receipt of ASCONF-ACK. + * + * A restricted_addrs list exists per assoc to hold local addresses that are + * not (yet) usable by the assoc as a source address. These addresses are + * either pending an ASCONF operation (and exist on the ASCONF parameter + * queue), or they are permanently restricted (the peer has returned an + * ERROR indication to an ASCONF(ADD), or the peer does not support ASCONF). + * + * Deleted addresses are always immediately removed from the lists as they will + * (shortly) no longer exist in the kernel. We send ASCONFs as a courtesy, + * only if allowed. + */ + +/* + * ASCONF parameter processing. + * response_required: set if a reply is required (e.g. SUCCESS_REPORT). + * returns an mbuf to an "error" response parameter or NULL/"success" if ok. + * FIX: allocating this many mbufs on the fly is pretty inefficient...
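+ * + * (Editor's note, not part of the original source: as a worked example of + * the reply sizing below, an error reply echoes the offending TLV, so + * + *	reply length = sizeof(struct sctp_asconf_paramhdr)	8 bytes + *	             + sizeof(struct sctp_error_cause)	4 bytes + *	             + tlv_length + * + * which is the arithmetic used in sctp_asconf_error_response().)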
+ */ +static struct mbuf * +sctp_asconf_success_response(uint32_t id) +{ + struct mbuf *m_reply = NULL; + struct sctp_asconf_paramhdr *aph; + + m_reply = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_paramhdr), + 0, M_DONTWAIT, 1, MT_DATA); + if (m_reply == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_success_response: couldn't get mbuf!\n"); + return NULL; + } + aph = mtod(m_reply, struct sctp_asconf_paramhdr *); + aph->correlation_id = id; + aph->ph.param_type = htons(SCTP_SUCCESS_REPORT); + aph->ph.param_length = sizeof(struct sctp_asconf_paramhdr); + SCTP_BUF_LEN(m_reply) = aph->ph.param_length; + aph->ph.param_length = htons(aph->ph.param_length); + + return m_reply; +} + +static struct mbuf * +sctp_asconf_error_response(uint32_t id, uint16_t cause, uint8_t * error_tlv, + uint16_t tlv_length) +{ + struct mbuf *m_reply = NULL; + struct sctp_asconf_paramhdr *aph; + struct sctp_error_cause *error; + uint8_t *tlv; + + m_reply = sctp_get_mbuf_for_msg((sizeof(struct sctp_asconf_paramhdr) + + tlv_length + + sizeof(struct sctp_error_cause)), + 0, M_DONTWAIT, 1, MT_DATA); + if (m_reply == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_error_response: couldn't get mbuf!\n"); + return NULL; + } + aph = mtod(m_reply, struct sctp_asconf_paramhdr *); + error = (struct sctp_error_cause *)(aph + 1); + + aph->correlation_id = id; + aph->ph.param_type = htons(SCTP_ERROR_CAUSE_IND); + error->code = htons(cause); + error->length = tlv_length + sizeof(struct sctp_error_cause); + aph->ph.param_length = error->length + + sizeof(struct sctp_asconf_paramhdr); + + if (aph->ph.param_length > MLEN) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_error_response: tlv_length (%xh) too big\n", + tlv_length); + sctp_m_freem(m_reply); /* discard */ + return NULL; + } + if (error_tlv != NULL) { + tlv = (uint8_t *) (error + 1); + memcpy(tlv, error_tlv, tlv_length); + } + SCTP_BUF_LEN(m_reply) = aph->ph.param_length; + error->length = htons(error->length); + aph->ph.param_length = htons(aph->ph.param_length); + + return m_reply; +} + +static struct mbuf * +sctp_process_asconf_add_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph, + struct sctp_tcb *stcb, int response_required) +{ + struct mbuf *m_reply = NULL; + struct sockaddr_storage sa_source, sa_store; + struct sctp_ipv4addr_param *v4addr; + uint16_t param_type, param_length, aparam_length; + struct sockaddr *sa; + struct sockaddr_in *sin; + int zero_address = 0; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sctp_ipv6addr_param *v6addr; + +#endif /* INET6 */ + + aparam_length = ntohs(aph->ph.param_length); + v4addr = (struct sctp_ipv4addr_param *)(aph + 1); +#ifdef INET6 + v6addr = (struct sctp_ipv6addr_param *)(aph + 1); +#endif /* INET6 */ + param_type = ntohs(v4addr->ph.param_type); + param_length = ntohs(v4addr->ph.param_length); + + sa = (struct sockaddr *)&sa_store; + switch (param_type) { + case SCTP_IPV4_ADDRESS: + if (param_length != sizeof(struct sctp_ipv4addr_param)) { + /* invalid param size */ + return NULL; + } + sin = (struct sockaddr_in *)&sa_store; + bzero(sin, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_port = stcb->rport; + sin->sin_addr.s_addr = v4addr->addr; + if (sin->sin_addr.s_addr == INADDR_ANY) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + break; + case SCTP_IPV6_ADDRESS: +#ifdef INET6 + if (param_length != sizeof(struct sctp_ipv6addr_param)) { + /* invalid param size */ + return NULL; + } + sin6 
= (struct sockaddr_in6 *)&sa_store; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_port = stcb->rport; + memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, + sizeof(struct in6_addr)); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_add_ip: adding "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); +#else + /* IPv6 not enabled! */ + /* FIX ME: currently sends back an invalid param error */ + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_INVALID_PARAM, (uint8_t *) aph, aparam_length); + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_add_ip: v6 disabled- skipping "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + return m_reply; +#endif + break; + default: + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph, + aparam_length); + return m_reply; + } /* end switch */ + + /* if 0.0.0.0/::0, add the source address instead */ + if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { + sa = (struct sockaddr *)&sa_source; + sctp_asconf_get_source_ip(m, sa); + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_add_ip: using source addr "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + } + /* add the address */ + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, + SCTP_ADDR_DYNAMIC_ADDED) != 0) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_add_ip: error adding address\n"); + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_RESOURCE_SHORTAGE, (uint8_t *) aph, + aparam_length); + } else { + /* notify upper layer */ + sctp_ulp_notify(SCTP_NOTIFY_ASCONF_ADD_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); + if (response_required) { + m_reply = + sctp_asconf_success_response(aph->correlation_id); + } + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, + NULL, SCTP_FROM_SCTP_ASCONF + SCTP_LOC_1); + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, + stcb, NULL); + } + + return m_reply; +} + +static int +sctp_asconf_del_remote_addrs_except(struct sctp_tcb *stcb, struct sockaddr *src) +{ + struct sctp_nets *src_net, *net; + + /* make sure the source address exists as a destination net */ + src_net = sctp_findnet(stcb, src); + if (src_net == NULL) { + /* not found */ + return -1; + } + /* delete all destination addresses except the source */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (net != src_net) { + /* delete this address */ + sctp_remove_net(stcb, net); + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_del_remote_addrs_except: deleting "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, + (struct sockaddr *)&net->ro._l_addr); + /* notify upper layer */ + sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, + (struct sockaddr *)&net->ro._l_addr, SCTP_SO_NOT_LOCKED); + } + } + return 0; +} + +static struct mbuf * +sctp_process_asconf_delete_ip(struct mbuf *m, struct sctp_asconf_paramhdr *aph, + struct sctp_tcb *stcb, int response_required) +{ + struct mbuf *m_reply = NULL; + struct sockaddr_storage sa_source, sa_store; + struct sctp_ipv4addr_param *v4addr; + uint16_t param_type, param_length, aparam_length; + struct sockaddr *sa; + struct sockaddr_in *sin; + int zero_address = 0; + int result; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sctp_ipv6addr_param *v6addr; + +#endif /* INET6 */ + + /* get the source IP address for src and 0.0.0.0/::0 delete checks */ + sctp_asconf_get_source_ip(m, (struct sockaddr *)&sa_source); + + aparam_length = 
ntohs(aph->ph.param_length); + v4addr = (struct sctp_ipv4addr_param *)(aph + 1); +#ifdef INET6 + v6addr = (struct sctp_ipv6addr_param *)(aph + 1); +#endif /* INET6 */ + param_type = ntohs(v4addr->ph.param_type); + param_length = ntohs(v4addr->ph.param_length); + + sa = (struct sockaddr *)&sa_store; + switch (param_type) { + case SCTP_IPV4_ADDRESS: + if (param_length != sizeof(struct sctp_ipv4addr_param)) { + /* invalid param size */ + return NULL; + } + sin = (struct sockaddr_in *)&sa_store; + bzero(sin, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_port = stcb->rport; + sin->sin_addr.s_addr = v4addr->addr; + if (sin->sin_addr.s_addr == INADDR_ANY) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_delete_ip: deleting "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + break; + case SCTP_IPV6_ADDRESS: + if (param_length != sizeof(struct sctp_ipv6addr_param)) { + /* invalid param size */ + return NULL; + } +#ifdef INET6 + sin6 = (struct sockaddr_in6 *)&sa_store; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_port = stcb->rport; + memcpy(&sin6->sin6_addr, v6addr->addr, + sizeof(struct in6_addr)); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_delete_ip: deleting "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); +#else + /* IPv6 not enabled! No "action" needed; just ack it */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_delete_ip: v6 disabled- ignoring: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + /* just respond with a "success" ASCONF-ACK */ + return NULL; +#endif + break; + default: + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph, + aparam_length); + return m_reply; + } + + /* make sure the source address is not being deleted */ + if (sctp_cmpaddr(sa, (struct sockaddr *)&sa_source)) { + /* trying to delete the source address! */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete source addr\n"); + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_DELETING_SRC_ADDR, (uint8_t *) aph, + aparam_length); + return m_reply; + } + /* if deleting 0.0.0.0/::0, delete all addresses except src addr */ + if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { + result = sctp_asconf_del_remote_addrs_except(stcb, + (struct sockaddr *)&sa_source); + + if (result) { + /* src address did not exist? */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: src addr does not exist?\n"); + /* what error to reply with?? 
*/ + m_reply = + sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_REQUEST_REFUSED, (uint8_t *) aph, + aparam_length); + } else if (response_required) { + m_reply = + sctp_asconf_success_response(aph->correlation_id); + } + return m_reply; + } + /* delete the address */ + result = sctp_del_remote_addr(stcb, sa); + /* + * note if result == -2, the address doesn't exist in the asoc but + * since it's being deleted anyways, we just ack the delete -- but + * this probably means something has already gone awry + */ + if (result == -1) { + /* only one address in the asoc */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_delete_ip: tried to delete last IP addr!\n"); + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_DELETING_LAST_ADDR, (uint8_t *) aph, + aparam_length); + } else { + if (response_required) { + m_reply = sctp_asconf_success_response(aph->correlation_id); + } + /* notify upper layer */ + sctp_ulp_notify(SCTP_NOTIFY_ASCONF_DELETE_IP, stcb, 0, sa, SCTP_SO_NOT_LOCKED); + } + return m_reply; +} + +static struct mbuf * +sctp_process_asconf_set_primary(struct mbuf *m, + struct sctp_asconf_paramhdr *aph, + struct sctp_tcb *stcb, int response_required) +{ + struct mbuf *m_reply = NULL; + struct sockaddr_storage sa_source, sa_store; + struct sctp_ipv4addr_param *v4addr; + uint16_t param_type, param_length, aparam_length; + struct sockaddr *sa; + struct sockaddr_in *sin; + int zero_address = 0; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sctp_ipv6addr_param *v6addr; + +#endif /* INET6 */ + + aparam_length = ntohs(aph->ph.param_length); + v4addr = (struct sctp_ipv4addr_param *)(aph + 1); +#ifdef INET6 + v6addr = (struct sctp_ipv6addr_param *)(aph + 1); +#endif /* INET6 */ + param_type = ntohs(v4addr->ph.param_type); + param_length = ntohs(v4addr->ph.param_length); + + sa = (struct sockaddr *)&sa_store; + switch (param_type) { + case SCTP_IPV4_ADDRESS: + if (param_length != sizeof(struct sctp_ipv4addr_param)) { + /* invalid param size */ + return NULL; + } + sin = (struct sockaddr_in *)&sa_store; + bzero(sin, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_addr.s_addr = v4addr->addr; + if (sin->sin_addr.s_addr == INADDR_ANY) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + break; + case SCTP_IPV6_ADDRESS: + if (param_length != sizeof(struct sctp_ipv6addr_param)) { + /* invalid param size */ + return NULL; + } +#ifdef INET6 + sin6 = (struct sockaddr_in6 *)&sa_store; + bzero(sin6, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + memcpy((caddr_t)&sin6->sin6_addr, v6addr->addr, + sizeof(struct in6_addr)); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + zero_address = 1; + SCTPDBG(SCTP_DEBUG_ASCONF1, "process_asconf_set_primary: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); +#else + /* IPv6 not enabled! 
No "action" needed; just ack it */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_set_primary: v6 disabled- ignoring: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + /* just respond with a "success" ASCONF-ACK */ + return NULL; +#endif + break; + default: + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph, + aparam_length); + return m_reply; + } + + /* if 0.0.0.0/::0, use the source address instead */ + if (zero_address && SCTP_BASE_SYSCTL(sctp_nat_friendly)) { + sa = (struct sockaddr *)&sa_source; + sctp_asconf_get_source_ip(m, sa); + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_set_primary: using source addr "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + } + /* set the primary address */ + if (sctp_set_primary_addr(stcb, sa, NULL) == 0) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_set_primary: primary address set\n"); + /* notify upper layer */ + sctp_ulp_notify(SCTP_NOTIFY_ASCONF_SET_PRIMARY, stcb, 0, sa, SCTP_SO_NOT_LOCKED); + + if (response_required) { + m_reply = sctp_asconf_success_response(aph->correlation_id); + } + /* + * Mobility adaptation. Ideally, when the reception of SET + * PRIMARY with DELETE IP ADDRESS of the previous primary + * destination, unacknowledged DATA are retransmitted + * immediately to the new primary destination for seamless + * handover. If the destination is UNCONFIRMED and marked to + * REQ_PRIM, The retransmission occur when reception of the + * HEARTBEAT-ACK. (See sctp_handle_heartbeat_ack in + * sctp_input.c) Also, when change of the primary + * destination, it is better that all subsequent new DATA + * containing already queued DATA are transmitted to the new + * primary destination. (by micchie) + */ + if ((sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE) || + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) && + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_PRIM_DELETED) && + (stcb->asoc.primary_destination->dest_state & + SCTP_ADDR_UNCONFIRMED) == 0) { + + sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7); + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + sctp_assoc_immediate_retrans(stcb, + stcb->asoc.primary_destination); + } + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE)) { + sctp_move_chunks_from_net(stcb, + stcb->asoc.deleted_primary); + } + sctp_delete_prim_timer(stcb->sctp_ep, stcb, + stcb->asoc.deleted_primary); + } + } else { + /* couldn't set the requested primary address! */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_asconf_set_primary: set primary failed!\n"); + /* must have been an invalid address, so report */ + m_reply = sctp_asconf_error_response(aph->correlation_id, + SCTP_CAUSE_UNRESOLVABLE_ADDR, (uint8_t *) aph, + aparam_length); + } + + return m_reply; +} + +/* + * handles an ASCONF chunk. + * if all parameters are processed ok, send a plain (empty) ASCONF-ACK + */ +void +sctp_handle_asconf(struct mbuf *m, unsigned int offset, + struct sctp_asconf_chunk *cp, struct sctp_tcb *stcb, + int first) +{ + struct sctp_association *asoc; + uint32_t serial_num; + struct mbuf *n, *m_ack, *m_result, *m_tail; + struct sctp_asconf_ack_chunk *ack_cp; + struct sctp_asconf_paramhdr *aph, *ack_aph; + struct sctp_ipv6addr_param *p_addr; + unsigned int asconf_limit; + int error = 0; /* did an error occur? 
*/ + + /* asconf param buffer */ + uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_asconf_ack *ack, *ack_next; + + /* verify minimum length */ + if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_chunk)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf: chunk too small = %xh\n", + ntohs(cp->ch.chunk_length)); + return; + } + asoc = &stcb->asoc; + serial_num = ntohl(cp->serial_number); + + if (compare_with_wrap(asoc->asconf_seq_in, serial_num, MAX_SEQ) || + serial_num == asoc->asconf_seq_in) { + /* got a duplicate ASCONF */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf: got duplicate serial number = %xh\n", + serial_num); + return; + } else if (serial_num != (asoc->asconf_seq_in + 1)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: incorrect serial number = %xh (expected next = %xh)\n", + serial_num, asoc->asconf_seq_in + 1); + return; + } + /* it's the expected "next" sequence number, so process it */ + asoc->asconf_seq_in = serial_num; /* update sequence */ + /* get length of all the params in the ASCONF */ + asconf_limit = offset + ntohs(cp->ch.chunk_length); + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf: asconf_limit=%u, sequence=%xh\n", + asconf_limit, serial_num); + + if (first) { + /* delete old cache */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: Now processing first ASCONF. Try to delete old cache\n"); + + ack = TAILQ_FIRST(&stcb->asoc.asconf_ack_sent); + while (ack != NULL) { + ack_next = TAILQ_NEXT(ack, next); + if (ack->serial_number == serial_num) + break; + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: delete old(%u) < first(%u)\n", + ack->serial_number, serial_num); + TAILQ_REMOVE(&stcb->asoc.asconf_ack_sent, ack, next); + if (ack->data != NULL) { + sctp_m_freem(ack->data); + } + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), ack); + ack = ack_next; + } + } + m_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_ack_chunk), 0, + M_DONTWAIT, 1, MT_DATA); + if (m_ack == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf: couldn't get mbuf!\n"); + return; + } + m_tail = m_ack; /* current reply chain's tail */ + + /* fill in ASCONF-ACK header */ + ack_cp = mtod(m_ack, struct sctp_asconf_ack_chunk *); + ack_cp->ch.chunk_type = SCTP_ASCONF_ACK; + ack_cp->ch.chunk_flags = 0; + ack_cp->serial_number = htonl(serial_num); + /* set initial lengths (e.g. just an ASCONF-ACK), ntohx at the end! */ + SCTP_BUF_LEN(m_ack) = sizeof(struct sctp_asconf_ack_chunk); + ack_cp->ch.chunk_length = sizeof(struct sctp_asconf_ack_chunk); + + /* skip the lookup address parameter */ + offset += sizeof(struct sctp_asconf_chunk); + p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), (uint8_t *) & aparam_buf); + if (p_addr == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf: couldn't get lookup addr!\n"); + /* respond with a missing/invalid mandatory parameter error */ + return; + } + /* param_length is already validated in process_control...
*/ + offset += ntohs(p_addr->ph.param_length); /* skip lookup addr */ + + /* get pointer to first asconf param in ASCONF-ACK */ + ack_aph = (struct sctp_asconf_paramhdr *)(mtod(m_ack, caddr_t)+sizeof(struct sctp_asconf_ack_chunk)); + if (ack_aph == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "Gak in asconf2\n"); + return; + } + /* get pointer to first asconf param in ASCONF */ + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_asconf_paramhdr), (uint8_t *) & aparam_buf); + if (aph == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "Empty ASCONF received?\n"); + goto send_reply; + } + /* process through all parameters */ + while (aph != NULL) { + unsigned int param_length, param_type; + + param_type = ntohs(aph->ph.param_type); + param_length = ntohs(aph->ph.param_length); + if (offset + param_length > asconf_limit) { + /* parameter goes beyond end of chunk! */ + sctp_m_freem(m_ack); + return; + } + m_result = NULL; + + if (param_length > sizeof(aparam_buf)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) larger than buffer size!\n", param_length); + sctp_m_freem(m_ack); + return; + } + if (param_length <= sizeof(struct sctp_paramhdr)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: param length (%u) too short\n", param_length); + sctp_m_freem(m_ack); + return; + } + /* get the entire parameter */ + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); + if (aph == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: couldn't get entire param\n"); + sctp_m_freem(m_ack); + return; + } + switch (param_type) { + case SCTP_ADD_IP_ADDRESS: + asoc->peer_supports_asconf = 1; + m_result = sctp_process_asconf_add_ip(m, aph, stcb, + error); + break; + case SCTP_DEL_IP_ADDRESS: + asoc->peer_supports_asconf = 1; + m_result = sctp_process_asconf_delete_ip(m, aph, stcb, + error); + break; + case SCTP_ERROR_CAUSE_IND: + /* not valid in an ASCONF chunk */ + break; + case SCTP_SET_PRIM_ADDR: + asoc->peer_supports_asconf = 1; + m_result = sctp_process_asconf_set_primary(m, aph, + stcb, error); + break; + case SCTP_NAT_VTAGS: + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: sees a NAT VTAG state parameter\n"); + break; + case SCTP_SUCCESS_REPORT: + /* not valid in an ASCONF chunk */ + break; + case SCTP_ULP_ADAPTATION: + /* FIX */ + break; + default: + if ((param_type & 0x8000) == 0) { + /* Been told to STOP at this param */ + asconf_limit = offset; + /* + * FIX FIX - We need to call + * sctp_arethere_unrecognized_parameters() + * to get an operr and send it for any + * params with the 0x4000 bit set OR do it + * here ourselves... note we still must STOP + * if the 0x8000 bit is clear.
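+ * + * (Editor's note, not part of the original source: RFC 4960 encodes the + * handling of an unrecognized parameter in the two high-order bits of the + * parameter type, roughly + * + *	if (param_type & 0x8000)	skip it and keep processing + *	if (param_type & 0x4000)	also report it in an operational error + * + * which is why processing stops here only when the 0x8000 bit is clear.)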
+ */ + } + /* unknown/invalid param type */ + break; + } /* switch */ + + /* add any (error) result to the reply mbuf chain */ + if (m_result != NULL) { + SCTP_BUF_NEXT(m_tail) = m_result; + m_tail = m_result; + /* update lengths, make sure it's aligned too */ + SCTP_BUF_LEN(m_result) = SCTP_SIZE32(SCTP_BUF_LEN(m_result)); + ack_cp->ch.chunk_length += SCTP_BUF_LEN(m_result); + /* set flag to force success reports */ + error = 1; + } + offset += SCTP_SIZE32(param_length); + /* update remaining ASCONF message length to process */ + if (offset >= asconf_limit) { + /* no more data in the mbuf chain */ + break; + } + /* get pointer to next asconf param */ + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_asconf_paramhdr), + (uint8_t *) & aparam_buf); + if (aph == NULL) { + /* can't get an asconf paramhdr */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: can't get asconf param hdr!\n"); + /* FIX ME - add error here... */ + } + } + +send_reply: + ack_cp->ch.chunk_length = htons(ack_cp->ch.chunk_length); + /* save the ASCONF-ACK reply */ + ack = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asconf_ack), + struct sctp_asconf_ack); + if (ack == NULL) { + sctp_m_freem(m_ack); + return; + } + ack->serial_number = serial_num; + ack->last_sent_to = NULL; + ack->data = m_ack; + ack->len = 0; + n = m_ack; + while (n) { + ack->len += SCTP_BUF_LEN(n); + n = SCTP_BUF_NEXT(n); + } + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_ack_sent, ack, next); + + /* see if last_control_chunk_from is set properly (use IP src addr) */ + if (stcb->asoc.last_control_chunk_from == NULL) { + /* + * this could happen if the source address was just newly + * added + */ + struct ip *iph; + struct sctphdr *sh; + struct sockaddr_storage from_store; + struct sockaddr *from = (struct sockaddr *)&from_store; + + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: looking up net for IP source address\n"); + /* pullup already done, IP options already stripped */ + iph = mtod(m, struct ip *); + sh = (struct sctphdr *)((caddr_t)iph + sizeof(*iph)); + switch (iph->ip_v) { + case IPVERSION: + { + struct sockaddr_in *from4; + + from4 = (struct sockaddr_in *)&from_store; + bzero(from4, sizeof(*from4)); + from4->sin_family = AF_INET; + from4->sin_len = sizeof(struct sockaddr_in); + from4->sin_addr.s_addr = iph->ip_src.s_addr; + from4->sin_port = sh->src_port; + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + struct ip6_hdr *ip6; + struct sockaddr_in6 *from6; + + ip6 = mtod(m, struct ip6_hdr *); + from6 = (struct sockaddr_in6 *)&from_store; + bzero(from6, sizeof(*from6)); + from6->sin6_family = AF_INET6; + from6->sin6_len = sizeof(struct sockaddr_in6); + from6->sin6_addr = ip6->ip6_src; + from6->sin6_port = sh->src_port; + /* + * Get the scopes in properly to the sin6 + * addr's + */ + /* we probably don't need these operations */ + (void)sa6_recoverscope(from6); + sa6_embedscope(from6, + MODULE_GLOBAL(ip6_use_defzone)); + + break; + } +#endif + default: + /* unknown address type */ + from = NULL; + } + if (from != NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "Looking for IP source: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, from); + /* look up the from address */ + stcb->asoc.last_control_chunk_from = sctp_findnet(stcb, from); +#ifdef SCTP_DEBUG + if (stcb->asoc.last_control_chunk_from == NULL) + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf: IP source address not found?!\n"); +#endif + } + } +} + +/* + * does the address match? 
returns 0 if not, 1 if so + */ +static uint32_t +sctp_asconf_addr_match(struct sctp_asconf_addr *aa, struct sockaddr *sa) +{ +#ifdef INET6 + if (sa->sa_family == AF_INET6) { + /* IPv6 sa address */ + /* XXX scopeid */ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; + + if ((aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) && + (memcmp(&aa->ap.addrp.addr, &sin6->sin6_addr, + sizeof(struct in6_addr)) == 0)) { + return (1); + } + } else +#endif /* INET6 */ + if (sa->sa_family == AF_INET) { + /* IPv4 sa address */ + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + + if ((aa->ap.addrp.ph.param_type == SCTP_IPV4_ADDRESS) && + (memcmp(&aa->ap.addrp.addr, &sin->sin_addr, + sizeof(struct in_addr)) == 0)) { + return (1); + } + } + return (0); +} + +/* + * does the address match? returns 0 if not, 1 if so + */ +static uint32_t +sctp_addr_match( + struct sctp_ipv6addr_param *v6addr, + struct sockaddr *sa) +{ + uint16_t param_type, param_length; + struct sctp_ipv4addr_param *v4addr = (struct sctp_ipv4addr_param *)v6addr; + +#ifdef INET6 + if (sa->sa_family == AF_INET6) { + /* IPv6 sa address */ + /* XXX scopeid */ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; + + param_type = ntohs(v6addr->ph.param_type); + param_length = ntohs(v6addr->ph.param_length); + + if ((param_type == SCTP_IPV6_ADDRESS) && + param_length == sizeof(struct sctp_ipv6addr_param) && + (memcmp(&v6addr->addr, &sin6->sin6_addr, + sizeof(struct in6_addr)) == 0)) { + return (1); + } + } +#endif + if (sa->sa_family == AF_INET) { + /* IPv4 sa address */ + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + + param_type = ntohs(v4addr->ph.param_type); + param_length = ntohs(v4addr->ph.param_length); + + if ((param_type == SCTP_IPV4_ADDRESS) && + param_length == sizeof(struct sctp_ipv4addr_param) && + (memcmp(&v4addr->addr, &sin->sin_addr, + sizeof(struct in_addr)) == 0)) { + return (1); + } + } + return (0); +} + +/* + * Cleanup for non-responded/OP ERR'd ASCONF + */ +void +sctp_asconf_cleanup(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /* mark peer as ASCONF incapable */ + stcb->asoc.peer_supports_asconf = 0; + /* + * clear out any existing asconfs going out + */ + sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_2); + stcb->asoc.asconf_seq_out_acked = stcb->asoc.asconf_seq_out; + /* remove the old ASCONF on our outbound queue */ + sctp_toss_old_asconf(stcb); +} + +/* + * cleanup any cached source addresses that may be topologically + * incorrect after a new address has been added to this interface. + */ +static void +sctp_asconf_nets_cleanup(struct sctp_tcb *stcb, struct sctp_ifn *ifn) +{ + struct sctp_nets *net; + + /* + * Ideally, we want to only clear cached routes and source addresses + * that are topologically incorrect. But since there is no easy way + * to know whether the newly added address on the ifn would cause a + * routing change (i.e. a new egress interface would be chosen) + * without doing a new routing lookup and source address selection, + * we will (for now) just flush any cached route using a different + * ifn (and cached source addrs) and let output re-choose them + * during the next send on that net. + */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + /* + * clear any cached route (and cached source address) if the + * route's interface is NOT the same as the address change. + * If it's the same interface, just clear the cached source + * address. 
+ */ + if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro) && + ((ifn == NULL) || + (SCTP_GET_IF_INDEX_FROM_ROUTE(&net->ro) != ifn->ifn_index))) { + /* clear any cached route */ + RTFREE(net->ro.ro_rt); + net->ro.ro_rt = NULL; + } + /* clear any cached source address */ + if (net->src_addr_selected) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + } + } +} + + +void +sctp_assoc_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *dstnet) +{ + int error; + + if (dstnet->dest_state & SCTP_ADDR_UNCONFIRMED) { + return; + } + if (stcb->asoc.deleted_primary == NULL) { + return; + } + if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "assoc_immediate_retrans: Deleted primary is "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa); + SCTPDBG(SCTP_DEBUG_ASCONF1, "Current Primary is "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.primary_destination->ro._l_addr.sa); + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, + stcb->asoc.deleted_primary, + SCTP_FROM_SCTP_TIMER + SCTP_LOC_8); + stcb->asoc.num_send_timers_up--; + if (stcb->asoc.num_send_timers_up < 0) { + stcb->asoc.num_send_timers_up = 0; + } + SCTP_TCB_LOCK_ASSERT(stcb); + error = sctp_t3rxt_timer(stcb->sctp_ep, stcb, + stcb->asoc.deleted_primary); + if (error) { + SCTP_INP_DECR_REF(stcb->sctp_ep); + return; + } + SCTP_TCB_LOCK_ASSERT(stcb); +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, stcb->sctp_ep, stcb, stcb->asoc.deleted_primary); +#endif + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); + if ((stcb->asoc.num_send_timers_up == 0) && + (stcb->asoc.sent_queue_cnt > 0)) { + struct sctp_tmit_chunk *chk; + + chk = TAILQ_FIRST(&stcb->asoc.sent_queue); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, chk->whoTo); + } + } + return; +} + +static int + sctp_asconf_queue_mgmt(struct sctp_tcb *, struct sctp_ifa *, uint16_t); + +void +sctp_net_immediate_retrans(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sctp_tmit_chunk *chk; + + SCTPDBG(SCTP_DEBUG_ASCONF1, "net_immediate_retrans: RTO is %d\n", net->RTO); + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_TIMER + SCTP_LOC_5); + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); + net->error_count = 0; + TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { + if (chk->whoTo == net) { + if (chk->sent < SCTP_DATAGRAM_RESEND) { + chk->sent = SCTP_DATAGRAM_RESEND; + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + sctp_flight_size_decrease(chk); + sctp_total_flight_decrease(stcb, chk); + net->marked_retrans++; + stcb->asoc.marked_retrans++; + } + } + } + if (net->marked_retrans) { + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); + } +} + +static void +sctp_path_check_and_react(struct sctp_tcb *stcb, struct sctp_ifa *newifa) +{ + struct sctp_nets *net; + int addrnum, changed; + + /* + * If the number of local valid addresses is 1, the valid address is + * probably the newly added address. If several valid addresses exist + * in this association, a source address may not be changed. + * Additionally, they can be configured on the same interface as + * "alias" addresses.
+ * (by micchie) + */ + addrnum = sctp_local_addr_count(stcb); + SCTPDBG(SCTP_DEBUG_ASCONF1, "p_check_react(): %d local addresses\n", + addrnum); + if (addrnum == 1) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + /* clear any cached route and source address */ + if (net->ro.ro_rt) { + RTFREE(net->ro.ro_rt); + net->ro.ro_rt = NULL; + } + if (net->src_addr_selected) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + } + /* Retransmit unacknowledged DATA chunks immediately */ + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + sctp_net_immediate_retrans(stcb, net); + } + /* also, a SET PRIMARY may already have been sent */ + } + return; + } + /* Multiple local addresses exist in the association. */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + /* clear any cached route and source address */ + if (net->ro.ro_rt) { + RTFREE(net->ro.ro_rt); + net->ro.ro_rt = NULL; + } + if (net->src_addr_selected) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + } + /* + * Check whether the nexthop corresponds to the new address. + * If the new address corresponds to the current nexthop, the + * path will be changed. If the new address does NOT + * correspond to the current nexthop, the path will not be + * changed. + */ + SCTP_RTALLOC((sctp_route_t *) & net->ro, + stcb->sctp_ep->def_vrf_id); + if (net->ro.ro_rt == NULL) + continue; + + changed = 0; + if (net->ro._l_addr.sa.sa_family == AF_INET) { + if (sctp_v4src_match_nexthop(newifa, (sctp_route_t *) & net->ro)) + changed = 1; + } +#ifdef INET6 + if (net->ro._l_addr.sa.sa_family == AF_INET6) { + if (sctp_v6src_match_nexthop( + &newifa->address.sin6, (sctp_route_t *) & net->ro)) + changed = 1; + } +#endif + /* + * if the newly added address is not related to the current + * routing information (nexthop), we skip it. + */ + if (changed == 0) + continue; + /* Retransmit unacknowledged DATA chunks immediately */ + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + sctp_net_immediate_retrans(stcb, net); + } + /* Send SET PRIMARY for this new address */ + if (net == stcb->asoc.primary_destination) { + (void)sctp_asconf_queue_mgmt(stcb, newifa, + SCTP_SET_PRIM_ADDR); + } + } +} + +/* + * process an ADD/DELETE IP ack from peer. + * addr: corresponding sctp_ifa to the address being added/deleted. + * type: SCTP_ADD_IP_ADDRESS or SCTP_DEL_IP_ADDRESS. + * flag: 1=success, 0=failure. + */ +static void +sctp_asconf_addr_mgmt_ack(struct sctp_tcb *stcb, struct sctp_ifa *addr, + uint16_t type, uint32_t flag) +{ + /* + * do the necessary asoc list work: if we get a failure indication, + * leave the address on the assoc's restricted list. If we get a + * success indication, remove the address from the restricted list. + */ + /* + * Note: this will only occur for ADD_IP_ADDRESS, since + * DEL_IP_ADDRESS is never actually added to the list... + */ + if (flag) { + /* success case, so remove from the restricted list */ + sctp_del_local_addr_restricted(stcb, addr); + + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE) || + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + sctp_path_check_and_react(stcb, addr); + return; + } + /* clear any cached/topologically incorrect source addresses */ + sctp_asconf_nets_cleanup(stcb, addr->ifn_p); + } + /* else, leave it on the list */ +} + +/* + * add an asconf add/delete/set primary IP address parameter to the queue.
+ * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. + * returns 0 if queued, -1 if not queued/removed. + * NOTE: if adding, but a delete for the same address is already scheduled + * (and not yet sent out), simply remove it from queue. Same for deleting + * an address already scheduled for add. If a duplicate operation is found, + * ignore the new one. + */ +static int +sctp_asconf_queue_mgmt(struct sctp_tcb *stcb, struct sctp_ifa *ifa, + uint16_t type) +{ + struct sctp_asconf_addr *aa, *aa_next; + struct sockaddr *sa; + + /* make sure the request isn't already in the queue */ + for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL; + aa = aa_next) { + aa_next = TAILQ_NEXT(aa, next); + /* address match? */ + if (sctp_asconf_addr_match(aa, &ifa->address.sa) == 0) + continue; + /* + * is the request already in queue but not sent? Skip over + * requests that were already sent, to handle this case: + * 1. an ADD arrives and is sent; 2. a DEL arrives (we can't + * remove the ADD request already sent); 3. another ADD + * arrives. + */ + if (aa->ap.aph.ph.param_type == type && aa->sent == 0) { + return (-1); + } + /* is the negative request already in queue, and not sent */ + if ((aa->sent == 0) && (type == SCTP_ADD_IP_ADDRESS) && + (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS)) { + /* add requested, delete already queued */ + TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); + /* remove the ifa from the restricted list */ + sctp_del_local_addr_restricted(stcb, ifa); + /* free the asconf param */ + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: add removes queued entry\n"); + return (-1); + } + if ((aa->sent == 0) && (type == SCTP_DEL_IP_ADDRESS) && + (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS)) { + /* delete requested, add already queued */ + TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); + /* remove the aa->ifa from the restricted list */ + sctp_del_local_addr_restricted(stcb, aa->ifa); + /* free the asconf param */ + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_mgmt: delete removes queued entry\n"); + return (-1); + } + } /* for each aa */ + + /* adding new request to the queue */ + SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), + SCTP_M_ASC_ADDR); + if (aa == NULL) { + /* didn't get memory */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "asconf_queue_mgmt: failed to get memory!\n"); + return (-1); + } + aa->special_del = 0; + /* fill in asconf address parameter fields */ + /* top level elements are "networked" during send */ + aa->ap.aph.ph.param_type = type; + aa->ifa = ifa; + atomic_add_int(&ifa->refcount, 1); + /* correlation_id filled in during send routine later...
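+ * (the id assigned at send time is echoed back unchanged by + * the peer and is used by sctp_asconf_find_param() to match + * ASCONF-ACK results to these queued parameters)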
*/ + if (ifa->address.sa.sa_family == AF_INET6) { + /* IPv6 address */ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ifa->address.sa; + sa = (struct sockaddr *)sin6; + aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; + aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + + sizeof(struct sctp_ipv6addr_param); + memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, + sizeof(struct in6_addr)); + } else if (ifa->address.sa.sa_family == AF_INET) { + /* IPv4 address */ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ifa->address.sa; + sa = (struct sockaddr *)sin; + aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; + aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + + sizeof(struct sctp_ipv4addr_param); + memcpy(&aa->ap.addrp.addr, &sin->sin_addr, + sizeof(struct in_addr)); + } else { + /* invalid family! */ + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + sctp_free_ifa(ifa); + return (-1); + } + aa->sent = 0; /* clear sent flag */ + + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); +#ifdef SCTP_DEBUG + if (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_ASCONF2) { + if (type == SCTP_ADD_IP_ADDRESS) { + SCTP_PRINTF("asconf_queue_mgmt: inserted asconf ADD_IP_ADDRESS: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); + } else if (type == SCTP_DEL_IP_ADDRESS) { + SCTP_PRINTF("asconf_queue_mgmt: appended asconf DEL_IP_ADDRESS: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); + } else { + SCTP_PRINTF("asconf_queue_mgmt: appended asconf SET_PRIM_ADDR: "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); + } + } +#endif + + return (0); +} + + +/* + * add an asconf operation for the given ifa and type. + * type = SCTP_ADD_IP_ADDRESS, SCTP_DEL_IP_ADDRESS, SCTP_SET_PRIM_ADDR. + * returns 0 if completed, -1 if not completed, 1 if immediate send is + * advisable. + */ +static int +sctp_asconf_queue_add(struct sctp_tcb *stcb, struct sctp_ifa *ifa, + uint16_t type) +{ + uint32_t status; + int pending_delete_queued = 0; + + /* see if peer supports ASCONF */ + if (stcb->asoc.peer_supports_asconf == 0) { + return (-1); + } + /* + * if this is deleting the last address from the assoc, mark it as + * pending. + */ + if ((type == SCTP_DEL_IP_ADDRESS) && !stcb->asoc.asconf_del_pending && + (sctp_local_addr_count(stcb) < 2)) { + /* set the pending delete info only */ + stcb->asoc.asconf_del_pending = 1; + stcb->asoc.asconf_addr_del_pending = ifa; + atomic_add_int(&ifa->refcount, 1); + SCTPDBG(SCTP_DEBUG_ASCONF2, + "asconf_queue_add: mark delete last address pending\n"); + return (-1); + } + /* queue an asconf parameter */ + status = sctp_asconf_queue_mgmt(stcb, ifa, type); + + /* + * if this is an add, and there is a delete also pending (i.e. the + * last local address is being changed), queue the pending delete + * too.
+ */ + if ((type == SCTP_ADD_IP_ADDRESS) && stcb->asoc.asconf_del_pending && (status == 0)) { + /* queue in the pending delete */ + if (sctp_asconf_queue_mgmt(stcb, + stcb->asoc.asconf_addr_del_pending, + SCTP_DEL_IP_ADDRESS) == 0) { + SCTPDBG(SCTP_DEBUG_ASCONF2, "asconf_queue_add: queuing pending delete\n"); + pending_delete_queued = 1; + /* clear out the pending delete info */ + stcb->asoc.asconf_del_pending = 0; + sctp_free_ifa(stcb->asoc.asconf_addr_del_pending); + stcb->asoc.asconf_addr_del_pending = NULL; + } + } + if (pending_delete_queued) { + struct sctp_nets *net; + + /* + * since we know that the only/last address is now being + * changed in this case, reset the cwnd/rto on all nets to + * start as a new address and path. Also clear the error + * counts to give the assoc the best chance to complete the + * address change. + */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, + net); + net->RTO = 0; + net->error_count = 0; + } + stcb->asoc.overall_error_count = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_ASCONF, + __LINE__); + } + /* queue in an advisory set primary too */ + (void)sctp_asconf_queue_mgmt(stcb, ifa, SCTP_SET_PRIM_ADDR); + /* let caller know we should send this out immediately */ + status = 1; + } + return (status); +} + +/*- + * add an asconf delete IP address parameter to the queue by sockaddr and + * possibly with no sctp_ifa available. This is only called by the routine + * that checks the addresses in an INIT-ACK against the current address list. + * returns 0 if completed, non-zero if not completed. + * NOTE: if an add is already scheduled (and not yet sent out), simply + * remove it from queue. If a duplicate operation is found, ignore the + * new one. + */ +static int +sctp_asconf_queue_sa_delete(struct sctp_tcb *stcb, struct sockaddr *sa) +{ + struct sctp_ifa *ifa; + struct sctp_asconf_addr *aa, *aa_next; + uint32_t vrf_id; + + if (stcb == NULL) { + return (-1); + } + /* see if peer supports ASCONF */ + if (stcb->asoc.peer_supports_asconf == 0) { + return (-1); + } + /* make sure the request isn't already in the queue */ + for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL; + aa = aa_next) { + aa_next = TAILQ_NEXT(aa, next); + /* address match?
*/ + if (sctp_asconf_addr_match(aa, sa) == 0) + continue; + /* is the request already in queue (sent or not) */ + if (aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { + return (-1); + } + /* is the negative request already in queue, and not sent */ + if (aa->sent == 1) + continue; + if (aa->ap.aph.ph.param_type == SCTP_ADD_IP_ADDRESS) { + /* add already queued, so remove existing entry */ + TAILQ_REMOVE(&stcb->asoc.asconf_queue, aa, next); + sctp_del_local_addr_restricted(stcb, aa->ifa); + /* free the entry */ + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + return (-1); + } + } /* for each aa */ + + /* find any existing ifa-- NOTE ifa CAN be allowed to be NULL */ + if (stcb) { + vrf_id = stcb->asoc.vrf_id; + } else { + vrf_id = SCTP_DEFAULT_VRFID; + } + ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); + + /* adding new request to the queue */ + SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), + SCTP_M_ASC_ADDR); + if (aa == NULL) { + /* didn't get memory */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_queue_sa_delete: failed to get memory!\n"); + return (-1); + } + aa->special_del = 0; + /* fill in asconf address parameter fields */ + /* top level elements are "networked" during send */ + aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; + aa->ifa = ifa; + if (ifa) + atomic_add_int(&ifa->refcount, 1); + /* correlation_id filled in during send routine later... */ + if (sa->sa_family == AF_INET6) { + /* IPv6 address */ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)sa; + aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; + aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv6addr_param)); + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv6addr_param); + memcpy(&aa->ap.addrp.addr, &sin6->sin6_addr, + sizeof(struct in6_addr)); + } else if (sa->sa_family == AF_INET) { + /* IPv4 address */ + struct sockaddr_in *sin = (struct sockaddr_in *)sa; + + aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; + aa->ap.addrp.ph.param_length = (sizeof(struct sctp_ipv4addr_param)); + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_paramhdr) + sizeof(struct sctp_ipv4addr_param); + memcpy(&aa->ap.addrp.addr, &sin->sin_addr, + sizeof(struct in_addr)); + } else { + /* invalid family! 
*/ + SCTP_FREE(aa, SCTP_M_ASC_ADDR); + if (ifa) + sctp_free_ifa(ifa); + return (-1); + } + aa->sent = 0; /* clear sent flag */ + + /* delete goes to the back of the queue */ + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + + /* sa_ignore MEMLEAK {memory is put on the tailq} */ + return (0); +} + +/* + * find a specific asconf param on our "sent" queue + */ +static struct sctp_asconf_addr * +sctp_asconf_find_param(struct sctp_tcb *stcb, uint32_t correlation_id) +{ + struct sctp_asconf_addr *aa; + + TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { + if (aa->ap.aph.correlation_id == correlation_id && + aa->sent == 1) { + /* found it */ + return (aa); + } + } + /* didn't find it */ + return (NULL); +} + +/* + * process an SCTP_ERROR_CAUSE_IND for an ASCONF-ACK parameter and do + * notifications based on the error response + */ +static void +sctp_asconf_process_error(struct sctp_tcb *stcb, + struct sctp_asconf_paramhdr *aph) +{ + struct sctp_error_cause *eh; + struct sctp_paramhdr *ph; + uint16_t param_type; + uint16_t error_code; + + eh = (struct sctp_error_cause *)(aph + 1); + ph = (struct sctp_paramhdr *)(eh + 1); + /* validate lengths */ + if (ntohs(eh->length) + sizeof(struct sctp_error_cause) > + ntohs(aph->ph.param_length)) { + /* invalid error cause length */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_process_error: cause element too long\n"); + return; + } + if (ntohs(ph->param_length) + sizeof(struct sctp_paramhdr) > + ntohs(eh->length)) { + /* invalid included TLV length */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "asconf_process_error: included TLV too long\n"); + return; + } + /* which error code? */ + error_code = ntohs(eh->code); + param_type = ntohs(aph->ph.param_type); + /* FIX: this should go back up the REMOTE_ERROR ULP notify */ + switch (error_code) { + case SCTP_CAUSE_RESOURCE_SHORTAGE: + /* we allow ourselves to "try again" for this error */ + break; + default: + /* peer can't handle it... */ + switch (param_type) { + case SCTP_ADD_IP_ADDRESS: + case SCTP_DEL_IP_ADDRESS: + stcb->asoc.peer_supports_asconf = 0; + break; + case SCTP_SET_PRIM_ADDR: + stcb->asoc.peer_supports_asconf = 0; + break; + default: + break; + } + } +} + +/* + * process an asconf queue param. + * aparam: parameter to process, will be removed from the queue. + * flag: 1=success case, 0=failure case + */ +static void +sctp_asconf_process_param_ack(struct sctp_tcb *stcb, + struct sctp_asconf_addr *aparam, uint32_t flag) +{ + uint16_t param_type; + + /* process this param */ + param_type = aparam->ap.aph.ph.param_type; + switch (param_type) { + case SCTP_ADD_IP_ADDRESS: + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_param_ack: added IP address\n"); + sctp_asconf_addr_mgmt_ack(stcb, aparam->ifa, param_type, flag); + break; + case SCTP_DEL_IP_ADDRESS: + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_param_ack: deleted IP address\n"); + /* nothing really to do... lists already updated */ + break; + case SCTP_SET_PRIM_ADDR: + SCTPDBG(SCTP_DEBUG_ASCONF1, + "process_param_ack: set primary IP address\n"); + /* nothing to do...
peer may start using this addr */ + if (flag == 0) + stcb->asoc.peer_supports_asconf = 0; + break; + default: + /* should NEVER happen */ + break; + } + + /* remove the param and free it */ + TAILQ_REMOVE(&stcb->asoc.asconf_queue, aparam, next); + if (aparam->ifa) + sctp_free_ifa(aparam->ifa); + SCTP_FREE(aparam, SCTP_M_ASC_ADDR); +} + +/* + * cleanup from a bad asconf ack parameter + */ +static void +sctp_asconf_ack_clear(struct sctp_tcb *stcb) +{ + /* assume peer doesn't really know how to do asconfs */ + stcb->asoc.peer_supports_asconf = 0; + /* XXX we could free the pending queue here */ +} + +void +sctp_handle_asconf_ack(struct mbuf *m, int offset, + struct sctp_asconf_ack_chunk *cp, struct sctp_tcb *stcb, + struct sctp_nets *net, int *abort_no_unlock) +{ + struct sctp_association *asoc; + uint32_t serial_num; + uint16_t ack_length; + struct sctp_asconf_paramhdr *aph; + struct sctp_asconf_addr *aa, *aa_next; + uint32_t last_error_id = 0; /* last error correlation id */ + uint32_t id; + struct sctp_asconf_addr *ap; + + /* asconf param buffer */ + uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; + + /* verify minimum length */ + if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_asconf_ack_chunk)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "handle_asconf_ack: chunk too small = %xh\n", + ntohs(cp->ch.chunk_length)); + return; + } + asoc = &stcb->asoc; + serial_num = ntohl(cp->serial_number); + + /* + * NOTE: we may want to handle this differently- currently, we will + * abort when we get an ack for the expected serial number + 1 (eg. + * we didn't send it), process an ack normally if it is the expected + * serial number, and re-send the previous ack for *ALL* other + * serial numbers + */ + + /* + * if the serial number is the next expected, but I didn't send it, + * abort the asoc, since someone probably just hijacked us... + */ + if (serial_num == (asoc->asconf_seq_out + 1)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got unexpected next serial number! Aborting asoc!\n"); + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_CAUSE_ILLEGAL_ASCONF_ACK, NULL, SCTP_SO_NOT_LOCKED); + *abort_no_unlock = 1; + return; + } + if (serial_num != asoc->asconf_seq_out_acked + 1) { + /* got a duplicate/unexpected ASCONF-ACK */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "handle_asconf_ack: got duplicate/unexpected serial number = %xh (expected = %xh)\n", + serial_num, asoc->asconf_seq_out_acked + 1); + return; + } + if (serial_num == asoc->asconf_seq_out - 1) { + /* stop our timer */ + sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_ASCONF + SCTP_LOC_3); + } + /* process the ASCONF-ACK contents */ + ack_length = ntohs(cp->ch.chunk_length) - + sizeof(struct sctp_asconf_ack_chunk); + offset += sizeof(struct sctp_asconf_ack_chunk); + /* process through all parameters */ + while (ack_length >= sizeof(struct sctp_asconf_paramhdr)) { + unsigned int param_length, param_type; + + /* get pointer to next asconf parameter */ + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_asconf_paramhdr), aparam_buf); + if (aph == NULL) { + /* can't get an asconf paramhdr */ + sctp_asconf_ack_clear(stcb); + return; + } + param_type = ntohs(aph->ph.param_type); + param_length = ntohs(aph->ph.param_length); + if (param_length > ack_length) { + sctp_asconf_ack_clear(stcb); + return; + } + if (param_length < sizeof(struct sctp_paramhdr)) { + sctp_asconf_ack_clear(stcb); + return; + } + /* get the complete parameter... 
*/ + if (param_length > sizeof(aparam_buf)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "param length (%u) larger than buffer size!\n", param_length); + sctp_asconf_ack_clear(stcb); + return; + } + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(m, offset, param_length, aparam_buf); + if (aph == NULL) { + sctp_asconf_ack_clear(stcb); + return; + } + /* correlation_id is transparent to peer, no ntohl needed */ + id = aph->correlation_id; + + switch (param_type) { + case SCTP_ERROR_CAUSE_IND: + last_error_id = id; + /* find the corresponding asconf param in our queue */ + ap = sctp_asconf_find_param(stcb, id); + if (ap == NULL) { + /* hmm... can't find this in our queue! */ + break; + } + /* process the parameter, failed flag */ + sctp_asconf_process_param_ack(stcb, ap, 0); + /* process the error response */ + sctp_asconf_process_error(stcb, aph); + break; + case SCTP_SUCCESS_REPORT: + /* find the corresponding asconf param in our queue */ + ap = sctp_asconf_find_param(stcb, id); + if (ap == NULL) { + /* hmm... can't find this in our queue! */ + break; + } + /* process the parameter, success flag */ + sctp_asconf_process_param_ack(stcb, ap, 1); + break; + default: + break; + } /* switch */ + + /* update remaining ASCONF-ACK message length to process */ + if (SCTP_SIZE32(param_length) >= ack_length) { + /* no more data (don't let the padded length wrap ack_length) */ + break; + } + ack_length -= SCTP_SIZE32(param_length); + offset += SCTP_SIZE32(param_length); + } /* while */ + + /* + * if there are any "sent" params still on the queue, these are + * implicitly "success", or "failed" (if we got an error back) ... + * so process these appropriately + * + * we assume that the correlation_id's are monotonically increasing + * beginning from 1 and that we don't have *that* many outstanding + * at any given time + */ + if (last_error_id == 0) + last_error_id--; /* set to "max" value */ + for (aa = TAILQ_FIRST(&stcb->asoc.asconf_queue); aa != NULL; + aa = aa_next) { + aa_next = TAILQ_NEXT(aa, next); + if (aa->sent == 1) { + /* + * implicitly successful or failed: if correlation_id + * < last_error_id, then success; else, failure + */ + if (aa->ap.aph.correlation_id < last_error_id) + sctp_asconf_process_param_ack(stcb, aa, 1); + else + sctp_asconf_process_param_ack(stcb, aa, 0); + } else { + /* + * since we always process in order (FIFO queue) if + * we reach one that hasn't been sent, the rest + * should not have been sent either. so, we're + * done...
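+ * (e.g. if ids 1-3 were sent and only id 2 came back in an + * ERROR, last_error_id is 2: id 1 is treated as an implicit + * success and id 3 as a failure)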
+ */ + break; + } + } + + /* update the next sequence number to use */ + asoc->asconf_seq_out_acked++; + /* remove the old ASCONF on our outbound queue */ + sctp_toss_old_asconf(stcb); + if (!TAILQ_EMPTY(&stcb->asoc.asconf_queue)) { +#ifdef SCTP_TIMER_BASED_ASCONF + /* we have more params, so restart our timer */ + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, + stcb, net); +#else + /* we have more params, so send out more */ + sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED); +#endif + } +} + +#ifdef INET6 +static uint32_t +sctp_is_scopeid_in_nets(struct sctp_tcb *stcb, struct sockaddr *sa) +{ + struct sockaddr_in6 *sin6, *net6; + struct sctp_nets *net; + + if (sa->sa_family != AF_INET6) { + /* wrong family */ + return (0); + } + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) == 0) { + /* not link local address */ + return (0); + } + /* hunt through our destination nets list for this scope_id */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (((struct sockaddr *)(&net->ro._l_addr))->sa_family != + AF_INET6) + continue; + net6 = (struct sockaddr_in6 *)&net->ro._l_addr; + if (IN6_IS_ADDR_LINKLOCAL(&net6->sin6_addr) == 0) + continue; + if (sctp_is_same_scope(sin6, net6)) { + /* found one */ + return (1); + } + } + /* didn't find one */ + return (0); +} + +#endif + +/* + * address management functions + */ +static void +sctp_addr_mgmt_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_ifa *ifa, uint16_t type, int addr_locked) +{ + int status; + + + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0 && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { + /* subset bound, no ASCONF allowed case, so ignore */ + return; + } + /* + * note: we know this is not the subset-bound, no-ASCONF case; + * i.e. this is boundall or subset bound w/ASCONF allowed + */ + + /* first, make sure it's a good address family */ + if (ifa->address.sa.sa_family != AF_INET6 && + ifa->address.sa.sa_family != AF_INET) { + return; + } + /* make sure we're "allowed" to add this type of addr */ + if (ifa->address.sa.sa_family == AF_INET6) { + /* invalid if we're not a v6 endpoint */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) + return; + /* is the v6 addr really valid? */ + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + return; + } + } + /* put this address on the "pending/do not use yet" list */ + sctp_add_local_addr_restricted(stcb, ifa); + /* + * check address scope; if address is out of scope, don't queue + * anything... note: this would leave the address on both inp and + * asoc lists + */ + switch (ifa->address.sa.sa_family) { +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* we skip unspecified addresses */ + return; + } + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + if (stcb->asoc.local_scope == 0) { + return; + } + /* is it the right link local scope?
*/ + if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { + return; + } + } + if (stcb->asoc.site_scope == 0 && + IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { + return; + } + break; + } +#endif + case AF_INET: + { + struct sockaddr_in *sin; + struct in6pcb *inp6; + + inp6 = (struct in6pcb *)&inp->ip_inp.inp; + /* invalid if we are a v6 only endpoint */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp6)) + return; + + sin = (struct sockaddr_in *)&ifa->address.sa; + if (sin->sin_addr.s_addr == 0) { + /* we skip unspecified addresses */ + return; + } + if (stcb->asoc.ipv4_local_scope == 0 && + IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { + return; + } + break; + } + default: + /* else, not AF_INET or AF_INET6, so skip */ + return; + } + + /* queue an asconf for this address add/delete */ + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { + /* does the peer do asconf? */ + if (stcb->asoc.peer_supports_asconf) { + /* queue an asconf for this addr */ + status = sctp_asconf_queue_add(stcb, ifa, type); + + /* + * if queued ok, and in the open state, send out the + * ASCONF. If in the non-open state, these will be + * sent when the state goes open. + */ + if (status == 0 && + SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { +#ifdef SCTP_TIMER_BASED_ASCONF + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, + stcb, stcb->asoc.primary_destination); +#else + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + addr_locked); +#endif + } + } + } +} + + +int +sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, uint32_t val) +{ + struct sctp_asconf_iterator *asc; + struct sctp_ifa *ifa; + struct sctp_laddr *l; + int cnt_invalid = 0; + + asc = (struct sctp_asconf_iterator *)ptr; + LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { + ifa = l->ifa; + if (ifa->address.sa.sa_family == AF_INET6) { + /* invalid if we're not a v6 endpoint */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return (1); + else + continue; + } + } else if (ifa->address.sa.sa_family == AF_INET) { + /* invalid if we are a v6 only endpoint */ + struct in6pcb *inp6; + + inp6 = (struct in6pcb *)&inp->ip_inp.inp; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp6)) { + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return (1); + else + continue; + } + } else { + /* invalid address family */ + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return (1); + else + continue; + } + } + return (0); +} + +static int +sctp_asconf_iterator_ep_end(struct sctp_inpcb *inp, void *ptr, uint32_t val) +{ + struct sctp_ifa *ifa; + struct sctp_asconf_iterator *asc; + struct sctp_laddr *laddr, *nladdr, *l; + + /* Only for specific case not bound all */ + asc = (struct sctp_asconf_iterator *)ptr; + LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { + ifa = l->ifa; + if (l->action == SCTP_ADD_IP_ADDRESS) { + LIST_FOREACH(laddr, &inp->sctp_addr_list, + sctp_nxt_addr) { + if (laddr->ifa == ifa) { + laddr->action = 0; + break; + } + } + } else if (l->action == SCTP_DEL_IP_ADDRESS) { + laddr = LIST_FIRST(&inp->sctp_addr_list); + while (laddr) { + nladdr = LIST_NEXT(laddr, sctp_nxt_addr); + /* remove only after all guys are done */ + if (laddr->ifa == ifa) { + sctp_del_local_addr_ep(inp, ifa); + } + laddr = nladdr; + } + } + } + return (0); +} + +void +sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + void *ptr, uint32_t val) +{ + struct sctp_asconf_iterator *asc; + struct sctp_ifa *ifa; + struct sctp_laddr
*l; + int cnt_invalid = 0; + int type, status; + int num_queued = 0; + + asc = (struct sctp_asconf_iterator *)ptr; + LIST_FOREACH(l, &asc->list_of_work, sctp_nxt_addr) { + ifa = l->ifa; + type = l->action; + + /* address's vrf_id must be the vrf_id of the assoc */ + if (ifa->vrf_id != stcb->asoc.vrf_id) { + continue; + } + /* Same checks again for assoc */ + switch (ifa->address.sa.sa_family) { +#ifdef INET6 + case AF_INET6: + { + /* invalid if we're not a v6 endpoint */ + struct sockaddr_in6 *sin6; + + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return; + else + continue; + } + sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* we skip unspecified addresses */ + continue; + } + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + if (stcb->asoc.local_scope == 0) { + continue; + } + /* is it the right link local scope? */ + if (sctp_is_scopeid_in_nets(stcb, &ifa->address.sa) == 0) { + continue; + } + } + break; + } +#endif + case AF_INET: + { + /* invalid if we are a v6 only endpoint */ + struct in6pcb *inp6; + struct sockaddr_in *sin; + + inp6 = (struct in6pcb *)&inp->ip_inp.inp; + /* invalid if we are a v6 only endpoint */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp6)) + continue; + + sin = (struct sockaddr_in *)&ifa->address.sa; + if (sin->sin_addr.s_addr == 0) { + /* we skip unspecified addresses */ + continue; + } + if (stcb->asoc.ipv4_local_scope == 0 && + IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { + continue; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp6)) { + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return; + else + continue; + } + break; + } + default: + /* invalid address family */ + cnt_invalid++; + if (asc->cnt == cnt_invalid) + return; + else + continue; + break; + } + + if (type == SCTP_ADD_IP_ADDRESS) { + /* prevent this address from being used as a source */ + sctp_add_local_addr_restricted(stcb, ifa); + } else if (type == SCTP_DEL_IP_ADDRESS) { + struct sctp_nets *net; + + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sctp_rtentry_t *rt; + + /* delete this address if cached */ + if (net->ro._s_addr == ifa) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + rt = net->ro.ro_rt; + if (rt) { + RTFREE(rt); + net->ro.ro_rt = NULL; + } + /* + * Now we deleted our src address, + * should we not also now reset the + * cwnd/rto to start as if it's a new + * address? + */ + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); + net->RTO = 0; + + } + } + } else if (type == SCTP_SET_PRIM_ADDR) { + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { + /* must validate the ifa is in the ep */ + if (sctp_is_addr_in_ep(stcb->sctp_ep, ifa) == 0) { + continue; + } + } else { + /* Need to check scopes for this guy */ + if (sctp_is_address_in_scope(ifa, + stcb->asoc.ipv4_addr_legal, + stcb->asoc.ipv6_addr_legal, + stcb->asoc.loopback_scope, + stcb->asoc.ipv4_local_scope, + stcb->asoc.local_scope, + stcb->asoc.site_scope, 0) == 0) { + continue; + } + } + } + /* queue an asconf for this address add/delete */ + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF) && + stcb->asoc.peer_supports_asconf) { + /* queue an asconf for this addr */ + status = sctp_asconf_queue_add(stcb, ifa, type); + /* + * if queued ok, and in the open state, update the + * count of queued params. If in the non-open + * state, these get sent when the assoc goes open.
+ */ + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + if (status >= 0) { + num_queued++; + } + } + } + } + /* + * If we have queued params in the open state, send out an ASCONF. + */ + if (num_queued > 0) { + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + SCTP_ADDR_NOT_LOCKED); + } +} + +void +sctp_asconf_iterator_end(void *ptr, uint32_t val) +{ + struct sctp_asconf_iterator *asc; + struct sctp_ifa *ifa; + struct sctp_laddr *l, *l_next; + + asc = (struct sctp_asconf_iterator *)ptr; + l = LIST_FIRST(&asc->list_of_work); + while (l != NULL) { + l_next = LIST_NEXT(l, sctp_nxt_addr); + ifa = l->ifa; + if (l->action == SCTP_ADD_IP_ADDRESS) { + /* Clear the defer use flag */ + ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; + } + sctp_free_ifa(ifa); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), l); + SCTP_DECR_LADDR_COUNT(); + l = l_next; + } + SCTP_FREE(asc, SCTP_M_ASC_IT); +} + +/* + * sa is the sockaddr to ask the peer to set primary to. + * returns: 0 = completed, -1 = error + */ +int32_t +sctp_set_primary_ip_address_sa(struct sctp_tcb *stcb, struct sockaddr *sa) +{ + uint32_t vrf_id; + struct sctp_ifa *ifa; + + /* find the ifa for the desired set primary */ + vrf_id = stcb->asoc.vrf_id; + ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); + if (ifa == NULL) { + /* Invalid address */ + return (-1); + } + /* queue an ASCONF:SET_PRIM_ADDR to be sent */ + if (!sctp_asconf_queue_add(stcb, ifa, SCTP_SET_PRIM_ADDR)) { + /* set primary queuing succeeded */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "set_primary_ip_address_sa: queued on tcb=%p, ", + stcb); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { +#ifdef SCTP_TIMER_BASED_ASCONF + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, + stcb->sctp_ep, stcb, + stcb->asoc.primary_destination); +#else + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + SCTP_ADDR_NOT_LOCKED); +#endif + } + } else { + SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address_sa: failed to add to queue on tcb=%p, ", + stcb); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, sa); + return (-1); + } + return (0); +} + +void +sctp_set_primary_ip_address(struct sctp_ifa *ifa) +{ + struct sctp_inpcb *inp; + + /* go through all our PCB's */ + LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { + struct sctp_tcb *stcb; + + /* process for all associations for this endpoint */ + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + /* queue an ASCONF:SET_PRIM_ADDR to be sent */ + if (!sctp_asconf_queue_add(stcb, ifa, + SCTP_SET_PRIM_ADDR)) { + /* set primary queuing succeeded */ + SCTPDBG(SCTP_DEBUG_ASCONF1, "set_primary_ip_address: queued on stcb=%p, ", + stcb); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &ifa->address.sa); + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { +#ifdef SCTP_TIMER_BASED_ASCONF + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, + stcb->sctp_ep, stcb, + stcb->asoc.primary_destination); +#else + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + SCTP_ADDR_NOT_LOCKED); +#endif + } + } + } /* for each stcb */ + } /* for each inp */ +} + +int +sctp_is_addr_pending(struct sctp_tcb *stcb, struct sctp_ifa *sctp_ifa) +{ + struct sctp_tmit_chunk *chk, *nchk; + unsigned int offset, asconf_limit; + struct sctp_asconf_chunk *acp; + struct sctp_asconf_paramhdr *aph; + uint8_t aparam_buf[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_ipv6addr_param *p_addr; + int add_cnt, del_cnt; + uint16_t last_param_type; + + add_cnt = del_cnt = 0; + last_param_type = 0; + for (chk = TAILQ_FIRST(&stcb->asoc.asconf_send_queue); chk != 
NULL; + chk = nchk) { + /* get next chk */ + nchk = TAILQ_NEXT(chk, sctp_next); + + if (chk->data == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: No mbuf data?\n"); + continue; + } + offset = 0; + acp = mtod(chk->data, struct sctp_asconf_chunk *); + offset += sizeof(struct sctp_asconf_chunk); + asconf_limit = ntohs(acp->ch.chunk_length); + p_addr = (struct sctp_ipv6addr_param *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_paramhdr), aparam_buf); + if (p_addr == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get lookup addr!\n"); + continue; + } + offset += ntohs(p_addr->ph.param_length); + + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); + if (aph == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: Empty ASCONF will be sent?\n"); + continue; + } + while (aph != NULL) { + unsigned int param_length, param_type; + + param_type = ntohs(aph->ph.param_type); + param_length = ntohs(aph->ph.param_length); + if (offset + param_length > asconf_limit) { + /* parameter goes beyond end of chunk! */ + break; + } + if (param_length > sizeof(aparam_buf)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length (%u) larger than buffer size!\n", param_length); + break; + } + if (param_length <= sizeof(struct sctp_paramhdr)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: param length (%u) too short\n", param_length); + break; + } + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, param_length, aparam_buf); + if (aph == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "is_addr_pending: couldn't get entire param\n"); + break; + } + p_addr = (struct sctp_ipv6addr_param *)(aph + 1); + if (sctp_addr_match(p_addr, &sctp_ifa->address.sa) != 0) { + switch (param_type) { + case SCTP_ADD_IP_ADDRESS: + add_cnt++; + break; + case SCTP_DEL_IP_ADDRESS: + del_cnt++; + break; + default: + break; + } + last_param_type = param_type; + } + offset += SCTP_SIZE32(param_length); + if (offset >= asconf_limit) { + /* no more data in the mbuf chain */ + break; + } + /* get pointer to next asconf param */ + aph = (struct sctp_asconf_paramhdr *)sctp_m_getptr(chk->data, offset, sizeof(struct sctp_asconf_paramhdr), aparam_buf); + } + } + + /* + * we want to find the sequences which consist of ADD -> DEL -> ADD + * or DEL -> ADD + */ + if (add_cnt > del_cnt || + (add_cnt == del_cnt && last_param_type == SCTP_ADD_IP_ADDRESS)) { + return 1; + } + return 0; +} + +static struct sockaddr * +sctp_find_valid_localaddr(struct sctp_tcb *stcb, int addr_locked) +{ + struct sctp_vrf *vrf = NULL; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + + if (addr_locked == SCTP_ADDR_NOT_LOCKED) + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(stcb->asoc.vrf_id); + if (vrf == NULL) { + if (addr_locked == SCTP_ADDR_NOT_LOCKED) + SCTP_IPI_ADDR_RUNLOCK(); + return (NULL); + } + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if (stcb->asoc.loopback_scope == 0 && + SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + /* Skip if loopback_scope not set */ + continue; + } + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if (sctp_ifa->address.sa.sa_family == AF_INET && + stcb->asoc.ipv4_addr_legal) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + if (sin->sin_addr.s_addr == 0) { + /* skip unspecified addresses */ + continue; + } + if (stcb->asoc.ipv4_local_scope == 0 && + IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) + continue; + + if (sctp_is_addr_restricted(stcb, sctp_ifa) && +
(!sctp_is_addr_pending(stcb, sctp_ifa))) + continue; + /* found a valid local v4 address to use */ + if (addr_locked == SCTP_ADDR_NOT_LOCKED) + SCTP_IPI_ADDR_RUNLOCK(); + return (&sctp_ifa->address.sa); + } else if (sctp_ifa->address.sa.sa_family == AF_INET6 && + stcb->asoc.ipv6_addr_legal) { + struct sockaddr_in6 *sin6; + + if (sctp_ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + continue; + } + sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* we skip unspecified addresses */ + continue; + } + if (stcb->asoc.local_scope == 0 && + IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) + continue; + if (stcb->asoc.site_scope == 0 && + IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) + continue; + + if (sctp_is_addr_restricted(stcb, sctp_ifa) && + (!sctp_is_addr_pending(stcb, sctp_ifa))) + continue; + /* found a valid local v6 address to use */ + if (addr_locked == SCTP_ADDR_NOT_LOCKED) + SCTP_IPI_ADDR_RUNLOCK(); + return (&sctp_ifa->address.sa); + } + } + } + /* no valid addresses found */ + if (addr_locked == SCTP_ADDR_NOT_LOCKED) + SCTP_IPI_ADDR_RUNLOCK(); + return (NULL); +} + +static struct sockaddr * +sctp_find_valid_localaddr_ep(struct sctp_tcb *stcb) +{ + struct sctp_laddr *laddr; + + LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + continue; + } + /* is the address restricted? */ + if (sctp_is_addr_restricted(stcb, laddr->ifa) && + (!sctp_is_addr_pending(stcb, laddr->ifa))) + continue; + + /* found a valid local address to use */ + return (&laddr->ifa->address.sa); + } + /* no valid addresses found */ + return (NULL); +} + +/* + * builds an ASCONF chunk from queued ASCONF params. + * returns NULL on error (no mbuf, no ASCONF params queued, etc). + */ +struct mbuf * +sctp_compose_asconf(struct sctp_tcb *stcb, int *retlen, int addr_locked) +{ + struct mbuf *m_asconf, *m_asconf_chk; + struct sctp_asconf_addr *aa; + struct sctp_asconf_chunk *acp; + struct sctp_asconf_paramhdr *aph; + struct sctp_asconf_addr_param *aap; + uint32_t p_length; + uint32_t correlation_id = 1; /* 0 is reserved... */ + caddr_t ptr, lookup_ptr; + uint8_t lookup_used = 0; + + /* are there any asconf params to send?
*/ + TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { + if (aa->sent == 0) + break; + } + if (aa == NULL) + return (NULL); + + /* + * get a chunk header mbuf and a cluster for the asconf params since + * it's simpler to fill in the asconf chunk header lookup address on + * the fly + */ + m_asconf_chk = sctp_get_mbuf_for_msg(sizeof(struct sctp_asconf_chunk), 0, M_DONTWAIT, 1, MT_DATA); + if (m_asconf_chk == NULL) { + /* no mbuf's */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "compose_asconf: couldn't get chunk mbuf!\n"); + return (NULL); + } + m_asconf = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (m_asconf == NULL) { + /* no mbuf's */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "compose_asconf: couldn't get mbuf!\n"); + sctp_m_freem(m_asconf_chk); + return (NULL); + } + SCTP_BUF_LEN(m_asconf_chk) = sizeof(struct sctp_asconf_chunk); + SCTP_BUF_LEN(m_asconf) = 0; + acp = mtod(m_asconf_chk, struct sctp_asconf_chunk *); + bzero(acp, sizeof(struct sctp_asconf_chunk)); + /* save pointers to lookup address and asconf params */ + lookup_ptr = (caddr_t)(acp + 1); /* after the header */ + ptr = mtod(m_asconf, caddr_t); /* beginning of cluster */ + + /* fill in chunk header info */ + acp->ch.chunk_type = SCTP_ASCONF; + acp->ch.chunk_flags = 0; + acp->serial_number = htonl(stcb->asoc.asconf_seq_out); + stcb->asoc.asconf_seq_out++; + + /* add parameters... up to smallest MTU allowed */ + TAILQ_FOREACH(aa, &stcb->asoc.asconf_queue, next) { + if (aa->sent) + continue; + /* get the parameter length */ + p_length = SCTP_SIZE32(aa->ap.aph.ph.param_length); + /* will it fit in current chunk? */ + if (SCTP_BUF_LEN(m_asconf) + p_length > stcb->asoc.smallest_mtu) { + /* won't fit, so we're done with this chunk */ + break; + } + /* assign (and store) a correlation id */ + aa->ap.aph.correlation_id = correlation_id++; + + /* + * fill in address if we're doing a delete this is a simple + * way for us to fill in the correlation address, which + * should only be used by the peer if we're deleting our + * source address and adding a new address (e.g. 
renumbering + * case) + */ + if (lookup_used == 0 && + (aa->special_del == 0) && + aa->ap.aph.ph.param_type == SCTP_DEL_IP_ADDRESS) { + struct sctp_ipv6addr_param *lookup; + uint16_t p_size, addr_size; + + lookup = (struct sctp_ipv6addr_param *)lookup_ptr; + lookup->ph.param_type = + htons(aa->ap.addrp.ph.param_type); + if (aa->ap.addrp.ph.param_type == SCTP_IPV6_ADDRESS) { + /* copy IPv6 address */ + p_size = sizeof(struct sctp_ipv6addr_param); + addr_size = sizeof(struct in6_addr); + } else { + /* copy IPv4 address */ + p_size = sizeof(struct sctp_ipv4addr_param); + addr_size = sizeof(struct in_addr); + } + lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); + memcpy(lookup->addr, &aa->ap.addrp.addr, addr_size); + SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); + lookup_used = 1; + } + /* copy into current space */ + memcpy(ptr, &aa->ap, p_length); + + /* network elements and update lengths */ + aph = (struct sctp_asconf_paramhdr *)ptr; + aap = (struct sctp_asconf_addr_param *)ptr; + /* correlation_id is transparent to peer, no htonl needed */ + aph->ph.param_type = htons(aph->ph.param_type); + aph->ph.param_length = htons(aph->ph.param_length); + aap->addrp.ph.param_type = htons(aap->addrp.ph.param_type); + aap->addrp.ph.param_length = htons(aap->addrp.ph.param_length); + + SCTP_BUF_LEN(m_asconf) += SCTP_SIZE32(p_length); + ptr += SCTP_SIZE32(p_length); + + /* + * these params are removed off the pending list upon + * getting an ASCONF-ACK back from the peer, just set flag + */ + aa->sent = 1; + } + /* check to see if the lookup addr has been populated yet */ + if (lookup_used == 0) { + /* NOTE: if the address param is optional, can skip this... */ + /* add any valid (existing) address... */ + struct sctp_ipv6addr_param *lookup; + uint16_t p_size, addr_size; + struct sockaddr *found_addr; + caddr_t addr_ptr; + + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) + found_addr = sctp_find_valid_localaddr(stcb, + addr_locked); + else + found_addr = sctp_find_valid_localaddr_ep(stcb); + + lookup = (struct sctp_ipv6addr_param *)lookup_ptr; + if (found_addr != NULL) { + if (found_addr->sa_family == AF_INET6) { + /* copy IPv6 address */ + lookup->ph.param_type = + htons(SCTP_IPV6_ADDRESS); + p_size = sizeof(struct sctp_ipv6addr_param); + addr_size = sizeof(struct in6_addr); + addr_ptr = (caddr_t)&((struct sockaddr_in6 *) + found_addr)->sin6_addr; + } else { + /* copy IPv4 address */ + lookup->ph.param_type = + htons(SCTP_IPV4_ADDRESS); + p_size = sizeof(struct sctp_ipv4addr_param); + addr_size = sizeof(struct in_addr); + addr_ptr = (caddr_t)&((struct sockaddr_in *) + found_addr)->sin_addr; + } + lookup->ph.param_length = htons(SCTP_SIZE32(p_size)); + memcpy(lookup->addr, addr_ptr, addr_size); + SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(p_size); + lookup_used = 1; + } else { + /* uh oh... don't have any address?? 
*/ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "compose_asconf: no lookup addr!\n"); + /* for now, we send a IPv4 address of 0.0.0.0 */ + lookup->ph.param_type = htons(SCTP_IPV4_ADDRESS); + lookup->ph.param_length = htons(SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param))); + bzero(lookup->addr, sizeof(struct in_addr)); + SCTP_BUF_LEN(m_asconf_chk) += SCTP_SIZE32(sizeof(struct sctp_ipv4addr_param)); + lookup_used = 1; + } + } + /* chain it all together */ + SCTP_BUF_NEXT(m_asconf_chk) = m_asconf; + *retlen = SCTP_BUF_LEN(m_asconf_chk) + SCTP_BUF_LEN(m_asconf); + acp->ch.chunk_length = ntohs(*retlen); + + return (m_asconf_chk); +} + +/* + * section to handle address changes before an association is up eg. changes + * during INIT/INIT-ACK/COOKIE-ECHO handshake + */ + +/* + * processes the (local) addresses in the INIT-ACK chunk + */ +static void +sctp_process_initack_addresses(struct sctp_tcb *stcb, struct mbuf *m, + unsigned int offset, unsigned int length) +{ + struct sctp_paramhdr tmp_param, *ph; + uint16_t plen, ptype; + struct sctp_ifa *sctp_ifa; + struct sctp_ipv6addr_param addr_store; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + struct sockaddr *sa; + uint32_t vrf_id; + + SCTPDBG(SCTP_DEBUG_ASCONF2, "processing init-ack addresses\n"); + if (stcb == NULL) /* Un-needed check for SA */ + return; + + /* convert to upper bound */ + length += offset; + + if ((offset + sizeof(struct sctp_paramhdr)) > length) { + return; + } + /* init the addresses */ + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_port = stcb->rport; + + bzero(&sin, sizeof(sin)); + sin.sin_len = sizeof(sin); + sin.sin_family = AF_INET; + sin.sin_port = stcb->rport; + + /* go through the addresses in the init-ack */ + ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); + while (ph != NULL) { + ptype = ntohs(ph->param_type); + plen = ntohs(ph->param_length); + if (ptype == SCTP_IPV6_ADDRESS) { + struct sctp_ipv6addr_param *a6p; + + /* get the entire IPv6 address param */ + a6p = (struct sctp_ipv6addr_param *) + sctp_m_getptr(m, offset, + sizeof(struct sctp_ipv6addr_param), + (uint8_t *) & addr_store); + if (plen != sizeof(struct sctp_ipv6addr_param) || + a6p == NULL) { + return; + } + memcpy(&sin6.sin6_addr, a6p->addr, + sizeof(struct in6_addr)); + sa = (struct sockaddr *)&sin6; + } else if (ptype == SCTP_IPV4_ADDRESS) { + struct sctp_ipv4addr_param *a4p; + + /* get the entire IPv4 address param */ + a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, offset, + sizeof(struct sctp_ipv4addr_param), + (uint8_t *) & addr_store); + if (plen != sizeof(struct sctp_ipv4addr_param) || + a4p == NULL) { + return; + } + sin.sin_addr.s_addr = a4p->addr; + sa = (struct sockaddr *)&sin; + } else { + goto next_addr; + } + + /* see if this address really (still) exists */ + if (stcb) { + vrf_id = stcb->asoc.vrf_id; + } else { + vrf_id = SCTP_DEFAULT_VRFID; + } + sctp_ifa = sctp_find_ifa_by_addr(sa, vrf_id, + SCTP_ADDR_NOT_LOCKED); + if (sctp_ifa == NULL) { + /* address doesn't exist anymore */ + int status; + + /* are ASCONFs allowed ? */ + if ((sctp_is_feature_on(stcb->sctp_ep, + SCTP_PCB_FLAGS_DO_ASCONF)) && + stcb->asoc.peer_supports_asconf) { + /* queue an ASCONF DEL_IP_ADDRESS */ + status = sctp_asconf_queue_sa_delete(stcb, sa); + /* + * if queued ok, and in correct state, send + * out the ASCONF. 
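+ * (Depending on SCTP_TIMER_BASED_ASCONF below, the send is + * either deferred until the ASCONF timer fires or issued + * immediately.)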
+ */ + if (status == 0 && + SCTP_GET_STATE(&stcb->asoc) == + SCTP_STATE_OPEN) { +#ifdef SCTP_TIMER_BASED_ASCONF + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, + stcb->sctp_ep, stcb, + stcb->asoc.primary_destination); +#else + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + SCTP_ADDR_NOT_LOCKED); +#endif + } + } + } +next_addr: + /* + * Sanity check: Make sure the length isn't 0, otherwise + * we'll be stuck in this loop for a long time... + */ + if (SCTP_SIZE32(plen) == 0) { + SCTP_PRINTF("process_initack_addrs: bad len (%d) type=%xh\n", + plen, ptype); + return; + } + /* get next parameter */ + offset += SCTP_SIZE32(plen); + if ((offset + sizeof(struct sctp_paramhdr)) > length) + return; + ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); + } /* while */ +} + +/* FIX ME: need to verify return result for v6 address type if v6 disabled */ +/* + * checks to see if a specific address is in the initack address list returns + * 1 if found, 0 if not + */ +static uint32_t +sctp_addr_in_initack(struct sctp_tcb *stcb, struct mbuf *m, uint32_t offset, + uint32_t length, struct sockaddr *sa) +{ + struct sctp_paramhdr tmp_param, *ph; + uint16_t plen, ptype; + struct sctp_ipv6addr_param addr_store; + struct sockaddr_in *sin; + struct sctp_ipv4addr_param *a4p; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + struct sctp_ipv6addr_param *a6p; + struct sockaddr_in6 sin6_tmp; + +#endif /* INET6 */ + + if ( +#ifdef INET6 + (sa->sa_family != AF_INET6) && +#endif /* INET6 */ + (sa->sa_family != AF_INET)) + return (0); + + SCTPDBG(SCTP_DEBUG_ASCONF2, "find_initack_addr: starting search for "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF2, sa); + /* convert to upper bound */ + length += offset; + + if ((offset + sizeof(struct sctp_paramhdr)) > length) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "find_initack_addr: invalid offset?\n"); + return (0); + } + /* go through the addresses in the init-ack */ + ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); + while (ph != NULL) { + ptype = ntohs(ph->param_type); + plen = ntohs(ph->param_length); +#ifdef INET6 + if (ptype == SCTP_IPV6_ADDRESS && sa->sa_family == AF_INET6) { + /* get the entire IPv6 address param */ + a6p = (struct sctp_ipv6addr_param *) + sctp_m_getptr(m, offset, + sizeof(struct sctp_ipv6addr_param), + (uint8_t *) & addr_store); + if (plen != sizeof(struct sctp_ipv6addr_param) || + (ph == NULL) || + (a6p == NULL)) { + return (0); + } + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { + /* create a copy and clear scope */ + memcpy(&sin6_tmp, sin6, + sizeof(struct sockaddr_in6)); + sin6 = &sin6_tmp; + in6_clearscope(&sin6->sin6_addr); + } + if (memcmp(&sin6->sin6_addr, a6p->addr, + sizeof(struct in6_addr)) == 0) { + /* found it */ + return (1); + } + } else +#endif /* INET6 */ + + if (ptype == SCTP_IPV4_ADDRESS && + sa->sa_family == AF_INET) { + /* get the entire IPv4 address param */ + a4p = (struct sctp_ipv4addr_param *)sctp_m_getptr(m, + offset, sizeof(struct sctp_ipv4addr_param), + (uint8_t *) & addr_store); + if (plen != sizeof(struct sctp_ipv4addr_param) || + (ph == NULL) || + (a4p == NULL)) { + return (0); + } + sin = (struct sockaddr_in *)sa; + if (sin->sin_addr.s_addr == a4p->addr) { + /* found it */ + return (1); + } + } + /* get next parameter */ + offset += SCTP_SIZE32(plen); + if (offset + sizeof(struct sctp_paramhdr) > length) + return (0); + ph = (struct sctp_paramhdr *) + sctp_m_getptr(m, offset, 
sizeof(struct sctp_paramhdr), + (uint8_t *) & tmp_param); + } /* while */ + /* not found! */ + return (0); +} + +/* + * makes sure that the current endpoint local addr list is consistent with + * the new association (eg. subset bound, asconf allowed) adds addresses as + * necessary + */ +static void +sctp_check_address_list_ep(struct sctp_tcb *stcb, struct mbuf *m, int offset, + int length, struct sockaddr *init_addr) +{ + struct sctp_laddr *laddr; + + /* go through the endpoint list */ + LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { + /* be paranoid and validate the laddr */ + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, + "check_addr_list_ep: laddr->ifa is NULL"); + continue; + } + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "check_addr_list_ep: laddr->ifa->ifa_addr is NULL"); + continue; + } + /* do i have it implicitly? */ + if (sctp_cmpaddr(&laddr->ifa->address.sa, init_addr)) { + continue; + } + /* check to see if in the init-ack */ + if (!sctp_addr_in_initack(stcb, m, offset, length, + &laddr->ifa->address.sa)) { + /* try to add it */ + sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, laddr->ifa, + SCTP_ADD_IP_ADDRESS, SCTP_ADDR_NOT_LOCKED); + } + } +} + +/* + * makes sure that the current kernel address list is consistent with the new + * association (with all addrs bound) adds addresses as necessary + */ +static void +sctp_check_address_list_all(struct sctp_tcb *stcb, struct mbuf *m, int offset, + int length, struct sockaddr *init_addr, + uint16_t local_scope, uint16_t site_scope, + uint16_t ipv4_scope, uint16_t loopback_scope) +{ + struct sctp_vrf *vrf = NULL; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + uint32_t vrf_id; + + if (stcb) { + vrf_id = stcb->asoc.vrf_id; + } else { + return; + } + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + SCTP_IPI_ADDR_RUNLOCK(); + return; + } + /* go through all our known interfaces */ + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if (loopback_scope == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + /* skip loopback interface */ + continue; + } + /* go through each interface address */ + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + /* do i have it implicitly? 
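+ * (i.e. is this the address the INIT-ACK was sent from? The + * peer already knows that address implicitly, so it never + * needs an ASCONF add.)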
*/ + if (sctp_cmpaddr(&sctp_ifa->address.sa, init_addr)) { + continue; + } + /* check to see if in the init-ack */ + if (!sctp_addr_in_initack(stcb, m, offset, length, + &sctp_ifa->address.sa)) { + /* try to add it */ + sctp_addr_mgmt_assoc(stcb->sctp_ep, stcb, + sctp_ifa, SCTP_ADD_IP_ADDRESS, + SCTP_ADDR_LOCKED); + } + } /* end foreach ifa */ + } /* end foreach ifn */ + SCTP_IPI_ADDR_RUNLOCK(); +} + +/* + * validates an init-ack chunk (from a cookie-echo) with current addresses + * adds addresses from the init-ack into our local address list, if needed + * queues asconf adds/deletes addresses as needed and makes appropriate list + * changes for source address selection m, offset: points to the start of the + * address list in an init-ack chunk length: total length of the address + * params only init_addr: address where my INIT-ACK was sent from + */ +void +sctp_check_address_list(struct sctp_tcb *stcb, struct mbuf *m, int offset, + int length, struct sockaddr *init_addr, + uint16_t local_scope, uint16_t site_scope, + uint16_t ipv4_scope, uint16_t loopback_scope) +{ + /* process the local addresses in the initack */ + sctp_process_initack_addresses(stcb, m, offset, length); + + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* bound all case */ + sctp_check_address_list_all(stcb, m, offset, length, init_addr, + local_scope, site_scope, ipv4_scope, loopback_scope); + } else { + /* subset bound case */ + if (sctp_is_feature_on(stcb->sctp_ep, + SCTP_PCB_FLAGS_DO_ASCONF)) { + /* asconf's allowed */ + sctp_check_address_list_ep(stcb, m, offset, length, + init_addr); + } + /* else, no asconfs allowed, so what we sent is what we get */ + } +} + +/* + * sctp_bindx() support + */ +uint32_t +sctp_addr_mgmt_ep_sa(struct sctp_inpcb *inp, struct sockaddr *sa, + uint32_t type, uint32_t vrf_id, struct sctp_ifa *sctp_ifap) +{ + struct sctp_ifa *ifa; + + if (sa->sa_len == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); + return (EINVAL); + } + if (sctp_ifap) { + ifa = sctp_ifap; + } else if (type == SCTP_ADD_IP_ADDRESS) { + /* For an add the address MUST be on the system */ + ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); + } else if (type == SCTP_DEL_IP_ADDRESS) { + /* For a delete we need to find it in the inp */ + ifa = sctp_find_ifa_in_ep(inp, sa, SCTP_ADDR_NOT_LOCKED); + } else { + ifa = NULL; + } + if (ifa != NULL) { + if (type == SCTP_ADD_IP_ADDRESS) { + sctp_add_local_addr_ep(inp, ifa, type); + } else if (type == SCTP_DEL_IP_ADDRESS) { + struct sctp_laddr *laddr; + + if (inp->laddr_count < 2) { + /* can't delete the last local address */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EINVAL); + return (EINVAL); + } + LIST_FOREACH(laddr, &inp->sctp_addr_list, + sctp_nxt_addr) { + if (ifa == laddr->ifa) { + /* Mark in the delete */ + laddr->action = type; + } + } + } + if (!LIST_EMPTY(&inp->sctp_asoc_list)) { + /* + * There is no need to start the iterator if the inp + * has no associations. 
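+ * (The iterator started below revisits every association on + * this endpoint to apply the queued add/delete, so starting + * it for an empty association list would be pure overhead.)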
+ */ + struct sctp_asconf_iterator *asc; + struct sctp_laddr *wi; + + SCTP_MALLOC(asc, struct sctp_asconf_iterator *, + sizeof(struct sctp_asconf_iterator), + SCTP_M_ASC_IT); + if (asc == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); + return (ENOMEM); + } + wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (wi == NULL) { + SCTP_FREE(asc, SCTP_M_ASC_IT); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_ASCONF, ENOMEM); + return (ENOMEM); + } + LIST_INIT(&asc->list_of_work); + asc->cnt = 1; + SCTP_INCR_LADDR_COUNT(); + wi->ifa = ifa; + wi->action = type; + atomic_add_int(&ifa->refcount, 1); + LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); + (void)sctp_initiate_iterator(sctp_asconf_iterator_ep, + sctp_asconf_iterator_stcb, + sctp_asconf_iterator_ep_end, + SCTP_PCB_ANY_FLAGS, + SCTP_PCB_ANY_FEATURES, + SCTP_ASOC_ANY_STATE, + (void *)asc, 0, + sctp_asconf_iterator_end, inp, 0); + } + return (0); + } else { + /* invalid address! */ + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_ASCONF, EADDRNOTAVAIL); + return (EADDRNOTAVAIL); + } +} + +void +sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + struct sctp_asconf_addr *aa; + struct sctp_ifa *sctp_ifap; + struct sctp_asconf_tag_param *vtag; + struct sockaddr_in *to; + +#ifdef INET6 + struct sockaddr_in6 *to6; + +#endif + if (net == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing net\n"); + return; + } + if (stcb == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "sctp_asconf_send_nat_state_update: Missing stcb\n"); + return; + } + /* + * Need to have in the asconf: - vtagparam(my_vtag/peer_vtag) - + * add(0.0.0.0) - del(0.0.0.0) - Any global addresses add(addr) + */ + SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), + SCTP_M_ASC_ADDR); + if (aa == NULL) { + /* didn't get memory */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_send_nat_state_update: failed to get memory!\n"); + return; + } + aa->special_del = 0; + /* fill in asconf address parameter fields */ + /* top level elements are "networked" during send */ + aa->ifa = NULL; + aa->sent = 0; /* clear sent flag */ + vtag = (struct sctp_asconf_tag_param *)&aa->ap.aph; + vtag->aph.ph.param_type = SCTP_NAT_VTAGS; + vtag->aph.ph.param_length = sizeof(struct sctp_asconf_tag_param); + vtag->local_vtag = htonl(stcb->asoc.my_vtag); + vtag->remote_vtag = htonl(stcb->asoc.peer_vtag); + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + + SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), + SCTP_M_ASC_ADDR); + if (aa == NULL) { + /* didn't get memory */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_send_nat_state_update: failed to get memory!\n"); + return; + } + memset(aa, 0, sizeof(struct sctp_asconf_addr)); + /* fill in asconf address parameter fields */ + /* ADD(0.0.0.0) */ + if (net->ro._l_addr.sa.sa_family == AF_INET) { + aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); + aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; + aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); + /* No need to add an address, we are using 0.0.0.0 */ + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + } +#ifdef INET6 + else if (net->ro._l_addr.sa.sa_family == AF_INET6) { + aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); + aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; + aa->ap.addrp.ph.param_length = 
sizeof(struct sctp_ipv6addr_param); + /* No need to add an address, we are using 0.0.0.0 */ + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + } +#endif /* INET6 */ + SCTP_MALLOC(aa, struct sctp_asconf_addr *, sizeof(*aa), + SCTP_M_ASC_ADDR); + if (aa == NULL) { + /* didn't get memory */ + SCTPDBG(SCTP_DEBUG_ASCONF1, + "sctp_asconf_send_nat_state_update: failed to get memory!\n"); + return; + } + memset(aa, 0, sizeof(struct sctp_asconf_addr)); + /* fill in asconf address parameter fields */ + /* ADD(0.0.0.0) */ + if (net->ro._l_addr.sa.sa_family == AF_INET) { + aa->ap.aph.ph.param_type = SCTP_ADD_IP_ADDRESS; + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addrv4_param); + aa->ap.addrp.ph.param_type = SCTP_IPV4_ADDRESS; + aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv4addr_param); + /* No need to add an address, we are using 0.0.0.0 */ + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + } +#ifdef INET6 + else if (net->ro._l_addr.sa.sa_family == AF_INET6) { + aa->ap.aph.ph.param_type = SCTP_DEL_IP_ADDRESS; + aa->ap.aph.ph.param_length = sizeof(struct sctp_asconf_addr_param); + aa->ap.addrp.ph.param_type = SCTP_IPV6_ADDRESS; + aa->ap.addrp.ph.param_length = sizeof(struct sctp_ipv6addr_param); + /* No need to add an address, we are using 0.0.0.0 */ + TAILQ_INSERT_TAIL(&stcb->asoc.asconf_queue, aa, next); + } +#endif /* INET6 */ + /* Now we must hunt the addresses and add all global addresses */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + struct sctp_vrf *vrf = NULL; + struct sctp_ifn *sctp_ifnp; + uint32_t vrf_id; + + vrf_id = stcb->sctp_ep->def_vrf_id; + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + goto skip_rest; + } + SCTP_IPI_ADDR_RLOCK(); + LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { + LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { + if (sctp_ifap->address.sa.sa_family == AF_INET) { + to = &sctp_ifap->address.sin; + + if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { + continue; + } + if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { + continue; + } + } +#ifdef INET6 + else if (sctp_ifap->address.sa.sa_family == AF_INET6) { + to6 = &sctp_ifap->address.sin6; + if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { + continue; + } + if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { + continue; + } + } +#endif + sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); + } + } + SCTP_IPI_ADDR_RUNLOCK(); + } else { + struct sctp_laddr *laddr; + + LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + continue; + } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) + /* + * Address being deleted by the system, dont + * list. + */ + continue; + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* + * Address being deleted on this ep don't + * list. 
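+ * (a SCTP_DEL_IP_ADDRESS action is still pending on this + * laddr, so advertising the address to the peer again would + * contradict the delete in progress)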
+ */ + continue; + } + sctp_ifap = laddr->ifa; + if (sctp_ifap->address.sa.sa_family == AF_INET) { + to = &sctp_ifap->address.sin; + + if (IN4_ISPRIVATE_ADDRESS(&to->sin_addr)) { + continue; + } + if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { + continue; + } + } +#ifdef INET6 + else if (sctp_ifap->address.sa.sa_family == AF_INET6) { + to6 = &sctp_ifap->address.sin6; + if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr)) { + continue; + } + if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { + continue; + } + } +#endif + sctp_asconf_queue_mgmt(stcb, sctp_ifap, SCTP_ADD_IP_ADDRESS); + } + } +skip_rest: + /* Now we must send the asconf into the queue */ + sctp_send_asconf(stcb, net, 0); +} diff --git a/freebsd/sys/netinet/sctp_asconf.h b/freebsd/sys/netinet/sctp_asconf.h new file mode 100644 index 00000000..ff8cf378 --- /dev/null +++ b/freebsd/sys/netinet/sctp_asconf.h @@ -0,0 +1,96 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_asconf.h,v 1.8 2005/03/06 16:04:16 itojun Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef _NETINET_SCTP_ASCONF_HH_ +#define _NETINET_SCTP_ASCONF_HH_ + +#if defined(_KERNEL) || defined(__Userspace__) + +/* + * function prototypes + */ +extern void sctp_asconf_cleanup(struct sctp_tcb *, struct sctp_nets *); + +extern struct mbuf *sctp_compose_asconf(struct sctp_tcb *, int *, int); + +extern void +sctp_handle_asconf(struct mbuf *, unsigned int, struct sctp_asconf_chunk *, + struct sctp_tcb *, int i); + +extern void +sctp_handle_asconf_ack(struct mbuf *, int, struct sctp_asconf_ack_chunk *, + struct sctp_tcb *, struct sctp_nets *, int *); + +extern uint32_t +sctp_addr_mgmt_ep_sa(struct sctp_inpcb *, struct sockaddr *, + uint32_t, uint32_t, struct sctp_ifa *); + + +extern int +sctp_asconf_iterator_ep(struct sctp_inpcb *inp, void *ptr, + uint32_t val); +extern void +sctp_asconf_iterator_stcb(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + void *ptr, uint32_t type); +extern void sctp_asconf_iterator_end(void *ptr, uint32_t val); + + +extern int32_t +sctp_set_primary_ip_address_sa(struct sctp_tcb *, + struct sockaddr *); + +extern void + sctp_set_primary_ip_address(struct sctp_ifa *ifa); + +extern void +sctp_check_address_list(struct sctp_tcb *, struct mbuf *, int, int, + struct sockaddr *, uint16_t, uint16_t, uint16_t, uint16_t); + +extern void + sctp_assoc_immediate_retrans(struct sctp_tcb *, struct sctp_nets *); +extern void + sctp_net_immediate_retrans(struct sctp_tcb *, struct sctp_nets *); + +extern void +sctp_asconf_send_nat_state_update(struct sctp_tcb *stcb, + struct sctp_nets *net); + +extern int + sctp_is_addr_pending(struct sctp_tcb *, struct sctp_ifa *); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_SCTP_ASCONF_HH_ */ diff --git a/freebsd/sys/netinet/sctp_auth.c b/freebsd/sys/netinet/sctp_auth.c new file mode 100644 index 00000000..6c2bf908 --- /dev/null +++ b/freebsd/sys/netinet/sctp_auth.c @@ -0,0 +1,2128 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef SCTP_DEBUG +#define SCTP_AUTH_DEBUG (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH1) +#define SCTP_AUTH_DEBUG2 (SCTP_BASE_SYSCTL(sctp_debug_on) & SCTP_DEBUG_AUTH2) +#endif /* SCTP_DEBUG */ + + +void +sctp_clear_chunklist(sctp_auth_chklist_t * chklist) +{ + bzero(chklist, sizeof(*chklist)); + /* chklist->num_chunks = 0; */ +} + +sctp_auth_chklist_t * +sctp_alloc_chunklist(void) +{ + sctp_auth_chklist_t *chklist; + + SCTP_MALLOC(chklist, sctp_auth_chklist_t *, sizeof(*chklist), + SCTP_M_AUTH_CL); + if (chklist == NULL) { + SCTPDBG(SCTP_DEBUG_AUTH1, "sctp_alloc_chunklist: failed to get memory!\n"); + } else { + sctp_clear_chunklist(chklist); + } + return (chklist); +} + +void +sctp_free_chunklist(sctp_auth_chklist_t * list) +{ + if (list != NULL) + SCTP_FREE(list, SCTP_M_AUTH_CL); +} + +sctp_auth_chklist_t * +sctp_copy_chunklist(sctp_auth_chklist_t * list) +{ + sctp_auth_chklist_t *new_list; + + if (list == NULL) + return (NULL); + + /* get a new list */ + new_list = sctp_alloc_chunklist(); + if (new_list == NULL) + return (NULL); + /* copy it */ + bcopy(list, new_list, sizeof(*new_list)); + + return (new_list); +} + + +/* + * add a chunk to the required chunks list + */ +int +sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list) +{ + if (list == NULL) + return (-1); + + /* is chunk restricted? */ + if ((chunk == SCTP_INITIATION) || + (chunk == SCTP_INITIATION_ACK) || + (chunk == SCTP_SHUTDOWN_COMPLETE) || + (chunk == SCTP_AUTHENTICATION)) { + return (-1); + } + if (list->chunks[chunk] == 0) { + list->chunks[chunk] = 1; + list->num_chunks++; + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: added chunk %u (0x%02x) to Auth list\n", + chunk, chunk); + } + return (0); +} + +/* + * delete a chunk from the required chunks list + */ +int +sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list) +{ + if (list == NULL) + return (-1); + + /* is chunk restricted? 
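+ * (ASCONF and ASCONF-ACK may never be removed from the list; + * RFC 5061 requires that they always be authenticated when + * the ASCONF extension is in use.)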
*/ + if ((chunk == SCTP_ASCONF) || + (chunk == SCTP_ASCONF_ACK)) { + return (-1); + } + if (list->chunks[chunk] == 1) { + list->chunks[chunk] = 0; + list->num_chunks--; + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: deleted chunk %u (0x%02x) from Auth list\n", + chunk, chunk); + } + return (0); +} + +size_t +sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list) +{ + if (list == NULL) + return (0); + else + return (list->num_chunks); +} + +/* + * set the default list of chunks requiring AUTH + */ +void +sctp_auth_set_default_chunks(sctp_auth_chklist_t * list) +{ + (void)sctp_auth_add_chunk(SCTP_ASCONF, list); + (void)sctp_auth_add_chunk(SCTP_ASCONF_ACK, list); +} + +/* + * return the current number and list of required chunks caller must + * guarantee ptr has space for up to 256 bytes + */ +int +sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr) +{ + int i, count = 0; + + if (list == NULL) + return (0); + + for (i = 0; i < 256; i++) { + if (list->chunks[i] != 0) { + *ptr++ = i; + count++; + } + } + return (count); +} + +int +sctp_pack_auth_chunks(const sctp_auth_chklist_t * list, uint8_t * ptr) +{ + int i, size = 0; + + if (list == NULL) + return (0); + + if (list->num_chunks <= 32) { + /* just list them, one byte each */ + for (i = 0; i < 256; i++) { + if (list->chunks[i] != 0) { + *ptr++ = i; + size++; + } + } + } else { + int index, offset; + + /* pack into a 32 byte bitfield */ + for (i = 0; i < 256; i++) { + if (list->chunks[i] != 0) { + index = i / 8; + offset = i % 8; + ptr[index] |= (1 << offset); + } + } + size = 32; + } + return (size); +} + +int +sctp_unpack_auth_chunks(const uint8_t * ptr, uint8_t num_chunks, + sctp_auth_chklist_t * list) +{ + int i; + int size; + + if (list == NULL) + return (0); + + if (num_chunks <= 32) { + /* just pull them, one byte each */ + for (i = 0; i < num_chunks; i++) { + (void)sctp_auth_add_chunk(*ptr++, list); + } + size = num_chunks; + } else { + int index, offset; + + /* unpack from a 32 byte bitfield */ + for (index = 0; index < 32; index++) { + for (offset = 0; offset < 8; offset++) { + if (ptr[index] & (1 << offset)) { + (void)sctp_auth_add_chunk((index * 8) + offset, list); + } + } + } + size = 32; + } + return (size); +} + + +/* + * allocate structure space for a key of length keylen + */ +sctp_key_t * +sctp_alloc_key(uint32_t keylen) +{ + sctp_key_t *new_key; + + SCTP_MALLOC(new_key, sctp_key_t *, sizeof(*new_key) + keylen, + SCTP_M_AUTH_KY); + if (new_key == NULL) { + /* out of memory */ + return (NULL); + } + new_key->keylen = keylen; + return (new_key); +} + +void +sctp_free_key(sctp_key_t * key) +{ + if (key != NULL) + SCTP_FREE(key, SCTP_M_AUTH_KY); +} + +void +sctp_print_key(sctp_key_t * key, const char *str) +{ + uint32_t i; + + if (key == NULL) { + printf("%s: [Null key]\n", str); + return; + } + printf("%s: len %u, ", str, key->keylen); + if (key->keylen) { + for (i = 0; i < key->keylen; i++) + printf("%02x", key->key[i]); + printf("\n"); + } else { + printf("[Null key]\n"); + } +} + +void +sctp_show_key(sctp_key_t * key, const char *str) +{ + uint32_t i; + + if (key == NULL) { + printf("%s: [Null key]\n", str); + return; + } + printf("%s: len %u, ", str, key->keylen); + if (key->keylen) { + for (i = 0; i < key->keylen; i++) + printf("%02x", key->key[i]); + printf("\n"); + } else { + printf("[Null key]\n"); + } +} + +static uint32_t +sctp_get_keylen(sctp_key_t * key) +{ + if (key != NULL) + return (key->keylen); + else + return (0); +} + +/* + * generate a new random key of length 'keylen' + */ +sctp_key_t * 
+sctp_generate_random_key(uint32_t keylen) +{ + sctp_key_t *new_key; + + /* validate keylen */ + if (keylen > SCTP_AUTH_RANDOM_SIZE_MAX) + keylen = SCTP_AUTH_RANDOM_SIZE_MAX; + + new_key = sctp_alloc_key(keylen); + if (new_key == NULL) { + /* out of memory */ + return (NULL); + } + SCTP_READ_RANDOM(new_key->key, keylen); + new_key->keylen = keylen; + return (new_key); +} + +sctp_key_t * +sctp_set_key(uint8_t * key, uint32_t keylen) +{ + sctp_key_t *new_key; + + new_key = sctp_alloc_key(keylen); + if (new_key == NULL) { + /* out of memory */ + return (NULL); + } + bcopy(key, new_key->key, keylen); + return (new_key); +} + +/*- + * given two keys of variable size, compute which key is "larger/smaller" + * returns: 1 if key1 > key2 + * -1 if key1 < key2 + * 0 if key1 = key2 + */ +static int +sctp_compare_key(sctp_key_t * key1, sctp_key_t * key2) +{ + uint32_t maxlen; + uint32_t i; + uint32_t key1len, key2len; + uint8_t *key_1, *key_2; + uint8_t temp[SCTP_AUTH_RANDOM_SIZE_MAX]; + + /* sanity/length check */ + key1len = sctp_get_keylen(key1); + key2len = sctp_get_keylen(key2); + if ((key1len == 0) && (key2len == 0)) + return (0); + else if (key1len == 0) + return (-1); + else if (key2len == 0) + return (1); + + if (key1len != key2len) { + if (key1len >= key2len) + maxlen = key1len; + else + maxlen = key2len; + bzero(temp, maxlen); + if (key1len < maxlen) { + /* prepend zeroes to key1 */ + bcopy(key1->key, temp + (maxlen - key1len), key1len); + key_1 = temp; + key_2 = key2->key; + } else { + /* prepend zeroes to key2 */ + bcopy(key2->key, temp + (maxlen - key2len), key2len); + key_1 = key1->key; + key_2 = temp; + } + } else { + maxlen = key1len; + key_1 = key1->key; + key_2 = key2->key; + } + + for (i = 0; i < maxlen; i++) { + if (*key_1 > *key_2) + return (1); + else if (*key_1 < *key_2) + return (-1); + key_1++; + key_2++; + } + + /* keys are equal value, so check lengths */ + if (key1len == key2len) + return (0); + else if (key1len < key2len) + return (-1); + else + return (1); +} + +/* + * generate the concatenated keying material based on the two keys and the + * shared key (if available). draft-ietf-tsvwg-auth specifies the specific + * order for concatenation + */ +sctp_key_t * +sctp_compute_hashkey(sctp_key_t * key1, sctp_key_t * key2, sctp_key_t * shared) +{ + uint32_t keylen; + sctp_key_t *new_key; + uint8_t *key_ptr; + + keylen = sctp_get_keylen(key1) + sctp_get_keylen(key2) + + sctp_get_keylen(shared); + + if (keylen > 0) { + /* get space for the new key */ + new_key = sctp_alloc_key(keylen); + if (new_key == NULL) { + /* out of memory */ + return (NULL); + } + new_key->keylen = keylen; + key_ptr = new_key->key; + } else { + /* all keys empty/null?! 
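+ * (local random, peer random, and shared key are all of zero + * length, so there is no keying material to concatenate)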
*/ + return (NULL); + } + + /* concatenate the keys */ + if (sctp_compare_key(key1, key2) <= 0) { + /* key is shared + key1 + key2 */ + if (sctp_get_keylen(shared)) { + bcopy(shared->key, key_ptr, shared->keylen); + key_ptr += shared->keylen; + } + if (sctp_get_keylen(key1)) { + bcopy(key1->key, key_ptr, key1->keylen); + key_ptr += key1->keylen; + } + if (sctp_get_keylen(key2)) { + bcopy(key2->key, key_ptr, key2->keylen); + key_ptr += key2->keylen; + } + } else { + /* key is shared + key2 + key1 */ + if (sctp_get_keylen(shared)) { + bcopy(shared->key, key_ptr, shared->keylen); + key_ptr += shared->keylen; + } + if (sctp_get_keylen(key2)) { + bcopy(key2->key, key_ptr, key2->keylen); + key_ptr += key2->keylen; + } + if (sctp_get_keylen(key1)) { + bcopy(key1->key, key_ptr, key1->keylen); + key_ptr += key1->keylen; + } + } + return (new_key); +} + + +sctp_sharedkey_t * +sctp_alloc_sharedkey(void) +{ + sctp_sharedkey_t *new_key; + + SCTP_MALLOC(new_key, sctp_sharedkey_t *, sizeof(*new_key), + SCTP_M_AUTH_KY); + if (new_key == NULL) { + /* out of memory */ + return (NULL); + } + new_key->keyid = 0; + new_key->key = NULL; + new_key->refcount = 1; + new_key->deactivated = 0; + return (new_key); +} + +void +sctp_free_sharedkey(sctp_sharedkey_t * skey) +{ + if (skey == NULL) + return; + + if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&skey->refcount)) { + if (skey->key != NULL) + sctp_free_key(skey->key); + SCTP_FREE(skey, SCTP_M_AUTH_KY); + } +} + +sctp_sharedkey_t * +sctp_find_sharedkey(struct sctp_keyhead *shared_keys, uint16_t key_id) +{ + sctp_sharedkey_t *skey; + + LIST_FOREACH(skey, shared_keys, next) { + if (skey->keyid == key_id) + return (skey); + } + return (NULL); +} + +int +sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, + sctp_sharedkey_t * new_skey) +{ + sctp_sharedkey_t *skey; + + if ((shared_keys == NULL) || (new_skey == NULL)) + return (EINVAL); + + /* insert into an empty list? 
*/ + if (LIST_EMPTY(shared_keys)) { + LIST_INSERT_HEAD(shared_keys, new_skey, next); + return (0); + } + /* insert into the existing list, ordered by key id */ + LIST_FOREACH(skey, shared_keys, next) { + if (new_skey->keyid < skey->keyid) { + /* insert it before here */ + LIST_INSERT_BEFORE(skey, new_skey, next); + return (0); + } else if (new_skey->keyid == skey->keyid) { + /* replace the existing key */ + /* verify this key *can* be replaced */ + if ((skey->deactivated) && (skey->refcount > 1)) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "can't replace shared key id %u\n", + new_skey->keyid); + return (EBUSY); + } + SCTPDBG(SCTP_DEBUG_AUTH1, + "replacing shared key id %u\n", + new_skey->keyid); + LIST_INSERT_BEFORE(skey, new_skey, next); + LIST_REMOVE(skey, next); + sctp_free_sharedkey(skey); + return (0); + } + if (LIST_NEXT(skey, next) == NULL) { + /* belongs at the end of the list */ + LIST_INSERT_AFTER(skey, new_skey, next); + return (0); + } + } + /* shouldn't reach here */ + return (0); +} + +void +sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t key_id) +{ + sctp_sharedkey_t *skey; + + /* find the shared key */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); + + /* bump the ref count */ + if (skey) { + atomic_add_int(&skey->refcount, 1); + SCTPDBG(SCTP_DEBUG_AUTH2, + "%s: stcb %p key %u refcount acquire to %d\n", + __FUNCTION__, stcb, key_id, skey->refcount); + } +} + +void +sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t key_id) +{ + sctp_sharedkey_t *skey; + + /* find the shared key */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, key_id); + + /* decrement the ref count */ + if (skey) { + sctp_free_sharedkey(skey); + SCTPDBG(SCTP_DEBUG_AUTH2, + "%s: stcb %p key %u refcount release to %d\n", + __FUNCTION__, stcb, key_id, skey->refcount); + + /* see if a notification should be generated */ + if ((skey->refcount <= 1) && (skey->deactivated)) { + /* notify ULP that key is no longer used */ + sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, + key_id, 0, SCTP_SO_LOCKED); + SCTPDBG(SCTP_DEBUG_AUTH2, + "%s: stcb %p key %u no longer used, %d\n", + __FUNCTION__, stcb, key_id, skey->refcount); + } + } +} + +static sctp_sharedkey_t * +sctp_copy_sharedkey(const sctp_sharedkey_t * skey) +{ + sctp_sharedkey_t *new_skey; + + if (skey == NULL) + return (NULL); + new_skey = sctp_alloc_sharedkey(); + if (new_skey == NULL) + return (NULL); + if (skey->key != NULL) + new_skey->key = sctp_set_key(skey->key->key, skey->key->keylen); + else + new_skey->key = NULL; + new_skey->keyid = skey->keyid; + return (new_skey); +} + +int +sctp_copy_skeylist(const struct sctp_keyhead *src, struct sctp_keyhead *dest) +{ + sctp_sharedkey_t *skey, *new_skey; + int count = 0; + + if ((src == NULL) || (dest == NULL)) + return (0); + LIST_FOREACH(skey, src, next) { + new_skey = sctp_copy_sharedkey(skey); + if (new_skey != NULL) { + (void)sctp_insert_sharedkey(dest, new_skey); + count++; + } + } + return (count); +} + + +sctp_hmaclist_t * +sctp_alloc_hmaclist(uint8_t num_hmacs) +{ + sctp_hmaclist_t *new_list; + int alloc_size; + + alloc_size = sizeof(*new_list) + num_hmacs * sizeof(new_list->hmac[0]); + SCTP_MALLOC(new_list, sctp_hmaclist_t *, alloc_size, + SCTP_M_AUTH_HL); + if (new_list == NULL) { + /* out of memory */ + return (NULL); + } + new_list->max_algo = num_hmacs; + new_list->num_algo = 0; + return (new_list); +} + +void +sctp_free_hmaclist(sctp_hmaclist_t * list) +{ + if (list != NULL) { + SCTP_FREE(list, SCTP_M_AUTH_HL); + list = NULL; + } +} + +int +sctp_auth_add_hmacid(sctp_hmaclist_t 
* list, uint16_t hmac_id) +{ + int i; + + if (list == NULL) + return (-1); + if (list->num_algo == list->max_algo) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: HMAC id list full, ignoring add %u\n", hmac_id); + return (-1); + } + if ((hmac_id != SCTP_AUTH_HMAC_ID_SHA1) && +#ifdef HAVE_SHA224 + (hmac_id != SCTP_AUTH_HMAC_ID_SHA224) && +#endif +#ifdef HAVE_SHA2 + (hmac_id != SCTP_AUTH_HMAC_ID_SHA256) && + (hmac_id != SCTP_AUTH_HMAC_ID_SHA384) && + (hmac_id != SCTP_AUTH_HMAC_ID_SHA512) && +#endif + 1) { + return (-1); + } + /* Now is it already in the list */ + for (i = 0; i < list->num_algo; i++) { + if (list->hmac[i] == hmac_id) { + /* already in list */ + return (-1); + } + } + SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: add HMAC id %u to list\n", hmac_id); + list->hmac[list->num_algo++] = hmac_id; + return (0); +} + +sctp_hmaclist_t * +sctp_copy_hmaclist(sctp_hmaclist_t * list) +{ + sctp_hmaclist_t *new_list; + int i; + + if (list == NULL) + return (NULL); + /* get a new list */ + new_list = sctp_alloc_hmaclist(list->max_algo); + if (new_list == NULL) + return (NULL); + /* copy it */ + new_list->max_algo = list->max_algo; + new_list->num_algo = list->num_algo; + for (i = 0; i < list->num_algo; i++) + new_list->hmac[i] = list->hmac[i]; + return (new_list); +} + +sctp_hmaclist_t * +sctp_default_supported_hmaclist(void) +{ + sctp_hmaclist_t *new_list; + + new_list = sctp_alloc_hmaclist(2); + if (new_list == NULL) + return (NULL); + (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA1); + (void)sctp_auth_add_hmacid(new_list, SCTP_AUTH_HMAC_ID_SHA256); + return (new_list); +} + +/*- + * HMAC algos are listed in priority/preference order + * find the best HMAC id to use for the peer based on local support + */ +uint16_t +sctp_negotiate_hmacid(sctp_hmaclist_t * peer, sctp_hmaclist_t * local) +{ + int i, j; + + if ((local == NULL) || (peer == NULL)) + return (SCTP_AUTH_HMAC_ID_RSVD); + + for (i = 0; i < peer->num_algo; i++) { + for (j = 0; j < local->num_algo; j++) { + if (peer->hmac[i] == local->hmac[j]) { + /* found the "best" one */ + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: negotiated peer HMAC id %u\n", + peer->hmac[i]); + return (peer->hmac[i]); + } + } + } + /* didn't find one! 
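+ * (This should not happen with a conformant peer: RFC 4895 + * makes HMAC-SHA-1 support mandatory, and + * sctp_verify_hmac_param() below rejects a peer list that + * omits it.)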
*/ + return (SCTP_AUTH_HMAC_ID_RSVD); +} + +/*- + * serialize the HMAC algo list and return space used + * caller must guarantee ptr has appropriate space + */ +int +sctp_serialize_hmaclist(sctp_hmaclist_t * list, uint8_t * ptr) +{ + int i; + uint16_t hmac_id; + + if (list == NULL) + return (0); + + for (i = 0; i < list->num_algo; i++) { + hmac_id = htons(list->hmac[i]); + bcopy(&hmac_id, ptr, sizeof(hmac_id)); + ptr += sizeof(hmac_id); + } + return (list->num_algo * sizeof(hmac_id)); +} + +int +sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, uint32_t num_hmacs) +{ + uint32_t i; + uint16_t hmac_id; + uint32_t sha1_supported = 0; + + for (i = 0; i < num_hmacs; i++) { + hmac_id = ntohs(hmacs->hmac_ids[i]); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) + sha1_supported = 1; + } + /* all HMAC id's are supported */ + if (sha1_supported == 0) + return (-1); + else + return (0); +} + +sctp_authinfo_t * +sctp_alloc_authinfo(void) +{ + sctp_authinfo_t *new_authinfo; + + SCTP_MALLOC(new_authinfo, sctp_authinfo_t *, sizeof(*new_authinfo), + SCTP_M_AUTH_IF); + + if (new_authinfo == NULL) { + /* out of memory */ + return (NULL); + } + bzero(new_authinfo, sizeof(*new_authinfo)); + return (new_authinfo); +} + +void +sctp_free_authinfo(sctp_authinfo_t * authinfo) +{ + if (authinfo == NULL) + return; + + if (authinfo->random != NULL) + sctp_free_key(authinfo->random); + if (authinfo->peer_random != NULL) + sctp_free_key(authinfo->peer_random); + if (authinfo->assoc_key != NULL) + sctp_free_key(authinfo->assoc_key); + if (authinfo->recv_key != NULL) + sctp_free_key(authinfo->recv_key); + + /* We are NOT dynamically allocating authinfo's right now... */ + /* SCTP_FREE(authinfo, SCTP_M_AUTH_??); */ +} + + +uint32_t +sctp_get_auth_chunk_len(uint16_t hmac_algo) +{ + int size; + + size = sizeof(struct sctp_auth_chunk) + sctp_get_hmac_digest_len(hmac_algo); + return (SCTP_SIZE32(size)); +} + +uint32_t +sctp_get_hmac_digest_len(uint16_t hmac_algo) +{ + switch (hmac_algo) { + case SCTP_AUTH_HMAC_ID_SHA1: + return (SCTP_AUTH_DIGEST_LEN_SHA1); +#ifdef HAVE_SHA224 + case SCTP_AUTH_HMAC_ID_SHA224: + return (SCTP_AUTH_DIGEST_LEN_SHA224); +#endif +#ifdef HAVE_SHA2 + case SCTP_AUTH_HMAC_ID_SHA256: + return (SCTP_AUTH_DIGEST_LEN_SHA256); + case SCTP_AUTH_HMAC_ID_SHA384: + return (SCTP_AUTH_DIGEST_LEN_SHA384); + case SCTP_AUTH_HMAC_ID_SHA512: + return (SCTP_AUTH_DIGEST_LEN_SHA512); +#endif + default: + /* unknown HMAC algorithm: can't do anything */ + return (0); + } /* end switch */ +} + +static inline int +sctp_get_hmac_block_len(uint16_t hmac_algo) +{ + switch (hmac_algo) { + case SCTP_AUTH_HMAC_ID_SHA1: +#ifdef HAVE_SHA224 + case SCTP_AUTH_HMAC_ID_SHA224: +#endif + return (64); +#ifdef HAVE_SHA2 + case SCTP_AUTH_HMAC_ID_SHA256: + return (64); + case SCTP_AUTH_HMAC_ID_SHA384: + case SCTP_AUTH_HMAC_ID_SHA512: + return (128); +#endif + case SCTP_AUTH_HMAC_ID_RSVD: + default: + /* unknown HMAC algorithm: can't do anything */ + return (0); + } /* end switch */ +} + +static void +sctp_hmac_init(uint16_t hmac_algo, sctp_hash_context_t * ctx) +{ + switch (hmac_algo) { + case SCTP_AUTH_HMAC_ID_SHA1: + SHA1_Init(&ctx->sha1); + break; +#ifdef HAVE_SHA224 + case SCTP_AUTH_HMAC_ID_SHA224: + break; +#endif +#ifdef HAVE_SHA2 + case SCTP_AUTH_HMAC_ID_SHA256: + SHA256_Init(&ctx->sha256); + break; + case SCTP_AUTH_HMAC_ID_SHA384: + SHA384_Init(&ctx->sha384); + break; + case SCTP_AUTH_HMAC_ID_SHA512: + SHA512_Init(&ctx->sha512); + break; +#endif + case SCTP_AUTH_HMAC_ID_RSVD: + default: + /* unknown HMAC algorithm: can't do anything 
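+ * (the context is silently left untouched; callers are + * expected to validate the algorithm id first, e.g. via + * sctp_get_hmac_digest_len())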
*/ + return; + } /* end switch */ +} + +static void +sctp_hmac_update(uint16_t hmac_algo, sctp_hash_context_t * ctx, + uint8_t * text, uint32_t textlen) +{ + switch (hmac_algo) { + case SCTP_AUTH_HMAC_ID_SHA1: + SHA1_Update(&ctx->sha1, text, textlen); + break; +#ifdef HAVE_SHA224 + case SCTP_AUTH_HMAC_ID_SHA224: + break; +#endif +#ifdef HAVE_SHA2 + case SCTP_AUTH_HMAC_ID_SHA256: + SHA256_Update(&ctx->sha256, text, textlen); + break; + case SCTP_AUTH_HMAC_ID_SHA384: + SHA384_Update(&ctx->sha384, text, textlen); + break; + case SCTP_AUTH_HMAC_ID_SHA512: + SHA512_Update(&ctx->sha512, text, textlen); + break; +#endif + case SCTP_AUTH_HMAC_ID_RSVD: + default: + /* unknown HMAC algorithm: can't do anything */ + return; + } /* end switch */ +} + +static void +sctp_hmac_final(uint16_t hmac_algo, sctp_hash_context_t * ctx, + uint8_t * digest) +{ + switch (hmac_algo) { + case SCTP_AUTH_HMAC_ID_SHA1: + SHA1_Final(digest, &ctx->sha1); + break; +#ifdef HAVE_SHA224 + case SCTP_AUTH_HMAC_ID_SHA224: + break; +#endif +#ifdef HAVE_SHA2 + case SCTP_AUTH_HMAC_ID_SHA256: + SHA256_Final(digest, &ctx->sha256); + break; + case SCTP_AUTH_HMAC_ID_SHA384: + /* SHA384 is truncated SHA512 */ + SHA384_Final(digest, &ctx->sha384); + break; + case SCTP_AUTH_HMAC_ID_SHA512: + SHA512_Final(digest, &ctx->sha512); + break; +#endif + case SCTP_AUTH_HMAC_ID_RSVD: + default: + /* unknown HMAC algorithm: can't do anything */ + return; + } /* end switch */ +} + +/*- + * Keyed-Hashing for Message Authentication: FIPS 198 (RFC 2104) + * + * Compute the HMAC digest using the desired hash key, text, and HMAC + * algorithm. Resulting digest is placed in 'digest' and digest length + * is returned, if the HMAC was performed. + * + * WARNING: it is up to the caller to supply sufficient space to hold the + * resultant digest. 
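+ * + * The construction computed below is the standard RFC 2104 one: + * + * HMAC(K, text) = H((K' ^ opad) || H((K' ^ ipad) || text)) + * + * where K' is the key zero-padded out to the hash block size + * (hashed down to digest size first if longer than one block), + * ipad is the byte 0x36 repeated, and opad is the byte 0x5c + * repeated. + * + * Hypothetical usage sketch (SHA-1; key/keylen, text/textlen and + * digest are caller-supplied): + * + * uint8_t digest[SCTP_AUTH_DIGEST_LEN_SHA1]; + * uint32_t len = sctp_hmac(SCTP_AUTH_HMAC_ID_SHA1, key, keylen, + * text, textlen, digest); + * + * where len is the digest length (20 for SHA-1) on success and + * 0 on failure.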
+ */ +uint32_t +sctp_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, + uint8_t * text, uint32_t textlen, uint8_t * digest) +{ + uint32_t digestlen; + uint32_t blocklen; + sctp_hash_context_t ctx; + uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ + uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; + uint32_t i; + + /* sanity check the material and length */ + if ((key == NULL) || (keylen == 0) || (text == NULL) || + (textlen == 0) || (digest == NULL)) { + /* can't do HMAC with empty key or text or digest store */ + return (0); + } + /* validate the hmac algo and get the digest length */ + digestlen = sctp_get_hmac_digest_len(hmac_algo); + if (digestlen == 0) + return (0); + + /* hash the key if it is longer than the hash block size */ + blocklen = sctp_get_hmac_block_len(hmac_algo); + if (keylen > blocklen) { + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, key, keylen); + sctp_hmac_final(hmac_algo, &ctx, temp); + /* set the hashed key as the key */ + keylen = digestlen; + key = temp; + } + /* initialize the inner/outer pads with the key and "append" zeroes */ + bzero(ipad, blocklen); + bzero(opad, blocklen); + bcopy(key, ipad, keylen); + bcopy(key, opad, keylen); + + /* XOR the key with ipad and opad values */ + for (i = 0; i < blocklen; i++) { + ipad[i] ^= 0x36; + opad[i] ^= 0x5c; + } + + /* perform inner hash */ + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); + sctp_hmac_update(hmac_algo, &ctx, text, textlen); + sctp_hmac_final(hmac_algo, &ctx, temp); + + /* perform outer hash */ + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); + sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); + sctp_hmac_final(hmac_algo, &ctx, digest); + + return (digestlen); +} + +/* mbuf version */ +uint32_t +sctp_hmac_m(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, + struct mbuf *m, uint32_t m_offset, uint8_t * digest, uint32_t trailer) +{ + uint32_t digestlen; + uint32_t blocklen; + sctp_hash_context_t ctx; + uint8_t ipad[128], opad[128]; /* keyed hash inner/outer pads */ + uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; + uint32_t i; + struct mbuf *m_tmp; + + /* sanity check the material and length */ + if ((key == NULL) || (keylen == 0) || (m == NULL) || (digest == NULL)) { + /* can't do HMAC with empty key or text or digest store */ + return (0); + } + /* validate the hmac algo and get the digest length */ + digestlen = sctp_get_hmac_digest_len(hmac_algo); + if (digestlen == 0) + return (0); + + /* hash the key if it is longer than the hash block size */ + blocklen = sctp_get_hmac_block_len(hmac_algo); + if (keylen > blocklen) { + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, key, keylen); + sctp_hmac_final(hmac_algo, &ctx, temp); + /* set the hashed key as the key */ + keylen = digestlen; + key = temp; + } + /* initialize the inner/outer pads with the key and "append" zeroes */ + bzero(ipad, blocklen); + bzero(opad, blocklen); + bcopy(key, ipad, keylen); + bcopy(key, opad, keylen); + + /* XOR the key with ipad and opad values */ + for (i = 0; i < blocklen; i++) { + ipad[i] ^= 0x36; + opad[i] ^= 0x5c; + } + + /* perform inner hash */ + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, ipad, blocklen); + /* find the correct starting mbuf and offset (get start of text) */ + m_tmp = m; + while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) { + m_offset -= SCTP_BUF_LEN(m_tmp); + m_tmp = SCTP_BUF_NEXT(m_tmp); + } + /* now use the rest of the 
mbuf chain for the text */ + while (m_tmp != NULL) { + if ((SCTP_BUF_NEXT(m_tmp) == NULL) && trailer) { + sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset, + SCTP_BUF_LEN(m_tmp) - (trailer + m_offset)); + } else { + sctp_hmac_update(hmac_algo, &ctx, mtod(m_tmp, uint8_t *) + m_offset, + SCTP_BUF_LEN(m_tmp) - m_offset); + } + + /* clear the offset since it's only for the first mbuf */ + m_offset = 0; + m_tmp = SCTP_BUF_NEXT(m_tmp); + } + sctp_hmac_final(hmac_algo, &ctx, temp); + + /* perform outer hash */ + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, opad, blocklen); + sctp_hmac_update(hmac_algo, &ctx, temp, digestlen); + sctp_hmac_final(hmac_algo, &ctx, digest); + + return (digestlen); +} + +/*- + * verify the HMAC digest using the desired hash key, text, and HMAC + * algorithm. + * Returns -1 on error, 0 on success. + */ +int +sctp_verify_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, + uint8_t * text, uint32_t textlen, + uint8_t * digest, uint32_t digestlen) +{ + uint32_t len; + uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; + + /* sanity check the material and length */ + if ((key == NULL) || (keylen == 0) || + (text == NULL) || (textlen == 0) || (digest == NULL)) { + /* can't do HMAC with empty key or text or digest */ + return (-1); + } + len = sctp_get_hmac_digest_len(hmac_algo); + if ((len == 0) || (digestlen != len)) + return (-1); + + /* compute the expected hash */ + if (sctp_hmac(hmac_algo, key, keylen, text, textlen, temp) != len) + return (-1); + + if (memcmp(digest, temp, digestlen) != 0) + return (-1); + else + return (0); +} + + +/* + * computes the requested HMAC using a key struct (which may be modified if + * the keylen exceeds the HMAC block len). + */ +uint32_t +sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t * key, uint8_t * text, + uint32_t textlen, uint8_t * digest) +{ + uint32_t digestlen; + uint32_t blocklen; + sctp_hash_context_t ctx; + uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; + + /* sanity check */ + if ((key == NULL) || (text == NULL) || (textlen == 0) || + (digest == NULL)) { + /* can't do HMAC with empty key or text or digest store */ + return (0); + } + /* validate the hmac algo and get the digest length */ + digestlen = sctp_get_hmac_digest_len(hmac_algo); + if (digestlen == 0) + return (0); + + /* hash the key if it is longer than the hash block size */ + blocklen = sctp_get_hmac_block_len(hmac_algo); + if (key->keylen > blocklen) { + sctp_hmac_init(hmac_algo, &ctx); + sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); + sctp_hmac_final(hmac_algo, &ctx, temp); + /* save the hashed key as the new key */ + key->keylen = digestlen; + bcopy(temp, key->key, key->keylen); + } + return (sctp_hmac(hmac_algo, key->key, key->keylen, text, textlen, + digest)); +} + +/* mbuf version */ +uint32_t +sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t * key, struct mbuf *m, + uint32_t m_offset, uint8_t * digest) +{ + uint32_t digestlen; + uint32_t blocklen; + sctp_hash_context_t ctx; + uint8_t temp[SCTP_AUTH_DIGEST_LEN_MAX]; + + /* sanity check */ + if ((key == NULL) || (m == NULL) || (digest == NULL)) { + /* can't do HMAC with empty key or text or digest store */ + return (0); + } + /* validate the hmac algo and get the digest length */ + digestlen = sctp_get_hmac_digest_len(hmac_algo); + if (digestlen == 0) + return (0); + + /* hash the key if it is longer than the hash block size */ + blocklen = sctp_get_hmac_block_len(hmac_algo); + if (key->keylen > blocklen) { + sctp_hmac_init(hmac_algo, &ctx); + 
sctp_hmac_update(hmac_algo, &ctx, key->key, key->keylen); + sctp_hmac_final(hmac_algo, &ctx, temp); + /* save the hashed key as the new key */ + key->keylen = digestlen; + bcopy(temp, key->key, key->keylen); + } + return (sctp_hmac_m(hmac_algo, key->key, key->keylen, m, m_offset, digest, 0)); +} + +int +sctp_auth_is_supported_hmac(sctp_hmaclist_t * list, uint16_t id) +{ + int i; + + if ((list == NULL) || (id == SCTP_AUTH_HMAC_ID_RSVD)) + return (0); + + for (i = 0; i < list->num_algo; i++) + if (list->hmac[i] == id) + return (1); + + /* not in the list */ + return (0); +} + + +/*- + * clear any cached key(s) if they match the given key id on an association. + * the cached key(s) will be recomputed and re-cached at next use. + * ASSUMES TCB_LOCK is already held + */ +void +sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid) +{ + if (stcb == NULL) + return; + + if (keyid == stcb->asoc.authinfo.assoc_keyid) { + sctp_free_key(stcb->asoc.authinfo.assoc_key); + stcb->asoc.authinfo.assoc_key = NULL; + } + if (keyid == stcb->asoc.authinfo.recv_keyid) { + sctp_free_key(stcb->asoc.authinfo.recv_key); + stcb->asoc.authinfo.recv_key = NULL; + } +} + +/*- + * clear any cached key(s) if they match the given key id for all assocs on + * an endpoint. + * ASSUMES INP_WLOCK is already held + */ +void +sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid) +{ + struct sctp_tcb *stcb; + + if (inp == NULL) + return; + + /* clear the cached keys on all assocs on this instance */ + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + sctp_clear_cachedkeys(stcb, keyid); + SCTP_TCB_UNLOCK(stcb); + } +} + +/*- + * delete a shared key from an association + * ASSUMES TCB_LOCK is already held + */ +int +sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) +{ + sctp_sharedkey_t *skey; + + if (stcb == NULL) + return (-1); + + /* is the keyid the assoc active sending key */ + if (keyid == stcb->asoc.authinfo.active_keyid) + return (-1); + + /* does the key exist? */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); + if (skey == NULL) + return (-1); + + /* are there other refcount holders on the key? */ + if (skey->refcount > 1) + return (-1); + + /* remove it */ + LIST_REMOVE(skey, next); + sctp_free_sharedkey(skey); /* frees skey->key as well */ + + /* clear any cached keys */ + sctp_clear_cachedkeys(stcb, keyid); + return (0); +} + +/*- + * deletes a shared key from the endpoint + * ASSUMES INP_WLOCK is already held + */ +int +sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) +{ + sctp_sharedkey_t *skey; + + if (inp == NULL) + return (-1); + + /* is the keyid the active sending key on the endpoint */ + if (keyid == inp->sctp_ep.default_keyid) + return (-1); + + /* does the key exist? 
*/ + skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); + if (skey == NULL) + return (-1); + + /* endpoint keys are not refcounted */ + + /* remove it */ + LIST_REMOVE(skey, next); + sctp_free_sharedkey(skey); /* frees skey->key as well */ + + /* clear any cached keys */ + sctp_clear_cachedkeys_ep(inp, keyid); + return (0); +} + +/*- + * set the active key on an association + * ASSUMES TCB_LOCK is already held + */ +int +sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid) +{ + sctp_sharedkey_t *skey = NULL; + + /* find the key on the assoc */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); + if (skey == NULL) { + /* that key doesn't exist */ + return (-1); + } + if ((skey->deactivated) && (skey->refcount > 1)) { + /* can't reactivate a deactivated key with other refcounts */ + return (-1); + } + /* set the (new) active key */ + stcb->asoc.authinfo.active_keyid = keyid; + /* reset the deactivated flag */ + skey->deactivated = 0; + + return (0); +} + +/*- + * set the active key on an endpoint + * ASSUMES INP_WLOCK is already held + */ +int +sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid) +{ + sctp_sharedkey_t *skey; + + /* find the key */ + skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); + if (skey == NULL) { + /* that key doesn't exist */ + return (-1); + } + inp->sctp_ep.default_keyid = keyid; + return (0); +} + +/*- + * deactivates a shared key from the association + * ASSUMES INP_WLOCK is already held + */ +int +sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid) +{ + sctp_sharedkey_t *skey; + + if (stcb == NULL) + return (-1); + + /* is the keyid the assoc active sending key */ + if (keyid == stcb->asoc.authinfo.active_keyid) + return (-1); + + /* does the key exist? */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); + if (skey == NULL) + return (-1); + + /* are there other refcount holders on the key? */ + if (skey->refcount == 1) { + /* no other users, send a notification for this key */ + sctp_ulp_notify(SCTP_NOTIFY_AUTH_FREE_KEY, stcb, keyid, 0, + SCTP_SO_LOCKED); + } + /* mark the key as deactivated */ + skey->deactivated = 1; + + return (0); +} + +/*- + * deactivates a shared key from the endpoint + * ASSUMES INP_WLOCK is already held + */ +int +sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid) +{ + sctp_sharedkey_t *skey; + + if (inp == NULL) + return (-1); + + /* is the keyid the active sending key on the endpoint */ + if (keyid == inp->sctp_ep.default_keyid) + return (-1); + + /* does the key exist? 
*/ + skey = sctp_find_sharedkey(&inp->sctp_ep.shared_keys, keyid); + if (skey == NULL) + return (-1); + + /* endpoint keys are not refcounted */ + + /* remove it */ + LIST_REMOVE(skey, next); + sctp_free_sharedkey(skey); /* frees skey->key as well */ + + return (0); +} + +/* + * get local authentication parameters from cookie (from INIT-ACK) + */ +void +sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, + uint32_t offset, uint32_t length) +{ + struct sctp_paramhdr *phdr, tmp_param; + uint16_t plen, ptype; + uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_random *p_random = NULL; + uint16_t random_len = 0; + uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_hmac_algo *hmacs = NULL; + uint16_t hmacs_len = 0; + uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_chunk_list *chunks = NULL; + uint16_t num_chunks = 0; + sctp_key_t *new_key; + uint32_t keylen; + + /* convert to upper bound */ + length += offset; + + phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, + sizeof(struct sctp_paramhdr), (uint8_t *) & tmp_param); + while (phdr != NULL) { + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + + if ((plen == 0) || (offset + plen > length)) + break; + + if (ptype == SCTP_RANDOM) { + if (plen > sizeof(random_store)) + break; + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)random_store, min(plen, sizeof(random_store))); + if (phdr == NULL) + return; + /* save the random and length for the key */ + p_random = (struct sctp_auth_random *)phdr; + random_len = plen - sizeof(*p_random); + } else if (ptype == SCTP_HMAC_LIST) { + int num_hmacs; + int i; + + if (plen > sizeof(hmacs_store)) + break; + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)hmacs_store, min(plen, sizeof(hmacs_store))); + if (phdr == NULL) + return; + /* save the hmacs list and num for the key */ + hmacs = (struct sctp_auth_hmac_algo *)phdr; + hmacs_len = plen - sizeof(*hmacs); + num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); + if (stcb->asoc.local_hmacs != NULL) + sctp_free_hmaclist(stcb->asoc.local_hmacs); + stcb->asoc.local_hmacs = sctp_alloc_hmaclist(num_hmacs); + if (stcb->asoc.local_hmacs != NULL) { + for (i = 0; i < num_hmacs; i++) { + (void)sctp_auth_add_hmacid(stcb->asoc.local_hmacs, + ntohs(hmacs->hmac_ids[i])); + } + } + } else if (ptype == SCTP_CHUNK_LIST) { + int i; + + if (plen > sizeof(chunks_store)) + break; + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)chunks_store, min(plen, sizeof(chunks_store))); + if (phdr == NULL) + return; + chunks = (struct sctp_auth_chunk_list *)phdr; + num_chunks = plen - sizeof(*chunks); + /* save chunks list and num for the key */ + if (stcb->asoc.local_auth_chunks != NULL) + sctp_clear_chunklist(stcb->asoc.local_auth_chunks); + else + stcb->asoc.local_auth_chunks = sctp_alloc_chunklist(); + for (i = 0; i < num_chunks; i++) { + (void)sctp_auth_add_chunk(chunks->chunk_types[i], + stcb->asoc.local_auth_chunks); + } + } + /* get next parameter */ + offset += SCTP_SIZE32(plen); + if (offset + sizeof(struct sctp_paramhdr) > length) + break; + phdr = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(struct sctp_paramhdr), + (uint8_t *) & tmp_param); + } + /* concatenate the full random key */ + keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; + if (chunks != NULL) { + keylen += sizeof(*chunks) + num_chunks; + } + new_key = sctp_alloc_key(keylen); + if (new_key != NULL) { + /* copy in the RANDOM */ + if (p_random != NULL) { 
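+ /*- + * Key layout sketch, restating the copies below: new_key->key holds the + * raw parameter bytes [RANDOM][CHUNK-LIST][HMAC-ALGO] concatenated in + * that order, with keylen tracking the write offset as each part lands. + */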
+ keylen = sizeof(*p_random) + random_len; + bcopy(p_random, new_key->key, keylen); + } + /* append in the AUTH chunks */ + if (chunks != NULL) { + bcopy(chunks, new_key->key + keylen, + sizeof(*chunks) + num_chunks); + keylen += sizeof(*chunks) + num_chunks; + } + /* append in the HMACs */ + if (hmacs != NULL) { + bcopy(hmacs, new_key->key + keylen, + sizeof(*hmacs) + hmacs_len); + } + } + if (stcb->asoc.authinfo.random != NULL) + sctp_free_key(stcb->asoc.authinfo.random); + stcb->asoc.authinfo.random = new_key; + stcb->asoc.authinfo.random_len = random_len; + sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); + sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); + + /* negotiate what HMAC to use for the peer */ + stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, + stcb->asoc.local_hmacs); + + /* copy defaults from the endpoint */ + /* FIX ME: put in cookie? */ + stcb->asoc.authinfo.active_keyid = stcb->sctp_ep->sctp_ep.default_keyid; + /* copy out the shared key list (by reference) from the endpoint */ + (void)sctp_copy_skeylist(&stcb->sctp_ep->sctp_ep.shared_keys, + &stcb->asoc.shared_keys); +} + +/* + * compute and fill in the HMAC digest for a packet + */ +void +sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, + struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t keyid) +{ + uint32_t digestlen; + sctp_sharedkey_t *skey; + sctp_key_t *key; + + if ((stcb == NULL) || (auth == NULL)) + return; + + /* zero the digest + chunk padding */ + digestlen = sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); + bzero(auth->hmac, SCTP_SIZE32(digestlen)); + + /* is the desired key cached? */ + if ((keyid != stcb->asoc.authinfo.assoc_keyid) || + (stcb->asoc.authinfo.assoc_key == NULL)) { + if (stcb->asoc.authinfo.assoc_key != NULL) { + /* free the old cached key */ + sctp_free_key(stcb->asoc.authinfo.assoc_key); + } + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, keyid); + /* the only way skey is NULL is if null key id 0 is used */ + if (skey != NULL) + key = skey->key; + else + key = NULL; + /* compute a new assoc key and cache it */ + stcb->asoc.authinfo.assoc_key = + sctp_compute_hashkey(stcb->asoc.authinfo.random, + stcb->asoc.authinfo.peer_random, key); + stcb->asoc.authinfo.assoc_keyid = keyid; + SCTPDBG(SCTP_DEBUG_AUTH1, "caching key id %u\n", + stcb->asoc.authinfo.assoc_keyid); +#ifdef SCTP_DEBUG + if (SCTP_AUTH_DEBUG) + sctp_print_key(stcb->asoc.authinfo.assoc_key, + "Assoc Key"); +#endif + } + /* set in the active key id */ + auth->shared_key_id = htons(keyid); + + /* compute and fill in the digest */ + (void)sctp_compute_hmac_m(stcb->asoc.peer_hmac_id, stcb->asoc.authinfo.assoc_key, + m, auth_offset, auth->hmac); +} + + +static void +sctp_bzero_m(struct mbuf *m, uint32_t m_offset, uint32_t size) +{ + struct mbuf *m_tmp; + uint8_t *data; + + /* sanity check */ + if (m == NULL) + return; + + /* find the correct starting mbuf and offset (get start position) */ + m_tmp = m; + while ((m_tmp != NULL) && (m_offset >= (uint32_t) SCTP_BUF_LEN(m_tmp))) { + m_offset -= SCTP_BUF_LEN(m_tmp); + m_tmp = SCTP_BUF_NEXT(m_tmp); + } + /* now use the rest of the mbuf chain */ + while ((m_tmp != NULL) && (size > 0)) { + data = mtod(m_tmp, uint8_t *) + m_offset; + if (size > (uint32_t) SCTP_BUF_LEN(m_tmp)) { + bzero(data, SCTP_BUF_LEN(m_tmp)); + size -= SCTP_BUF_LEN(m_tmp); + } else { + bzero(data, size); + size = 0; + } + /* clear the offset since it's only for the first mbuf */ + m_offset = 0; + m_tmp = SCTP_BUF_NEXT(m_tmp); + } +} + +/*- + * 
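A condensed sketch of the flow implemented below, assuming the usual + * RFC 4895 verification model: look up (or recompute and cache) the recv + * key for the chunk's shared_key_id, save the received digest aside, zero + * the digest field inside the mbuf chain with sctp_bzero_m(), recompute + * the HMAC over the packet with sctp_compute_hmac_m(), and memcmp the + * two digests. + *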
process the incoming Authentication chunk + * return codes: + * -1 on any authentication error + * 0 on authentication verification + */ +int +sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *auth, + struct mbuf *m, uint32_t offset) +{ + uint16_t chunklen; + uint16_t shared_key_id; + uint16_t hmac_id; + sctp_sharedkey_t *skey; + uint32_t digestlen; + uint8_t digest[SCTP_AUTH_DIGEST_LEN_MAX]; + uint8_t computed_digest[SCTP_AUTH_DIGEST_LEN_MAX]; + + /* auth is checked for NULL by caller */ + chunklen = ntohs(auth->ch.chunk_length); + if (chunklen < sizeof(*auth)) { + SCTP_STAT_INCR(sctps_recvauthfailed); + return (-1); + } + SCTP_STAT_INCR(sctps_recvauth); + + /* get the auth params */ + shared_key_id = ntohs(auth->shared_key_id); + hmac_id = ntohs(auth->hmac_id); + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP AUTH Chunk: shared key %u, HMAC id %u\n", + shared_key_id, hmac_id); + + /* is the indicated HMAC supported? */ + if (!sctp_auth_is_supported_hmac(stcb->asoc.local_hmacs, hmac_id)) { + struct mbuf *m_err; + struct sctp_auth_invalid_hmac *err; + + SCTP_STAT_INCR(sctps_recvivalhmacid); + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP Auth: unsupported HMAC id %u\n", + hmac_id); + /* + * report this in an Error Chunk: Unsupported HMAC + * Identifier + */ + m_err = sctp_get_mbuf_for_msg(sizeof(*err), 0, M_DONTWAIT, + 1, MT_HEADER); + if (m_err != NULL) { + /* pre-reserve some space */ + SCTP_BUF_RESV_UF(m_err, sizeof(struct sctp_chunkhdr)); + /* fill in the error */ + err = mtod(m_err, struct sctp_auth_invalid_hmac *); + bzero(err, sizeof(*err)); + err->ph.param_type = htons(SCTP_CAUSE_UNSUPPORTED_HMACID); + err->ph.param_length = htons(sizeof(*err)); + err->hmac_id = ntohs(hmac_id); + SCTP_BUF_LEN(m_err) = sizeof(*err); + /* queue it */ + sctp_queue_op_err(stcb, m_err); + } + return (-1); + } + /* get the indicated shared key, if available */ + if ((stcb->asoc.authinfo.recv_key == NULL) || + (stcb->asoc.authinfo.recv_keyid != shared_key_id)) { + /* find the shared key on the assoc first */ + skey = sctp_find_sharedkey(&stcb->asoc.shared_keys, + shared_key_id); + /* if the shared key isn't found, discard the chunk */ + if (skey == NULL) { + SCTP_STAT_INCR(sctps_recvivalkeyid); + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP Auth: unknown key id %u\n", + shared_key_id); + return (-1); + } + /* generate a notification if this is a new key id */ + if (stcb->asoc.authinfo.recv_keyid != shared_key_id) + /* + * sctp_ulp_notify(SCTP_NOTIFY_AUTH_NEW_KEY, stcb, + * shared_key_id, (void + * *)stcb->asoc.authinfo.recv_keyid); + */ + sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY, + shared_key_id, stcb->asoc.authinfo.recv_keyid, + SCTP_SO_NOT_LOCKED); + /* compute a new recv assoc key and cache it */ + if (stcb->asoc.authinfo.recv_key != NULL) + sctp_free_key(stcb->asoc.authinfo.recv_key); + stcb->asoc.authinfo.recv_key = + sctp_compute_hashkey(stcb->asoc.authinfo.random, + stcb->asoc.authinfo.peer_random, skey->key); + stcb->asoc.authinfo.recv_keyid = shared_key_id; +#ifdef SCTP_DEBUG + if (SCTP_AUTH_DEBUG) + sctp_print_key(stcb->asoc.authinfo.recv_key, "Recv Key"); +#endif + } + /* validate the digest length */ + digestlen = sctp_get_hmac_digest_len(hmac_id); + if (chunklen < (sizeof(*auth) + digestlen)) { + /* invalid digest length */ + SCTP_STAT_INCR(sctps_recvauthfailed); + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP Auth: chunk too short for HMAC\n"); + return (-1); + } + /* save a copy of the digest, zero the pseudo header, and validate */ + bcopy(auth->hmac, digest, digestlen); + sctp_bzero_m(m, offset + sizeof(*auth), 
SCTP_SIZE32(digestlen)); + (void)sctp_compute_hmac_m(hmac_id, stcb->asoc.authinfo.recv_key, + m, offset, computed_digest); + + /* compare the computed digest with the one in the AUTH chunk */ + if (memcmp(digest, computed_digest, digestlen) != 0) { + SCTP_STAT_INCR(sctps_recvauthfailed); + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP Auth: HMAC digest check failed\n"); + return (-1); + } + return (0); +} + +/* + * Generate NOTIFICATION + */ +void +sctp_notify_authentication(struct sctp_tcb *stcb, uint32_t indication, + uint16_t keyid, uint16_t alt_keyid, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_authkey_event *auth; + struct sctp_queued_to_read *control; + + if ((stcb == NULL) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) + ) { + /* If the socket is gone we are out of here */ + return; + } + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTHEVNT)) + /* event not enabled */ + return; + + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_authkey_event), + 0, M_DONTWAIT, 1, MT_HEADER); + if (m_notify == NULL) + /* no space left */ + return; + + SCTP_BUF_LEN(m_notify) = 0; + auth = mtod(m_notify, struct sctp_authkey_event *); + auth->auth_type = SCTP_AUTHENTICATION_EVENT; + auth->auth_flags = 0; + auth->auth_length = sizeof(*auth); + auth->auth_keynumber = keyid; + auth->auth_altkeynumber = alt_keyid; + auth->auth_indication = indication; + auth->auth_assoc_id = sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(*auth); + SCTP_BUF_NEXT(m_notify) = NULL; + + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); +} + + +/*- + * validates the AUTHentication related parameters in an INIT/INIT-ACK + * Note: currently only used for INIT as INIT-ACK is handled inline + * with sctp_load_addresses_from_init() + */ +int +sctp_validate_init_auth_params(struct mbuf *m, int offset, int limit) +{ + struct sctp_paramhdr *phdr, parm_buf; + uint16_t ptype, plen; + int peer_supports_asconf = 0; + int peer_supports_auth = 0; + int got_random = 0, got_hmacs = 0, got_chklist = 0; + uint8_t saw_asconf = 0; + uint8_t saw_asconf_ack = 0; + + /* go through each of the params. 
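Each parameter is a TLV: a struct sctp_paramhdr (uint16_t param_type, uint16_t param_length, both network order) followed by its value; param_length covers header plus value and the walk advances by SCTP_SIZE32(plen), skipping the pad to the next 4-byte boundary. For example, a RANDOM parameter carrying 32 bytes of entropy has plen = 4 + 32 = 36, so the next header starts 36 bytes later.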
*/ + phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); + while (phdr) { + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + + if (offset + plen > limit) { + break; + } + if (plen < sizeof(struct sctp_paramhdr)) { + break; + } + if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { + /* A supported extension chunk */ + struct sctp_supported_chunk_types_param *pr_supported; + uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; + int num_ent, i; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&local_store, min(plen, sizeof(local_store))); + if (phdr == NULL) { + return (-1); + } + pr_supported = (struct sctp_supported_chunk_types_param *)phdr; + num_ent = plen - sizeof(struct sctp_paramhdr); + for (i = 0; i < num_ent; i++) { + switch (pr_supported->chunk_types[i]) { + case SCTP_ASCONF: + case SCTP_ASCONF_ACK: + peer_supports_asconf = 1; + break; + case SCTP_AUTHENTICATION: + peer_supports_auth = 1; + break; + default: + /* one we don't care about */ + break; + } + } + } else if (ptype == SCTP_RANDOM) { + got_random = 1; + /* enforce the random length */ + if (plen != (sizeof(struct sctp_auth_random) + + SCTP_AUTH_RANDOM_SIZE_REQUIRED)) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: invalid RANDOM len\n"); + return (-1); + } + } else if (ptype == SCTP_HMAC_LIST) { + uint8_t store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_hmac_algo *hmacs; + int num_hmacs; + + if (plen > sizeof(store)) + break; + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)store, min(plen, sizeof(store))); + if (phdr == NULL) + return (-1); + hmacs = (struct sctp_auth_hmac_algo *)phdr; + num_hmacs = (plen - sizeof(*hmacs)) / + sizeof(hmacs->hmac_ids[0]); + /* validate the hmac list */ + if (sctp_verify_hmac_param(hmacs, num_hmacs)) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: invalid HMAC param\n"); + return (-1); + } + got_hmacs = 1; + } else if (ptype == SCTP_CHUNK_LIST) { + int i, num_chunks; + uint8_t chunks_store[SCTP_SMALL_CHUNK_STORE]; + + /* did the peer send a non-empty chunk list? */ + struct sctp_auth_chunk_list *chunks = NULL; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)chunks_store, + min(plen, sizeof(chunks_store))); + if (phdr == NULL) + return (-1); + + /*- + * Flip through the list and mark that the + * peer supports asconf/asconf_ack. 
+ */ + chunks = (struct sctp_auth_chunk_list *)phdr; + num_chunks = plen - sizeof(*chunks); + for (i = 0; i < num_chunks; i++) { + /* record asconf/asconf-ack if listed */ + if (chunks->chunk_types[i] == SCTP_ASCONF) + saw_asconf = 1; + if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) + saw_asconf_ack = 1; + + } + if (num_chunks) + got_chklist = 1; + } + offset += SCTP_SIZE32(plen); + if (offset >= limit) { + break; + } + phdr = sctp_get_next_param(m, offset, &parm_buf, + sizeof(parm_buf)); + } + /* validate authentication required parameters */ + if (got_random && got_hmacs) { + peer_supports_auth = 1; + } else { + peer_supports_auth = 0; + } + if (!peer_supports_auth && got_chklist) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: peer sent chunk list w/o AUTH\n"); + return (-1); + } + if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && peer_supports_asconf && + !peer_supports_auth) { + SCTPDBG(SCTP_DEBUG_AUTH1, + "SCTP: peer supports ASCONF but not AUTH\n"); + return (-1); + } else if ((peer_supports_asconf) && (peer_supports_auth) && + ((saw_asconf == 0) || (saw_asconf_ack == 0))) { + return (-2); + } + return (0); +} + +void +sctp_initialize_auth_params(struct sctp_inpcb *inp, struct sctp_tcb *stcb) +{ + uint16_t chunks_len = 0; + uint16_t hmacs_len = 0; + uint16_t random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT; + sctp_key_t *new_key; + uint16_t keylen; + + /* initialize hmac list from endpoint */ + stcb->asoc.local_hmacs = sctp_copy_hmaclist(inp->sctp_ep.local_hmacs); + if (stcb->asoc.local_hmacs != NULL) { + hmacs_len = stcb->asoc.local_hmacs->num_algo * + sizeof(stcb->asoc.local_hmacs->hmac[0]); + } + /* initialize auth chunks list from endpoint */ + stcb->asoc.local_auth_chunks = + sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks); + if (stcb->asoc.local_auth_chunks != NULL) { + int i; + + for (i = 0; i < 256; i++) { + if (stcb->asoc.local_auth_chunks->chunks[i]) + chunks_len++; + } + } + /* copy defaults from the endpoint */ + stcb->asoc.authinfo.active_keyid = inp->sctp_ep.default_keyid; + + /* copy out the shared key list (by reference) from the endpoint */ + (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys, + &stcb->asoc.shared_keys); + + /* now set the concatenated key (random + chunks + hmacs) */ + /* key includes parameter headers */ + keylen = (3 * sizeof(struct sctp_paramhdr)) + random_len + chunks_len + + hmacs_len; + new_key = sctp_alloc_key(keylen); + if (new_key != NULL) { + struct sctp_paramhdr *ph; + int plen; + + /* generate and copy in the RANDOM */ + ph = (struct sctp_paramhdr *)new_key->key; + ph->param_type = htons(SCTP_RANDOM); + plen = sizeof(*ph) + random_len; + ph->param_length = htons(plen); + SCTP_READ_RANDOM(new_key->key + sizeof(*ph), random_len); + keylen = plen; + + /* append in the AUTH chunks */ + /* NOTE: currently we always have chunks to list */ + ph = (struct sctp_paramhdr *)(new_key->key + keylen); + ph->param_type = htons(SCTP_CHUNK_LIST); + plen = sizeof(*ph) + chunks_len; + ph->param_length = htons(plen); + keylen += sizeof(*ph); + if (stcb->asoc.local_auth_chunks) { + int i; + + for (i = 0; i < 256; i++) { + if (stcb->asoc.local_auth_chunks->chunks[i]) + new_key->key[keylen++] = i; + } + } + /* append in the HMACs */ + ph = (struct sctp_paramhdr *)(new_key->key + keylen); + ph->param_type = htons(SCTP_HMAC_LIST); + plen = sizeof(*ph) + hmacs_len; + ph->param_length = htons(plen); + keylen += sizeof(*ph); + (void)sctp_serialize_hmaclist(stcb->asoc.local_hmacs, + new_key->key + keylen); + } + if (stcb->asoc.authinfo.random != NULL) + 
sctp_free_key(stcb->asoc.authinfo.random); + stcb->asoc.authinfo.random = new_key; + stcb->asoc.authinfo.random_len = random_len; +} diff --git a/freebsd/sys/netinet/sctp_auth.h b/freebsd/sys/netinet/sctp_auth.h new file mode 100644 index 00000000..da4dc09b --- /dev/null +++ b/freebsd/sys/netinet/sctp_auth.h @@ -0,0 +1,235 @@ +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __SCTP_AUTH_HH__ +#define __SCTP_AUTH_HH__ + + +/* digest lengths */ +#define SCTP_AUTH_DIGEST_LEN_SHA1 20 +#define SCTP_AUTH_DIGEST_LEN_SHA224 28 +#define SCTP_AUTH_DIGEST_LEN_SHA256 32 +#define SCTP_AUTH_DIGEST_LEN_SHA384 48 +#define SCTP_AUTH_DIGEST_LEN_SHA512 64 +#define SCTP_AUTH_DIGEST_LEN_MAX 64 + +/* random sizes */ +#define SCTP_AUTH_RANDOM_SIZE_DEFAULT 32 +#define SCTP_AUTH_RANDOM_SIZE_REQUIRED 32 +#define SCTP_AUTH_RANDOM_SIZE_MAX 256 + +/* union of all supported HMAC algorithm contexts */ +typedef union sctp_hash_context { + SHA1_CTX sha1; +#ifdef HAVE_SHA2 + SHA256_CTX sha256; + SHA384_CTX sha384; + SHA512_CTX sha512; +#endif +} sctp_hash_context_t; + +typedef struct sctp_key { + uint32_t keylen; + uint8_t key[]; +} sctp_key_t; + +typedef struct sctp_shared_key { + LIST_ENTRY(sctp_shared_key) next; + sctp_key_t *key; /* key text */ + uint32_t refcount; /* reference count */ + uint16_t keyid; /* shared key ID */ + uint8_t deactivated; /* key is deactivated */ +} sctp_sharedkey_t; + +LIST_HEAD(sctp_keyhead, sctp_shared_key); + +/* authentication chunks list */ +typedef struct sctp_auth_chklist { + uint8_t chunks[256]; + uint8_t num_chunks; +} sctp_auth_chklist_t; + +/* hmac algos supported list */ +typedef struct sctp_hmaclist { + uint16_t max_algo; /* max algorithms allocated */ + uint16_t num_algo; /* num algorithms used */ + uint16_t hmac[]; +} sctp_hmaclist_t; + +/* authentication info */ +typedef struct sctp_authinfo { + sctp_key_t *random; /* local random key (concatenated) */ + uint32_t random_len; /* local random number length for param */ + sctp_key_t *peer_random;/* peer's random key (concatenated) */ + sctp_key_t *assoc_key; /* cached concatenated send key */ + sctp_key_t *recv_key; /* cached concatenated recv key */ + uint16_t active_keyid; /* active send keyid */ + uint16_t assoc_keyid; /* current send keyid (cached) */ + uint16_t recv_keyid; /* last recv keyid (cached) */ +} sctp_authinfo_t; + + + +/* + * Macros + */ +#define sctp_auth_is_required_chunk(chunk, list) ((list == NULL) ? 
(0) : (list->chunks[chunk] != 0)) + +/* + * function prototypes + */ + +/* socket option api functions */ +extern sctp_auth_chklist_t *sctp_alloc_chunklist(void); +extern void sctp_free_chunklist(sctp_auth_chklist_t * chklist); +extern void sctp_clear_chunklist(sctp_auth_chklist_t * chklist); +extern sctp_auth_chklist_t *sctp_copy_chunklist(sctp_auth_chklist_t * chklist); +extern int sctp_auth_add_chunk(uint8_t chunk, sctp_auth_chklist_t * list); +extern int sctp_auth_delete_chunk(uint8_t chunk, sctp_auth_chklist_t * list); +extern size_t sctp_auth_get_chklist_size(const sctp_auth_chklist_t * list); +extern void sctp_auth_set_default_chunks(sctp_auth_chklist_t * list); +extern int +sctp_serialize_auth_chunks(const sctp_auth_chklist_t * list, + uint8_t * ptr); +extern int +sctp_pack_auth_chunks(const sctp_auth_chklist_t * list, + uint8_t * ptr); +extern int +sctp_unpack_auth_chunks(const uint8_t * ptr, uint8_t num_chunks, + sctp_auth_chklist_t * list); + +/* key handling */ +extern sctp_key_t *sctp_alloc_key(uint32_t keylen); +extern void sctp_free_key(sctp_key_t * key); +extern void sctp_print_key(sctp_key_t * key, const char *str); +extern void sctp_show_key(sctp_key_t * key, const char *str); +extern sctp_key_t *sctp_generate_random_key(uint32_t keylen); +extern sctp_key_t *sctp_set_key(uint8_t * key, uint32_t keylen); +extern sctp_key_t * +sctp_compute_hashkey(sctp_key_t * key1, sctp_key_t * key2, + sctp_key_t * shared); + +/* shared key handling */ +extern sctp_sharedkey_t *sctp_alloc_sharedkey(void); +extern void sctp_free_sharedkey(sctp_sharedkey_t * skey); +extern sctp_sharedkey_t * +sctp_find_sharedkey(struct sctp_keyhead *shared_keys, + uint16_t key_id); +extern int +sctp_insert_sharedkey(struct sctp_keyhead *shared_keys, + sctp_sharedkey_t * new_skey); +extern int +sctp_copy_skeylist(const struct sctp_keyhead *src, + struct sctp_keyhead *dest); + +/* ref counts on shared keys, by key id */ +extern void sctp_auth_key_acquire(struct sctp_tcb *stcb, uint16_t keyid); +extern void sctp_auth_key_release(struct sctp_tcb *stcb, uint16_t keyid); + + +/* hmac list handling */ +extern sctp_hmaclist_t *sctp_alloc_hmaclist(uint8_t num_hmacs); +extern void sctp_free_hmaclist(sctp_hmaclist_t * list); +extern int sctp_auth_add_hmacid(sctp_hmaclist_t * list, uint16_t hmac_id); +extern sctp_hmaclist_t *sctp_copy_hmaclist(sctp_hmaclist_t * list); +extern sctp_hmaclist_t *sctp_default_supported_hmaclist(void); +extern uint16_t +sctp_negotiate_hmacid(sctp_hmaclist_t * peer, + sctp_hmaclist_t * local); +extern int sctp_serialize_hmaclist(sctp_hmaclist_t * list, uint8_t * ptr); +extern int +sctp_verify_hmac_param(struct sctp_auth_hmac_algo *hmacs, + uint32_t num_hmacs); + +extern sctp_authinfo_t *sctp_alloc_authinfo(void); +extern void sctp_free_authinfo(sctp_authinfo_t * authinfo); + +/* keyed-HMAC functions */ +extern uint32_t sctp_get_auth_chunk_len(uint16_t hmac_algo); +extern uint32_t sctp_get_hmac_digest_len(uint16_t hmac_algo); +extern uint32_t +sctp_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, + uint8_t * text, uint32_t textlen, uint8_t * digest); +extern int +sctp_verify_hmac(uint16_t hmac_algo, uint8_t * key, uint32_t keylen, + uint8_t * text, uint32_t textlen, uint8_t * digest, uint32_t digestlen); +extern uint32_t +sctp_compute_hmac(uint16_t hmac_algo, sctp_key_t * key, + uint8_t * text, uint32_t textlen, uint8_t * digest); +extern int sctp_auth_is_supported_hmac(sctp_hmaclist_t * list, uint16_t id); + +/* mbuf versions */ +extern uint32_t +sctp_hmac_m(uint16_t hmac_algo, 
uint8_t * key, uint32_t keylen, + struct mbuf *m, uint32_t m_offset, uint8_t * digest, uint32_t trailer); +extern uint32_t +sctp_compute_hmac_m(uint16_t hmac_algo, sctp_key_t * key, + struct mbuf *m, uint32_t m_offset, uint8_t * digest); + +/* + * authentication routines + */ +extern void sctp_clear_cachedkeys(struct sctp_tcb *stcb, uint16_t keyid); +extern void sctp_clear_cachedkeys_ep(struct sctp_inpcb *inp, uint16_t keyid); +extern int sctp_delete_sharedkey(struct sctp_tcb *stcb, uint16_t keyid); +extern int sctp_delete_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid); +extern int sctp_auth_setactivekey(struct sctp_tcb *stcb, uint16_t keyid); +extern int sctp_auth_setactivekey_ep(struct sctp_inpcb *inp, uint16_t keyid); +extern int sctp_deact_sharedkey(struct sctp_tcb *stcb, uint16_t keyid); +extern int sctp_deact_sharedkey_ep(struct sctp_inpcb *inp, uint16_t keyid); + +extern void +sctp_auth_get_cookie_params(struct sctp_tcb *stcb, struct mbuf *m, + uint32_t offset, uint32_t length); +extern void +sctp_fill_hmac_digest_m(struct mbuf *m, uint32_t auth_offset, + struct sctp_auth_chunk *auth, struct sctp_tcb *stcb, uint16_t key_id); +extern struct mbuf * +sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, + struct sctp_auth_chunk **auth_ret, uint32_t * offset, + struct sctp_tcb *stcb, uint8_t chunk); +extern int +sctp_handle_auth(struct sctp_tcb *stcb, struct sctp_auth_chunk *ch, + struct mbuf *m, uint32_t offset); +extern void +sctp_notify_authentication(struct sctp_tcb *stcb, + uint32_t indication, uint16_t keyid, uint16_t alt_keyid, int so_locked); +extern int +sctp_validate_init_auth_params(struct mbuf *m, int offset, + int limit); +extern void +sctp_initialize_auth_params(struct sctp_inpcb *inp, + struct sctp_tcb *stcb); + +/* test functions */ +#endif /* __SCTP_AUTH_HH__ */ diff --git a/freebsd/sys/netinet/sctp_bsd_addr.c b/freebsd/sys/netinet/sctp_bsd_addr.c new file mode 100644 index 00000000..8782e681 --- /dev/null +++ b/freebsd/sys/netinet/sctp_bsd_addr.c @@ -0,0 +1,562 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Declare all of our malloc named types */ +MALLOC_DEFINE(SCTP_M_MAP, "sctp_map", "sctp asoc map descriptor"); +MALLOC_DEFINE(SCTP_M_STRMI, "sctp_stri", "sctp stream in array"); +MALLOC_DEFINE(SCTP_M_STRMO, "sctp_stro", "sctp stream out array"); +MALLOC_DEFINE(SCTP_M_ASC_ADDR, "sctp_aadr", "sctp asconf address"); +MALLOC_DEFINE(SCTP_M_ASC_IT, "sctp_a_it", "sctp asconf iterator"); +MALLOC_DEFINE(SCTP_M_AUTH_CL, "sctp_atcl", "sctp auth chunklist"); +MALLOC_DEFINE(SCTP_M_AUTH_KY, "sctp_atky", "sctp auth key"); +MALLOC_DEFINE(SCTP_M_AUTH_HL, "sctp_athm", "sctp auth hmac list"); +MALLOC_DEFINE(SCTP_M_AUTH_IF, "sctp_athi", "sctp auth info"); +MALLOC_DEFINE(SCTP_M_STRESET, "sctp_stre", "sctp stream reset"); +MALLOC_DEFINE(SCTP_M_CMSG, "sctp_cmsg", "sctp CMSG buffer"); +MALLOC_DEFINE(SCTP_M_COPYAL, "sctp_cpal", "sctp copy all"); +MALLOC_DEFINE(SCTP_M_VRF, "sctp_vrf", "sctp vrf struct"); +MALLOC_DEFINE(SCTP_M_IFA, "sctp_ifa", "sctp ifa struct"); +MALLOC_DEFINE(SCTP_M_IFN, "sctp_ifn", "sctp ifn struct"); +MALLOC_DEFINE(SCTP_M_TIMW, "sctp_timw", "sctp time block"); +MALLOC_DEFINE(SCTP_M_MVRF, "sctp_mvrf", "sctp mvrf pcb list"); +MALLOC_DEFINE(SCTP_M_ITER, "sctp_iter", "sctp iterator control"); +MALLOC_DEFINE(SCTP_M_SOCKOPT, "sctp_socko", "sctp socket option"); + +/* Global NON-VNET structure that controls the iterator */ +struct iterator_control sctp_it_ctl; +static int __sctp_thread_based_iterator_started = 0; + + +static void +sctp_cleanup_itqueue(void) +{ + struct sctp_iterator *it; + + while ((it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead)) != NULL) { + if (it->function_atend != NULL) { + (*it->function_atend) (it->pointer, it->val); + } + TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); + SCTP_FREE(it, SCTP_M_ITER); + } +} + + +void +sctp_wakeup_iterator(void) +{ + wakeup(&sctp_it_ctl.iterator_running); +} + +static void +sctp_iterator_thread(void *v) +{ + SCTP_IPI_ITERATOR_WQ_LOCK(); + while (1) { + msleep(&sctp_it_ctl.iterator_running, + &sctp_it_ctl.ipi_iterator_wq_mtx, + 0, "waiting_for_work", 0); + if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_MUST_EXIT) { + SCTP_IPI_ITERATOR_WQ_DESTROY(); + SCTP_ITERATOR_LOCK_DESTROY(); + sctp_cleanup_itqueue(); + __sctp_thread_based_iterator_started = 0; + kthread_exit(); + } + sctp_iterator_worker(); + } +} + +void +sctp_startup_iterator(void) +{ + if (__sctp_thread_based_iterator_started) { + /* You only get one */ + return; + } + /* init the iterator head */ + __sctp_thread_based_iterator_started = 1; + sctp_it_ctl.iterator_running = 0; + sctp_it_ctl.iterator_flags = 0; + sctp_it_ctl.cur_it = NULL; + SCTP_ITERATOR_LOCK_INIT(); + SCTP_IPI_ITERATOR_WQ_INIT(); + TAILQ_INIT(&sctp_it_ctl.iteratorhead); + + int ret; + + ret = kproc_create(sctp_iterator_thread, + 
(void *)NULL, + &sctp_it_ctl.thread_proc, + RFPROC, + SCTP_KTHREAD_PAGES, + SCTP_KTRHEAD_NAME); +} + +#ifdef INET6 + +void +sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa) +{ + struct in6_ifaddr *ifa6; + + ifa6 = (struct in6_ifaddr *)ifa->ifa; + ifa->flags = ifa6->ia6_flags; + if (!MODULE_GLOBAL(ip6_use_deprecated)) { + if (ifa->flags & + IN6_IFF_DEPRECATED) { + ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; + } else { + ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; + } + } else { + ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; + } + if (ifa->flags & + (IN6_IFF_DETACHED | + IN6_IFF_ANYCAST | + IN6_IFF_NOTREADY)) { + ifa->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; + } else { + ifa->localifa_flags &= ~SCTP_ADDR_IFA_UNUSEABLE; + } +} + +#endif /* INET6 */ + + +static uint32_t +sctp_is_desired_interface_type(struct ifaddr *ifa) +{ + int result; + + /* check the interface type to see if it's one we care about */ + switch (ifa->ifa_ifp->if_type) { + case IFT_ETHER: + case IFT_ISO88023: + case IFT_ISO88024: + case IFT_ISO88025: + case IFT_ISO88026: + case IFT_STARLAN: + case IFT_P10: + case IFT_P80: + case IFT_HY: + case IFT_FDDI: + case IFT_XETHER: + case IFT_ISDNBASIC: + case IFT_ISDNPRIMARY: + case IFT_PTPSERIAL: + case IFT_OTHER: + case IFT_PPP: + case IFT_LOOP: + case IFT_SLIP: + case IFT_GIF: + case IFT_L2VLAN: + case IFT_IP: + case IFT_IPOVERCDLC: + case IFT_IPOVERCLAW: + case IFT_VIRTUALIPADDRESS: + result = 1; + break; + default: + result = 0; + } + + return (result); +} + + + + +static void +sctp_init_ifns_for_vrf(int vrfid) +{ + /* + * Here we must apply ANY locks needed by the IFN we access and also + * make sure we lock any IFA that exists as we float through the + * list of IFAs + */ + struct ifnet *ifn; + struct ifaddr *ifa; + struct in6_ifaddr *ifa6; + struct sctp_ifa *sctp_ifa; + uint32_t ifa_flags; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) { + IF_ADDR_LOCK(ifn); + TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) { + if (ifa->ifa_addr == NULL) { + continue; + } + if ((ifa->ifa_addr->sa_family != AF_INET) && (ifa->ifa_addr->sa_family != AF_INET6)) { + /* non inet/inet6 skip */ + continue; + } + if (ifa->ifa_addr->sa_family == AF_INET6) { + if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { + /* skip unspecified addresses */ + continue; + } + } else { + if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { + continue; + } + } + if (sctp_is_desired_interface_type(ifa) == 0) { + /* not a desired type */ + continue; + } + if (ifa->ifa_addr->sa_family == AF_INET6) { + ifa6 = (struct in6_ifaddr *)ifa; + ifa_flags = ifa6->ia6_flags; + } else { + ifa_flags = 0; + } + sctp_ifa = sctp_add_addr_to_vrf(vrfid, + (void *)ifn, + ifn->if_index, + ifn->if_type, + ifn->if_xname, + (void *)ifa, + ifa->ifa_addr, + ifa_flags, + 0); + if (sctp_ifa) { + sctp_ifa->localifa_flags &= ~SCTP_ADDR_DEFER_USE; + } + } + IF_ADDR_UNLOCK(ifn); + } + IFNET_RUNLOCK(); +} + +void +sctp_init_vrf_list(int vrfid) +{ + if (vrfid > SCTP_MAX_VRF_ID) + /* can't do that */ + return; + + /* Don't care about return here */ + (void)sctp_allocate_vrf(vrfid); + + /* + * Now we need to build all the ifn's for this vrf and their + * addresses + */ + sctp_init_ifns_for_vrf(vrfid); +} + +void +sctp_addr_change(struct ifaddr *ifa, int cmd) +{ + uint32_t ifa_flags = 0; + + /* + * BSD only has one VRF; if this changes we will need to hook in the + * right things here to get the id to pass to the address management + * routine. 
*/ + if (SCTP_BASE_VAR(first_time) == 0) { + /* Special test to see if my ::1 will show up with this */ + SCTP_BASE_VAR(first_time) = 1; + sctp_init_ifns_for_vrf(SCTP_DEFAULT_VRFID); + } + if ((cmd != RTM_ADD) && (cmd != RTM_DELETE)) { + /* don't know what to do with this */ + return; + } + if (ifa->ifa_addr == NULL) { + return; + } + if ((ifa->ifa_addr->sa_family != AF_INET) && (ifa->ifa_addr->sa_family != AF_INET6)) { + /* non inet/inet6 skip */ + return; + } + if (ifa->ifa_addr->sa_family == AF_INET6) { + ifa_flags = ((struct in6_ifaddr *)ifa)->ia6_flags; + if (IN6_IS_ADDR_UNSPECIFIED(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr)) { + /* skip unspecified addresses */ + return; + } + } else { + if (((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr == 0) { + return; + } + } + + if (sctp_is_desired_interface_type(ifa) == 0) { + /* not a desired type */ + return; + } + if (cmd == RTM_ADD) { + (void)sctp_add_addr_to_vrf(SCTP_DEFAULT_VRFID, (void *)ifa->ifa_ifp, + ifa->ifa_ifp->if_index, ifa->ifa_ifp->if_type, + ifa->ifa_ifp->if_xname, + (void *)ifa, ifa->ifa_addr, ifa_flags, 1); + } else { + + sctp_del_addr_from_vrf(SCTP_DEFAULT_VRFID, ifa->ifa_addr, + ifa->ifa_ifp->if_index, + ifa->ifa_ifp->if_xname + ); + /* + * We don't bump the refcount here, so the final delete will + * happen when it completes. + */ + } +} + +void + sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add){ + struct ifnet *ifn; + struct ifaddr *ifa; + + IFNET_RLOCK(); + TAILQ_FOREACH(ifn, &MODULE_GLOBAL(ifnet), if_list) { + if (!(*pred) (ifn)) { + continue; + } + TAILQ_FOREACH(ifa, &ifn->if_addrlist, ifa_list) { + sctp_addr_change(ifa, add ? RTM_ADD : RTM_DELETE); + } + } + IFNET_RUNLOCK(); +} + +struct mbuf * +sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, + int how, int allonebuf, int type) +{ + struct mbuf *m = NULL; + + m = m_getm2(NULL, space_needed, how, type, want_header ? M_PKTHDR : 0); + if (m == NULL) { + /* bad, no memory */ + return (m); + } + if (allonebuf) { + int siz; + + if (SCTP_BUF_IS_EXTENDED(m)) { + siz = SCTP_BUF_EXTEND_SIZE(m); + } else { + if (want_header) + siz = MHLEN; + else + siz = MLEN; + } + if (siz < space_needed) { + m_freem(m); + return (NULL); + } + } + if (SCTP_BUF_NEXT(m)) { + sctp_m_freem(SCTP_BUF_NEXT(m)); + SCTP_BUF_NEXT(m) = NULL; + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + if (SCTP_BUF_IS_EXTENDED(m)) { + sctp_log_mb(m, SCTP_MBUF_IALLOC); + } + } +#endif + return (m); +} + + +#ifdef SCTP_PACKET_LOGGING +void +sctp_packet_log(struct mbuf *m, int length) +{ + int *lenat, thisone; + void *copyto; + uint32_t *tick_tock; + int total_len; + int grabbed_lock = 0; + int value, newval, thisend, thisbegin; + + /* + * Buffer layout. -sizeof this entry (total_len) -previous end + * (value) -ticks of log (ticks) o -ip packet o -as logged - + * where this started (thisbegin) x <--end points here + */ + total_len = SCTP_SIZE32((length + (4 * sizeof(int)))); + /* Log a packet to the buffer. 
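Restating the buffer-layout note above as an informal record sketch: each entry occupies SCTP_SIZE32(length + 4 * sizeof(int)) bytes and is written below as { int total_len; int prev_end; uint32_t ticks; packet bytes ...; int thisbegin; }, with thisbegin stored in the entry's last int so a reader can find where the record started.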
*/ + if (total_len > SCTP_PACKET_LOG_SIZE) { + /* Can't log this packet I have not a buffer big enough */ + return; + } + if (length < (int)(SCTP_MIN_V4_OVERHEAD + sizeof(struct sctp_cookie_ack_chunk))) { + return; + } + atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), 1); +try_again: + if (SCTP_BASE_VAR(packet_log_writers) > SCTP_PKTLOG_WRITERS_NEED_LOCK) { + SCTP_IP_PKTLOG_LOCK(); + grabbed_lock = 1; +again_locked: + value = SCTP_BASE_VAR(packet_log_end); + newval = SCTP_BASE_VAR(packet_log_end) + total_len; + if (newval >= SCTP_PACKET_LOG_SIZE) { + /* we wrapped */ + thisbegin = 0; + thisend = total_len; + } else { + thisbegin = SCTP_BASE_VAR(packet_log_end); + thisend = newval; + } + if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { + goto again_locked; + } + } else { + value = SCTP_BASE_VAR(packet_log_end); + newval = SCTP_BASE_VAR(packet_log_end) + total_len; + if (newval >= SCTP_PACKET_LOG_SIZE) { + /* we wrapped */ + thisbegin = 0; + thisend = total_len; + } else { + thisbegin = SCTP_BASE_VAR(packet_log_end); + thisend = newval; + } + if (!(atomic_cmpset_int(&SCTP_BASE_VAR(packet_log_end), value, thisend))) { + goto try_again; + } + } + /* Sanity check */ + if (thisend >= SCTP_PACKET_LOG_SIZE) { + printf("Insanity stops a log thisbegin:%d thisend:%d writers:%d lock:%d end:%d\n", + thisbegin, + thisend, + SCTP_BASE_VAR(packet_log_writers), + grabbed_lock, + SCTP_BASE_VAR(packet_log_end)); + SCTP_BASE_VAR(packet_log_end) = 0; + goto no_log; + + } + lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisbegin]; + *lenat = total_len; + lenat++; + *lenat = value; + lenat++; + tick_tock = (uint32_t *) lenat; + lenat++; + *tick_tock = sctp_get_tick_count(); + copyto = (void *)lenat; + thisone = thisend - sizeof(int); + lenat = (int *)&SCTP_BASE_VAR(packet_log_buffer)[thisone]; + *lenat = thisbegin; + if (grabbed_lock) { + SCTP_IP_PKTLOG_UNLOCK(); + grabbed_lock = 0; + } + m_copydata(m, 0, length, (caddr_t)copyto); +no_log: + if (grabbed_lock) { + SCTP_IP_PKTLOG_UNLOCK(); + } + atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), 1); +} + + +int +sctp_copy_out_packet_log(uint8_t * target, int length) +{ + /* + * We wind through the packet log starting at start copying up to + * length bytes out. We return the number of bytes copied. + */ + int tocopy, this_copy; + int *lenat; + int did_delay = 0; + + tocopy = length; + if (length < (int)(2 * sizeof(int))) { + /* not enough room */ + return (0); + } + if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { + atomic_add_int(&SCTP_BASE_VAR(packet_log_writers), SCTP_PKTLOG_WRITERS_NEED_LOCK); +again: + if ((did_delay == 0) && (SCTP_BASE_VAR(packet_log_writers) != SCTP_PKTLOG_WRITERS_NEED_LOCK)) { + /* + * we delay here for just a moment hoping the + * writer(s) that were present when we entered will + * have left and we only have locking ones that will + * contend with us for the lock. This does not + * assure 100% access, but its good enough for a + * logging facility like this. 
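Informally, the protocol is: each writer bumps packet_log_writers around + * its store; the reader here adds SCTP_PKTLOG_WRITERS_NEED_LOCK so that + * new writers go through SCTP_IP_PKTLOG_LOCK(), then waits once with a + * short DELAY() to let the remaining lock-free writers drain.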
+ */ + did_delay = 1; + DELAY(10); + goto again; + } + } + SCTP_IP_PKTLOG_LOCK(); + lenat = (int *)target; + *lenat = SCTP_BASE_VAR(packet_log_end); + lenat++; + this_copy = min((length - sizeof(int)), SCTP_PACKET_LOG_SIZE); + memcpy((void *)lenat, (void *)SCTP_BASE_VAR(packet_log_buffer), this_copy); + if (SCTP_PKTLOG_WRITERS_NEED_LOCK) { + atomic_subtract_int(&SCTP_BASE_VAR(packet_log_writers), + SCTP_PKTLOG_WRITERS_NEED_LOCK); + } + SCTP_IP_PKTLOG_UNLOCK(); + return (this_copy + sizeof(int)); +} + +#endif diff --git a/freebsd/sys/netinet/sctp_bsd_addr.h b/freebsd/sys/netinet/sctp_bsd_addr.h new file mode 100644 index 00000000..67d65dc6 --- /dev/null +++ b/freebsd/sys/netinet/sctp_bsd_addr.h @@ -0,0 +1,63 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_bsd_addr_h__ +#define __sctp_bsd_addr_h__ +#include + +#if defined(_KERNEL) || defined(__Userspace__) + +extern struct iterator_control sctp_it_ctl; +void sctp_wakeup_iterator(void); + +void sctp_startup_iterator(void); + + +#ifdef INET6 +void sctp_gather_internal_ifa_flags(struct sctp_ifa *ifa); + +#endif + +#ifdef SCTP_PACKET_LOGGING + +void sctp_packet_log(struct mbuf *m, int length); +int sctp_copy_out_packet_log(uint8_t * target, int length); + +#endif + +void sctp_addr_change(struct ifaddr *ifa, int cmd); + +void sctp_add_or_del_interfaces(int (*pred) (struct ifnet *), int add); + +#endif +#endif diff --git a/freebsd/sys/netinet/sctp_cc_functions.c b/freebsd/sys/netinet/sctp_cc_functions.c new file mode 100644 index 00000000..668fd673 --- /dev/null +++ b/freebsd/sys/netinet/sctp_cc_functions.c @@ -0,0 +1,1565 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +__FBSDID("$FreeBSD$"); + +void +sctp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sctp_association *assoc; + uint32_t cwnd_in_mtu; + + assoc = &stcb->asoc; + /* + * We take the minimum of the burst limit and the initial congestion + * window. The initial congestion window is at least two times the + * MTU. + */ + cwnd_in_mtu = SCTP_BASE_SYSCTL(sctp_initial_cwnd); + if ((assoc->max_burst > 0) && (cwnd_in_mtu > assoc->max_burst)) + cwnd_in_mtu = assoc->max_burst; + net->cwnd = (net->mtu - sizeof(struct sctphdr)) * cwnd_in_mtu; + net->ssthresh = assoc->peers_rwnd; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & + (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { + sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION); + } +} + +void +sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + struct sctp_nets *net; + + /*- + * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) && + * (net->fast_retran_loss_recovery == 0))) + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if ((asoc->fast_retran_loss_recovery == 0) || + (asoc->sctp_cmt_on_off == 1)) { + /* out of a RFC2582 Fast recovery window? */ + if (net->net_ack > 0) { + /* + * per section 7.2.3, are there any + * destinations that had a fast retransmit + * to them. If so what we need to do is + * adjust ssthresh and cwnd. 
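In formula form, using the RFC 2960 section 7.2.3 values + * that the code below applies: ssthresh = max(cwnd / 2, + * 2 * MTU); cwnd = ssthresh; partial_bytes_acked = 0.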
*/ + struct sctp_tmit_chunk *lchk; + int old_cwnd = net->cwnd; + + net->ssthresh = net->cwnd / 2; + if (net->ssthresh < (net->mtu * 2)) { + net->ssthresh = 2 * net->mtu; + } + net->cwnd = net->ssthresh; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), + SCTP_CWND_LOG_FROM_FR); + } + lchk = TAILQ_FIRST(&asoc->send_queue); + + net->partial_bytes_acked = 0; + /* Turn on fast recovery window */ + asoc->fast_retran_loss_recovery = 1; + if (lchk == NULL) { + /* Mark end of the window */ + asoc->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * CMT fast recovery -- per destination + * recovery variable. + */ + net->fast_retran_loss_recovery = 1; + + if (lchk == NULL) { + /* Mark end of the window */ + net->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * Disable Nonce Sum Checking and store the + * resync tsn + */ + asoc->nonce_sum_check = 0; + asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1; + + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net); + } + } else if (net->net_ack > 0) { + /* + * Mark a peg that we WOULD have done a cwnd + * reduction but RFC2582 prevented this action. + */ + SCTP_STAT_INCR(sctps_fastretransinrtt); + } + } +} + +void +sctp_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit) +{ + struct sctp_nets *net; + + /******************************/ + /* update cwnd and Early FR */ + /******************************/ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code. Need to debug. + */ + if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { + if (compare_with_wrap(asoc->last_acked_seq, + net->fast_recovery_tsn, MAX_TSN) || + (asoc->last_acked_seq == net->fast_recovery_tsn) || + compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) || + (net->pseudo_cumack == net->fast_recovery_tsn)) { + net->will_exit_fast_recovery = 1; + } + } +#endif + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + /* + * So, first of all, do we need to have an Early FR + * timer running? + */ + if ((!TAILQ_EMPTY(&asoc->sent_queue) && + (net->ref_count > 1) && + (net->flight_size < net->cwnd)) || + (reneged_all)) { + /* + * yes, so in this case stop it if it's + * running, and then restart it. Reneging + * all is a special case where we want to + * run the Early FR timer and then force the + * last few unacked to be sent, causing us + * to elicit a sack with gaps to force out + * the others. 
+ */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck2); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_20); + } + SCTP_STAT_INCR(sctps_earlyfrstrid); + sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net); + } else { + /* No, stop it if its running */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck3); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_21); + } + } + } + /* if nothing was acked on this destination skip it */ + if (net->net_ack == 0) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); + } + continue; + } + if (net->net_ack2 > 0) { + /* + * Karn's rule applies to clearing error count, this + * is optional. + */ + net->error_count = 0; + if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) == + SCTP_ADDR_NOT_REACHABLE) { + /* addr came good */ + net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE; + net->dest_state |= SCTP_ADDR_REACHABLE; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, + SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED); + /* now was it the primary? if so restore */ + if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) { + (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net); + } + } + /* + * JRS 5/14/07 - If CMT PF is on and the destination + * is in PF state, set the destination to active + * state and set the cwnd to one or two MTU's based + * on whether PF1 or PF2 is being used. + * + * Should we stop any running T3 timer here? + */ + if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + net->dest_state &= ~SCTP_ADDR_PF; + net->cwnd = net->mtu * asoc->sctp_cmt_pf; + SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n", + net, net->cwnd); + /* + * Since the cwnd value is explicitly set, + * skip the code that updates the cwnd + * value. + */ + goto skip_cwnd_update; + } + } +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code + */ + /* + * if (sctp_cmt_on_off == 1 && + * net->fast_retran_loss_recovery && + * net->will_exit_fast_recovery == 0) { @@@ Do something } + * else if (sctp_cmt_on_off == 0 && + * asoc->fast_retran_loss_recovery && will_exit == 0) { + */ +#endif + + if (asoc->fast_retran_loss_recovery && + (will_exit == 0) && + (asoc->sctp_cmt_on_off == 0)) { + /* + * If we are in loss recovery we skip any cwnd + * update + */ + goto skip_cwnd_update; + } + /* + * CMT: CUC algorithm. Update cwnd if pseudo-cumack has + * moved. 
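The update below follows the two usual regimes: in slow start + * (cwnd <= ssthresh) cwnd grows by min(net_ack, L * MTU) per SACK while + * the window is being fully used, where L is the sysctl + * sctp_L2_abc_variable; in congestion avoidance net_ack accumulates into + * partial_bytes_acked and cwnd grows by one MTU once partial_bytes_acked + * reaches cwnd.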
*/ + if (accum_moved || + ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) { + /* If the cumulative ack moved we can proceed */ + if (net->cwnd <= net->ssthresh) { + /* We are in slow start */ + if (net->flight_size + net->net_ack >= net->cwnd) { + if (net->net_ack > (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable))) { + net->cwnd += (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable)); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_FROM_SS); + } + } else { + net->cwnd += net->net_ack; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_FROM_SS); + } + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_SS); + } + } + } else { + /* We are in congestion avoidance */ + /* + * Add to pba + */ + net->partial_bytes_acked += net->net_ack; + + if ((net->flight_size + net->net_ack >= net->cwnd) && + (net->partial_bytes_acked >= net->cwnd)) { + net->partial_bytes_acked -= net->cwnd; + net->cwnd += net->mtu; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_FROM_CA); + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_CA); + } + } + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_NO_CUMACK); + } + } +skip_cwnd_update: + /* + * Now, according to Karn's rule, do we need to restore the + * RTO timer? Check our net_ack2. If it is not set then we + * have an ambiguity, i.e. all data acked was sent to more + * than one place. 
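For reference, the restore below recomputes RTO as + * ((lastsa >> 2) + lastsv) >> 1, i.e. from the smoothed RTT + * state rather than keeping any doubled back-off value, and + * then clamps the result to the association's [minrto, maxrto] + * range.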
*/ + if (net->net_ack2) { + /* restore any doubled timers */ + net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1; + if (net->RTO < stcb->asoc.minrto) { + net->RTO = stcb->asoc.minrto; + } + if (net->RTO > stcb->asoc.maxrto) { + net->RTO = stcb->asoc.maxrto; + } + } + } +} + +void +sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int old_cwnd = net->cwnd; + + net->ssthresh = max(net->cwnd / 2, 4 * net->mtu); + net->cwnd = net->mtu; + net->partial_bytes_acked = 0; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX); + } +} + +void +sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int old_cwnd = net->cwnd; + + SCTP_STAT_INCR(sctps_ecnereducedcwnd); + net->ssthresh = net->cwnd / 2; + if (net->ssthresh < net->mtu) { + net->ssthresh = net->mtu; + /* here back off the timer as well, to slow us down */ + net->RTO <<= 1; + } + net->cwnd = net->ssthresh; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); + } +} + +void +sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, + struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, + uint32_t * bottle_bw, uint32_t * on_queue) +{ + uint32_t bw_avail; + int rtt, incr; + int old_cwnd = net->cwnd; + + /* need real RTT for this calc */ + rtt = ((net->lastsa >> 2) + net->lastsv) >> 1; + /* get bottle neck bw */ + *bottle_bw = ntohl(cp->bottle_bw); + /* and what's on queue */ + *on_queue = ntohl(cp->current_onq); + /* + * adjust the on-queue value if our flight is larger; it could be + * that the router has not yet gotten data "in-flight" to it + */ + if (*on_queue < net->flight_size) + *on_queue = net->flight_size; + /* calculate the available space */ + bw_avail = (*bottle_bw * rtt) / 1000; + if (bw_avail > *bottle_bw) { + /* + * Cap the growth to no more than the bottle neck. This can + * happen as RTT slides up due to queues. It also means if + * you have more than a 1 second RTT with an empty queue you + * will be limited to the bottle_bw per second no matter if + * other points have 1/2 the RTT and you could get more + * out... + */ + bw_avail = *bottle_bw; + } + if (*on_queue > bw_avail) { + /* + * No room for anything else; don't allow anything else to be + * "added to the fire". + */ + int seg_inflight, seg_onqueue, my_portion; + + net->partial_bytes_acked = 0; + + /* how much are we over queue size? */ + incr = *on_queue - bw_avail; + if (stcb->asoc.seen_a_sack_this_pkt) { + /* + * undo any cwnd adjustment that the sack might have + * made + */ + net->cwnd = net->prev_cwnd; + } + /* Now how much of that is mine? */ + seg_inflight = net->flight_size / net->mtu; + seg_onqueue = *on_queue / net->mtu; + my_portion = (incr * seg_inflight) / seg_onqueue; + + /* Have I made an adjustment already? */ + if (net->cwnd > net->flight_size) { + /* + * for this flight I made an adjustment; we need to + * decrease the portion by a share of our previous + * adjustment. + */ + int diff_adj; + + diff_adj = net->cwnd - net->flight_size; + if (diff_adj > my_portion) + my_portion = 0; + else + my_portion -= diff_adj; + } + /* + * back down to the previous cwnd (assume we have had a sack + * before this packet), minus whatever portion of the + * overage is my fault. 
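The share is proportional: with incr = on_queue - bw_avail and + * window quantities counted in segments, my_portion = + * incr * (flight_size / mtu) / (on_queue / mtu), then reduced by any + * adjustment (cwnd - flight_size) that a previous pass already made.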
+ */ + net->cwnd -= my_portion; + + /* we will NOT back down more than 1 MTU */ + if (net->cwnd <= net->mtu) { + net->cwnd = net->mtu; + } + /* force into CA */ + net->ssthresh = net->cwnd - 1; + } else { + /* + * Take 1/4 of the space left or max burst up .. whichever + * is less. + */ + incr = min((bw_avail - *on_queue) >> 2, + stcb->asoc.max_burst * net->mtu); + net->cwnd += incr; + } + if (net->cwnd > bw_avail) { + /* We can't exceed the pipe size */ + net->cwnd = bw_avail; + } + if (net->cwnd < net->mtu) { + /* We always have 1 MTU */ + net->cwnd = net->mtu; + } + if (net->cwnd - old_cwnd != 0) { + /* log only changes */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), + SCTP_CWND_LOG_FROM_SAT); + } + } +} + +void +sctp_cwnd_update_after_output(struct sctp_tcb *stcb, + struct sctp_nets *net, int burst_limit) +{ + int old_cwnd = net->cwnd; + + if (net->ssthresh < net->cwnd) + net->ssthresh = net->cwnd; + net->cwnd = (net->flight_size + (burst_limit * net->mtu)); + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_BRST); + } +} + +void +sctp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int old_cwnd = net->cwnd; + + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED); + /* + * make a small adjustment to cwnd and force to CA. + */ + if (net->cwnd > net->mtu) + /* drop down one MTU after sending */ + net->cwnd -= net->mtu; + if (net->cwnd < net->ssthresh) + /* still in SS move to CA */ + net->ssthresh = net->cwnd - 1; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (old_cwnd - net->cwnd), SCTP_CWND_LOG_FROM_FR); + } +} + +struct sctp_hs_raise_drop { + int32_t cwnd; + int32_t increase; + int32_t drop_percent; +}; + +#define SCTP_HS_TABLE_SIZE 73 + +struct sctp_hs_raise_drop sctp_cwnd_adjust[SCTP_HS_TABLE_SIZE] = { + {38, 1, 50}, /* 0 */ + {118, 2, 44}, /* 1 */ + {221, 3, 41}, /* 2 */ + {347, 4, 38}, /* 3 */ + {495, 5, 37}, /* 4 */ + {663, 6, 35}, /* 5 */ + {851, 7, 34}, /* 6 */ + {1058, 8, 33}, /* 7 */ + {1284, 9, 32}, /* 8 */ + {1529, 10, 31}, /* 9 */ + {1793, 11, 30}, /* 10 */ + {2076, 12, 29}, /* 11 */ + {2378, 13, 28}, /* 12 */ + {2699, 14, 28}, /* 13 */ + {3039, 15, 27}, /* 14 */ + {3399, 16, 27}, /* 15 */ + {3778, 17, 26}, /* 16 */ + {4177, 18, 26}, /* 17 */ + {4596, 19, 25}, /* 18 */ + {5036, 20, 25}, /* 19 */ + {5497, 21, 24}, /* 20 */ + {5979, 22, 24}, /* 21 */ + {6483, 23, 23}, /* 22 */ + {7009, 24, 23}, /* 23 */ + {7558, 25, 22}, /* 24 */ + {8130, 26, 22}, /* 25 */ + {8726, 27, 22}, /* 26 */ + {9346, 28, 21}, /* 27 */ + {9991, 29, 21}, /* 28 */ + {10661, 30, 21}, /* 29 */ + {11358, 31, 20}, /* 30 */ + {12082, 32, 20}, /* 31 */ + {12834, 33, 20}, /* 32 */ + {13614, 34, 19}, /* 33 */ + {14424, 35, 19}, /* 34 */ + {15265, 36, 19}, /* 35 */ + {16137, 37, 19}, /* 36 */ + {17042, 38, 18}, /* 37 */ + {17981, 39, 18}, /* 38 */ + {18955, 40, 18}, /* 39 */ + {19965, 41, 17}, /* 40 */ + {21013, 42, 17}, /* 41 */ + {22101, 43, 17}, /* 42 */ + {23230, 44, 17}, /* 43 */ + {24402, 45, 16}, /* 44 */ + {25618, 46, 16}, /* 45 */ + {26881, 47, 16}, /* 46 */ + {28193, 48, 16}, /* 47 */ + {29557, 49, 15}, /* 48 */ + {30975, 50, 15}, /* 49 */ + {32450, 51, 15}, /* 50 */ + {33986, 52, 15}, /* 51 */ + {35586, 53, 14}, /* 52 */ + {37253, 54, 14}, /* 53 */ + {38992, 55, 14}, /* 54 */ + {40808, 56, 14}, 
/* 55 */ + {42707, 57, 13}, /* 56 */ + {44694, 58, 13}, /* 57 */ + {46776, 59, 13}, /* 58 */ + {48961, 60, 13}, /* 59 */ + {51258, 61, 13}, /* 60 */ + {53677, 62, 12}, /* 61 */ + {56230, 63, 12}, /* 62 */ + {58932, 64, 12}, /* 63 */ + {61799, 65, 12}, /* 64 */ + {64851, 66, 11}, /* 65 */ + {68113, 67, 11}, /* 66 */ + {71617, 68, 11}, /* 67 */ + {75401, 69, 10}, /* 68 */ + {79517, 70, 10}, /* 69 */ + {84035, 71, 10}, /* 70 */ + {89053, 72, 10}, /* 71 */ + {94717, 73, 9} /* 72 */ +}; + +static void +sctp_hs_cwnd_increase(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int cur_val, i, indx, incr; + + cur_val = net->cwnd >> 10; + indx = SCTP_HS_TABLE_SIZE - 1; +#ifdef SCTP_DEBUG + printf("HS CC Called.\n"); +#endif + if (cur_val < sctp_cwnd_adjust[0].cwnd) { + /* normal mode */ + if (net->net_ack > net->mtu) { + net->cwnd += net->mtu; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, SCTP_CWND_LOG_FROM_SS); + } + } else { + net->cwnd += net->net_ack; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, SCTP_CWND_LOG_FROM_SS); + } + } + } else { + for (i = net->last_hs_used; i < SCTP_HS_TABLE_SIZE; i++) { + if (cur_val < sctp_cwnd_adjust[i].cwnd) { + indx = i; + break; + } + } + net->last_hs_used = indx; + incr = ((sctp_cwnd_adjust[indx].increase) << 10); + net->cwnd += incr; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, incr, SCTP_CWND_LOG_FROM_SS); + } + } +} + +static void +sctp_hs_cwnd_decrease(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int cur_val, i, indx; + int old_cwnd = net->cwnd; + + cur_val = net->cwnd >> 10; + if (cur_val < sctp_cwnd_adjust[0].cwnd) { + /* normal mode */ + net->ssthresh = net->cwnd / 2; + if (net->ssthresh < (net->mtu * 2)) { + net->ssthresh = 2 * net->mtu; + } + net->cwnd = net->ssthresh; + } else { + /* drop by the proper amount */ + net->ssthresh = net->cwnd - (int)((net->cwnd / 100) * + sctp_cwnd_adjust[net->last_hs_used].drop_percent); + net->cwnd = net->ssthresh; + /* now where are we? */ + indx = net->last_hs_used; + cur_val = net->cwnd >> 10; + /* reset where we are in the table */ + if (cur_val < sctp_cwnd_adjust[0].cwnd) { + /* fell out of hs */ + net->last_hs_used = 0; + } else { + for (i = indx; i >= 1; i--) { + if (cur_val > sctp_cwnd_adjust[i - 1].cwnd) { + break; + } + } + net->last_hs_used = indx; + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_FR); + } +} + +void +sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + struct sctp_nets *net; + + /* + * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) && + * (net->fast_retran_loss_recovery == 0))) + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if ((asoc->fast_retran_loss_recovery == 0) || + (asoc->sctp_cmt_on_off == 1)) { + /* out of an RFC2582 Fast recovery window? */ + if (net->net_ack > 0) { + /* + * per section 7.2.3, are there any + * destinations that had a fast retransmit + * to them? If so, what we need to do is + * adjust ssthresh and cwnd. + */
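Read in isolation, the HighSpeed increase above is a table walk keyed on cwnd in kilobyte units (cwnd >> 10), with the increase column converted back to bytes the same way, resuming from the last row used so the walk is cheap. A sketch under stand-in names (hs_row, hs_increase_bytes) for the sctp_cwnd_adjust[] machinery; not a drop-in replacement:

#include <stdint.h>

struct hs_row { int32_t cwnd_kb, increase_kb, drop_percent; };

/* Find the first row whose cwnd threshold (in kilobytes) exceeds the
 * current cwnd and grow by that row's increase, converted to bytes.
 * table/n stand in for sctp_cwnd_adjust[] and SCTP_HS_TABLE_SIZE. */
static uint32_t
hs_increase_bytes(const struct hs_row *table, int n, uint32_t cwnd,
    int *last_used)
{
    int i, indx = n - 1;

    for (i = *last_used; i < n; i++) {
        if ((int32_t)(cwnd >> 10) < table[i].cwnd_kb) {
            indx = i;
            break;
        }
    }
    *last_used = indx;
    return (uint32_t)table[indx].increase_kb << 10;
}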
+ struct sctp_tmit_chunk *lchk; + + sctp_hs_cwnd_decrease(stcb, net); + + lchk = TAILQ_FIRST(&asoc->send_queue); + + net->partial_bytes_acked = 0; + /* Turn on fast recovery window */ + asoc->fast_retran_loss_recovery = 1; + if (lchk == NULL) { + /* Mark end of the window */ + asoc->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * CMT fast recovery -- per destination + * recovery variable. + */ + net->fast_retran_loss_recovery = 1; + + if (lchk == NULL) { + /* Mark end of the window */ + net->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * Disable Nonce Sum Checking and store the + * resync tsn + */ + asoc->nonce_sum_check = 0; + asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1; + + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net); + } + } else if (net->net_ack > 0) { + /* + * Mark a peg that we WOULD have done a cwnd + * reduction but RFC2582 prevented this action. + */ + SCTP_STAT_INCR(sctps_fastretransinrtt); + } + } +} + +void +sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit) +{ + struct sctp_nets *net; + + /******************************/ + /* update cwnd and Early FR */ + /******************************/ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code. Need to debug. + */ + if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { + if (compare_with_wrap(asoc->last_acked_seq, + net->fast_recovery_tsn, MAX_TSN) || + (asoc->last_acked_seq == net->fast_recovery_tsn) || + compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) || + (net->pseudo_cumack == net->fast_recovery_tsn)) { + net->will_exit_fast_recovery = 1; + } + } +#endif + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + /* + * So, first of all do we need to have an Early FR + * timer running? + */ + if ((!TAILQ_EMPTY(&asoc->sent_queue) && + (net->ref_count > 1) && + (net->flight_size < net->cwnd)) || + (reneged_all)) { + /* + * yes, so in this case stop it if it's + * running, and then restart it. Reneging + * all is a special case where we want to + * run the Early FR timer and then force the + * last few unacked to be sent, causing us + * to elicit a sack with gaps to force out + * the others. + */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck2); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_20); + } + SCTP_STAT_INCR(sctps_earlyfrstrid); + sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net); + } else { + /* No, stop it if it's running */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck3); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_21); + } + } + } + /* if nothing was acked on this destination skip it */ + if (net->net_ack == 0) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); + } + continue; + } + if (net->net_ack2 > 0) { + /* + * Karn's rule applies to clearing the error count; + * this is optional.
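The same net_ack2 test later restores the retransmission timer: once some ack unambiguously covers data sent to a single destination, the exponential back-off is undone by recomputing the RTO from the smoothed estimates and clamping it to [minrto, maxrto]. A standalone sketch (hypothetical helper name; the lastsa/lastsv expression mirrors the one used throughout this file):

#include <stdint.h>

/* Restore the RTO from the smoothed estimates once an unambiguous
 * ack arrives (net_ack2 != 0), undoing any doubling from back-off. */
static uint32_t
rto_restore(uint32_t lastsa, uint32_t lastsv, uint32_t minrto,
    uint32_t maxrto)
{
    uint32_t rto = ((lastsa >> 2) + lastsv) >> 1;

    if (rto < minrto)
        rto = minrto;
    if (rto > maxrto)
        rto = maxrto;
    return rto;
}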
+ */ + net->error_count = 0; + if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) == + SCTP_ADDR_NOT_REACHABLE) { + /* addr came good */ + net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE; + net->dest_state |= SCTP_ADDR_REACHABLE; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, + SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED); + /* now was it the primary? if so restore */ + if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) { + (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net); + } + } + /* + * JRS 5/14/07 - If CMT PF is on and the destination + * is in PF state, set the destination to active + * state and set the cwnd to one or two MTU's based + * on whether PF1 or PF2 is being used. + * + * Should we stop any running T3 timer here? + */ + if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + net->dest_state &= ~SCTP_ADDR_PF; + net->cwnd = net->mtu * asoc->sctp_cmt_pf; + SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n", + net, net->cwnd); + /* + * Since the cwnd value is explicitly set, + * skip the code that updates the cwnd + * value. + */ + goto skip_cwnd_update; + } + } +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code + */ + /* + * if (sctp_cmt_on_off == 1 && + * net->fast_retran_loss_recovery && + * net->will_exit_fast_recovery == 0) { @@@ Do something } + * else if (sctp_cmt_on_off == 0 && + * asoc->fast_retran_loss_recovery && will_exit == 0) { + */ +#endif + + if (asoc->fast_retran_loss_recovery && + (will_exit == 0) && + (asoc->sctp_cmt_on_off == 0)) { + /* + * If we are in loss recovery we skip any cwnd + * update + */ + goto skip_cwnd_update; + } + /* + * CMT: CUC algorithm. Update cwnd if pseudo-cumack has + * moved. + */ + if (accum_moved || + ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) { + /* If the cumulative ack moved we can proceed */ + if (net->cwnd <= net->ssthresh) { + /* We are in slow start */ + if (net->flight_size + net->net_ack >= net->cwnd) { + + sctp_hs_cwnd_increase(stcb, net); + + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_SS); + } + } + } else { + /* We are in congestion avoidance */ + net->partial_bytes_acked += net->net_ack; + if ((net->flight_size + net->net_ack >= net->cwnd) && + (net->partial_bytes_acked >= net->cwnd)) { + net->partial_bytes_acked -= net->cwnd; + net->cwnd += net->mtu; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_FROM_CA); + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_CA); + } + } + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_NO_CUMACK); + } + } +skip_cwnd_update: + /* + * NOW, according to Karn's rule do we need to restore the + * RTO timer back? Check our net_ack2. If not set then we + * have a ambiguity.. i.e. all data ack'd was sent to more + * than one place. + */ + if (net->net_ack2) { + /* restore any doubled timers */ + net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1; + if (net->RTO < stcb->asoc.minrto) { + net->RTO = stcb->asoc.minrto; + } + if (net->RTO > stcb->asoc.maxrto) { + net->RTO = stcb->asoc.maxrto; + } + } + } +} + + +/* + * H-TCP congestion control. 
The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. + * http://www.hamilton.ie/net/htcp3.pdf + */ + + +static int use_rtt_scaling = 1; +static int use_bandwidth_switch = 1; + +static inline int +between(uint32_t seq1, uint32_t seq2, uint32_t seq3) +{ + return seq3 - seq2 >= seq1 - seq2; +} + +static inline uint32_t +htcp_cong_time(struct htcp *ca) +{ + return sctp_get_tick_count() - ca->last_cong; +} + +static inline uint32_t +htcp_ccount(struct htcp *ca) +{ + return htcp_cong_time(ca) / ca->minRTT; +} + +static inline void +htcp_reset(struct htcp *ca) +{ + ca->undo_last_cong = ca->last_cong; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + ca->last_cong = sctp_get_tick_count(); +} + +#ifdef SCTP_NOT_USED + +static uint32_t +htcp_cwnd_undo(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + net->htcp_ca.last_cong = net->htcp_ca.undo_last_cong; + net->htcp_ca.maxRTT = net->htcp_ca.undo_maxRTT; + net->htcp_ca.old_maxB = net->htcp_ca.undo_old_maxB; + return max(net->cwnd, ((net->ssthresh / net->mtu << 7) / net->htcp_ca.beta) * net->mtu); +} + +#endif + +static inline void +measure_rtt(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + uint32_t srtt = net->lastsa >> 3; + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (net->htcp_ca.minRTT > srtt || !net->htcp_ca.minRTT) + net->htcp_ca.minRTT = srtt; + + /* max RTT */ + if (net->fast_retran_ip == 0 && net->ssthresh < 0xFFFF && htcp_ccount(&net->htcp_ca) > 3) { + if (net->htcp_ca.maxRTT < net->htcp_ca.minRTT) + net->htcp_ca.maxRTT = net->htcp_ca.minRTT; + if (net->htcp_ca.maxRTT < srtt && srtt <= net->htcp_ca.maxRTT + MSEC_TO_TICKS(20)) + net->htcp_ca.maxRTT = srtt; + } +} + +static void +measure_achieved_throughput(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + uint32_t now = sctp_get_tick_count(); + + if (net->fast_retran_ip == 0) + net->htcp_ca.bytes_acked = net->net_ack; + + if (!use_bandwidth_switch) + return; + + /* achieved throughput calculations */ + /* JRS - not 100% sure of this statement */ + if (net->fast_retran_ip == 1) { + net->htcp_ca.bytecount = 0; + net->htcp_ca.lasttime = now; + return; + } + net->htcp_ca.bytecount += net->net_ack; + + if (net->htcp_ca.bytecount >= net->cwnd - ((net->htcp_ca.alpha >> 7 ? 
: 1) * net->mtu) + && now - net->htcp_ca.lasttime >= net->htcp_ca.minRTT + && net->htcp_ca.minRTT > 0) { + uint32_t cur_Bi = net->htcp_ca.bytecount / net->mtu * hz / (now - net->htcp_ca.lasttime); + + if (htcp_ccount(&net->htcp_ca) <= 3) { + /* just after backoff */ + net->htcp_ca.minB = net->htcp_ca.maxB = net->htcp_ca.Bi = cur_Bi; + } else { + net->htcp_ca.Bi = (3 * net->htcp_ca.Bi + cur_Bi) / 4; + if (net->htcp_ca.Bi > net->htcp_ca.maxB) + net->htcp_ca.maxB = net->htcp_ca.Bi; + if (net->htcp_ca.minB > net->htcp_ca.maxB) + net->htcp_ca.minB = net->htcp_ca.maxB; + } + net->htcp_ca.bytecount = 0; + net->htcp_ca.lasttime = now; + } +} + +static inline void +htcp_beta_update(struct htcp *ca, uint32_t minRTT, uint32_t maxRTT) +{ + if (use_bandwidth_switch) { + uint32_t maxB = ca->maxB; + uint32_t old_maxB = ca->old_maxB; + + ca->old_maxB = ca->maxB; + + if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + if (ca->modeswitch && minRTT > (uint32_t) MSEC_TO_TICKS(10) && maxRTT) { + ca->beta = (minRTT << 7) / maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void +htcp_alpha_update(struct htcp *ca) +{ + uint32_t minRTT = ca->minRTT; + uint32_t factor = 1; + uint32_t diff = htcp_cong_time(ca); + + if (diff > (uint32_t) hz) { + diff -= hz; + factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / hz)) / hz; + } + if (use_rtt_scaling && minRTT) { + uint32_t scale = (hz << 3) / (10 * minRTT); + + scale = min(max(scale, 1U << 2), 10U << 3); /* clamping ratio to + * interval [0.5,10]<<3 */ + factor = (factor << 3) / scale; + if (!factor) + factor = 1; + } + ca->alpha = 2 * factor * ((1 << 7) - ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void +htcp_param_update(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + uint32_t minRTT = net->htcp_ca.minRTT; + uint32_t maxRTT = net->htcp_ca.maxRTT; + + htcp_beta_update(&net->htcp_ca, minRTT, maxRTT); + htcp_alpha_update(&net->htcp_ca); + + /* + * add slowly fading memory for maxRTT to accommodate routing + * changes etc + */ + if (minRTT > 0 && maxRTT > minRTT) + net->htcp_ca.maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100; +} + +static uint32_t +htcp_recalc_ssthresh(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + htcp_param_update(stcb, net); + return max(((net->cwnd / net->mtu * net->htcp_ca.beta) >> 7) * net->mtu, 2U * net->mtu); +} + +static void +htcp_cong_avoid(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /*- + * How to handle these functions? + * if (!tcp_is_cwnd_limited(sk, in_flight)) RRS - good question. 
+ * return; + */ + if (net->cwnd <= net->ssthresh) { + /* We are in slow start */ + if (net->flight_size + net->net_ack >= net->cwnd) { + if (net->net_ack > (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable))) { + net->cwnd += (net->mtu * SCTP_BASE_SYSCTL(sctp_L2_abc_variable)); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_FROM_SS); + } + } else { + net->cwnd += net->net_ack; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_FROM_SS); + } + } + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_SS); + } + } + } else { + measure_rtt(stcb, net); + + /* + * In dangerous area, increase slowly. In theory this is + * net->cwnd += alpha / net->cwnd + */ + /* What is snd_cwnd_cnt?? */ + if (((net->partial_bytes_acked / net->mtu * net->htcp_ca.alpha) >> 7) * net->mtu >= net->cwnd) { + /*- + * Does SCTP have a cwnd clamp? + * if (net->snd_cwnd < net->snd_cwnd_clamp) - Nope (RRS). + */ + net->cwnd += net->mtu; + net->partial_bytes_acked = 0; + htcp_alpha_update(&net->htcp_ca); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_FROM_CA); + } + } else { + net->partial_bytes_acked += net->net_ack; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->net_ack, + SCTP_CWND_LOG_NOADV_CA); + } + } + + net->htcp_ca.bytes_acked = net->mtu; + } +} + +#ifdef SCTP_NOT_USED +/* Lower bound on congestion window. */ +static uint32_t +htcp_min_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + return net->ssthresh; +} + +#endif + +static void +htcp_init(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + memset(&net->htcp_ca, 0, sizeof(struct htcp)); + net->htcp_ca.alpha = ALPHA_BASE; + net->htcp_ca.beta = BETA_MIN; + net->htcp_ca.bytes_acked = net->mtu; + net->htcp_ca.last_cong = sctp_get_tick_count(); +} + +void +sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /* + * We take the max of the burst limit times a MTU or the + * INITIAL_CWND. We then limit this to 4 MTU's of sending. + */ + net->cwnd = min((net->mtu * 4), max((2 * net->mtu), SCTP_INITIAL_CWND)); + net->ssthresh = stcb->asoc.peers_rwnd; + htcp_init(stcb, net); + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { + sctp_log_cwnd(stcb, net, 0, SCTP_CWND_INITIALIZATION); + } +} + +void +sctp_htcp_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit) +{ + struct sctp_nets *net; + + /******************************/ + /* update cwnd and Early FR */ + /******************************/ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code. Need to debug. + */ + if (net->fast_retran_loss_recovery && net->new_pseudo_cumack) { + if (compare_with_wrap(asoc->last_acked_seq, + net->fast_recovery_tsn, MAX_TSN) || + (asoc->last_acked_seq == net->fast_recovery_tsn) || + compare_with_wrap(net->pseudo_cumack, net->fast_recovery_tsn, MAX_TSN) || + (net->pseudo_cumack == net->fast_recovery_tsn)) { + net->will_exit_fast_recovery = 1; + } + } +#endif + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + /* + * So, first of all do we need to have a Early FR + * timer running? 
+ */ + if ((!TAILQ_EMPTY(&asoc->sent_queue) && + (net->ref_count > 1) && + (net->flight_size < net->cwnd)) || + (reneged_all)) { + /* + * yes, so in this case stop it if its + * running, and then restart it. Reneging + * all is a special case where we want to + * run the Early FR timer and then force the + * last few unacked to be sent, causing us + * to illicit a sack with gaps to force out + * the others. + */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck2); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_20); + } + SCTP_STAT_INCR(sctps_earlyfrstrid); + sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net); + } else { + /* No, stop it if its running */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck3); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_21); + } + } + } + /* if nothing was acked on this destination skip it */ + if (net->net_ack == 0) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 0, SCTP_CWND_LOG_FROM_SACK); + } + continue; + } + if (net->net_ack2 > 0) { + /* + * Karn's rule applies to clearing error count, this + * is optional. + */ + net->error_count = 0; + if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) == + SCTP_ADDR_NOT_REACHABLE) { + /* addr came good */ + net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE; + net->dest_state |= SCTP_ADDR_REACHABLE; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, + SCTP_RECEIVED_SACK, (void *)net, SCTP_SO_NOT_LOCKED); + /* now was it the primary? if so restore */ + if (net->dest_state & SCTP_ADDR_WAS_PRIMARY) { + (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net); + } + } + /* + * JRS 5/14/07 - If CMT PF is on and the destination + * is in PF state, set the destination to active + * state and set the cwnd to one or two MTU's based + * on whether PF1 or PF2 is being used. + * + * Should we stop any running T3 timer here? + */ + if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + net->dest_state &= ~SCTP_ADDR_PF; + net->cwnd = net->mtu * asoc->sctp_cmt_pf; + SCTPDBG(SCTP_DEBUG_INDATA1, "Destination %p moved from PF to reachable with cwnd %d.\n", + net, net->cwnd); + /* + * Since the cwnd value is explicitly set, + * skip the code that updates the cwnd + * value. + */ + goto skip_cwnd_update; + } + } +#ifdef JANA_CMT_FAST_RECOVERY + /* + * CMT fast recovery code + */ + /* + * if (sctp_cmt_on_off == 1 && + * net->fast_retran_loss_recovery && + * net->will_exit_fast_recovery == 0) { @@@ Do something } + * else if (sctp_cmt_on_off == 0 && + * asoc->fast_retran_loss_recovery && will_exit == 0) { + */ +#endif + + if (asoc->fast_retran_loss_recovery && + will_exit == 0 && + (asoc->sctp_cmt_on_off == 0)) { + /* + * If we are in loss recovery we skip any cwnd + * update + */ + goto skip_cwnd_update; + } + /* + * CMT: CUC algorithm. Update cwnd if pseudo-cumack has + * moved. + */ + if (accum_moved || + ((asoc->sctp_cmt_on_off == 1) && net->new_pseudo_cumack)) { + htcp_cong_avoid(stcb, net); + measure_achieved_throughput(stcb, net); + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, net->mtu, + SCTP_CWND_LOG_NO_CUMACK); + } + } +skip_cwnd_update: + /* + * NOW, according to Karn's rule do we need to restore the + * RTO timer back? 
Check our net_ack2. If not set then we + * have a ambiguity.. i.e. all data ack'd was sent to more + * than one place. + */ + if (net->net_ack2) { + /* restore any doubled timers */ + net->RTO = ((net->lastsa >> 2) + net->lastsv) >> 1; + if (net->RTO < stcb->asoc.minrto) { + net->RTO = stcb->asoc.minrto; + } + if (net->RTO > stcb->asoc.maxrto) { + net->RTO = stcb->asoc.maxrto; + } + } + } +} + +void +sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + struct sctp_nets *net; + + /* + * CMT fast recovery code. Need to debug. ((sctp_cmt_on_off == 1) && + * (net->fast_retran_loss_recovery == 0))) + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if ((asoc->fast_retran_loss_recovery == 0) || + (asoc->sctp_cmt_on_off == 1)) { + /* out of a RFC2582 Fast recovery window? */ + if (net->net_ack > 0) { + /* + * per section 7.2.3, are there any + * destinations that had a fast retransmit + * to them. If so what we need to do is + * adjust ssthresh and cwnd. + */ + struct sctp_tmit_chunk *lchk; + int old_cwnd = net->cwnd; + + /* JRS - reset as if state were changed */ + htcp_reset(&net->htcp_ca); + net->ssthresh = htcp_recalc_ssthresh(stcb, net); + net->cwnd = net->ssthresh; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), + SCTP_CWND_LOG_FROM_FR); + } + lchk = TAILQ_FIRST(&asoc->send_queue); + + net->partial_bytes_acked = 0; + /* Turn on fast recovery window */ + asoc->fast_retran_loss_recovery = 1; + if (lchk == NULL) { + /* Mark end of the window */ + asoc->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + asoc->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * CMT fast recovery -- per destination + * recovery variable. + */ + net->fast_retran_loss_recovery = 1; + + if (lchk == NULL) { + /* Mark end of the window */ + net->fast_recovery_tsn = asoc->sending_seq - 1; + } else { + net->fast_recovery_tsn = lchk->rec.data.TSN_seq - 1; + } + + /* + * Disable Nonce Sum Checking and store the + * resync tsn + */ + asoc->nonce_sum_check = 0; + asoc->nonce_resync_tsn = asoc->fast_recovery_tsn + 1; + + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_32); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net); + } + } else if (net->net_ack > 0) { + /* + * Mark a peg that we WOULD have done a cwnd + * reduction but RFC2582 prevented this action. + */ + SCTP_STAT_INCR(sctps_fastretransinrtt); + } + } +} + +void +sctp_htcp_cwnd_update_after_timeout(struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + int old_cwnd = net->cwnd; + + /* JRS - reset as if the state were being changed to timeout */ + htcp_reset(&net->htcp_ca); + net->ssthresh = htcp_recalc_ssthresh(stcb, net); + net->cwnd = net->mtu; + net->partial_bytes_acked = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->cwnd - old_cwnd, SCTP_CWND_LOG_FROM_RTX); + } +} + +void +sctp_htcp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int old_cwnd; + + old_cwnd = net->cwnd; + + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED); + net->htcp_ca.last_cong = sctp_get_tick_count(); + /* + * make a small adjustment to cwnd and force to CA. 
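htcp_recalc_ssthresh, used in both the fast-retransmit and timeout paths here, scales back by the adaptive beta rather than a fixed half. For example, with minRTT = 20 ms and maxRTT = 25 ms, htcp_beta_update gives beta = (20 << 7) / 25 = 102 (0.8 in the <<7 fixed point, the BETA_MAX clamp), so a 100-MTU cwnd restarts from ssthresh = (100 * 102) >> 7 = 79 MTUs, only a ~20% backoff; if queueing pushes maxRTT to twice minRTT, beta falls to the BETA_MIN clamp of 64 and the backoff is the conventional half.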
+ */ + if (net->cwnd > net->mtu) + /* drop down one MTU after sending */ + net->cwnd -= net->mtu; + if (net->cwnd < net->ssthresh) + /* still in SS move to CA */ + net->ssthresh = net->cwnd - 1; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (old_cwnd - net->cwnd), SCTP_CWND_LOG_FROM_FR); + } +} + +void +sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + int old_cwnd; + + old_cwnd = net->cwnd; + + /* JRS - reset hctp as if state changed */ + htcp_reset(&net->htcp_ca); + SCTP_STAT_INCR(sctps_ecnereducedcwnd); + net->ssthresh = htcp_recalc_ssthresh(stcb, net); + if (net->ssthresh < net->mtu) { + net->ssthresh = net->mtu; + /* here back off the timer as well, to slow us down */ + net->RTO <<= 1; + } + net->cwnd = net->ssthresh; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, (net->cwnd - old_cwnd), SCTP_CWND_LOG_FROM_SAT); + } +} diff --git a/freebsd/sys/netinet/sctp_cc_functions.h b/freebsd/sys/netinet/sctp_cc_functions.h new file mode 100644 index 00000000..3b95d7de --- /dev/null +++ b/freebsd/sys/netinet/sctp_cc_functions.h @@ -0,0 +1,116 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_cc_functions_h__ +#define __sctp_cc_functions_h__ + +#if defined(_KERNEL) || defined(__Userspace__) + +void +sctp_set_initial_cc_param(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc); + +void +sctp_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit); + +void +sctp_cwnd_update_after_timeout(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_hs_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc); + +void +sctp_hs_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit); + +void +sctp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_cwnd_update_after_packet_dropped(struct sctp_tcb *stcb, + struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, + uint32_t * bottle_bw, uint32_t * on_queue); + +void +sctp_cwnd_update_after_output(struct sctp_tcb *stcb, + struct sctp_nets *net, int burst_limit); + +void +sctp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets *net); + +/* + * HTCP algorithms are directly taken from + * R.N.Shorten, D.J.Leith and are work/outcome from + * a Cisco-URP grant to enhance HTCP for satellite + * communications. We use the BSD License + * granted from their source and have modified their + * algorithms to fit within the SCTP BSD framework. + */ + +void +sctp_htcp_set_initial_cc_param(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_htcp_cwnd_update_after_fr(struct sctp_tcb *stcb, + struct sctp_association *asoc); + +void +sctp_htcp_cwnd_update_after_sack(struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit); + +void +sctp_htcp_cwnd_update_after_timeout(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_htcp_cwnd_update_after_ecn_echo(struct sctp_tcb *stcb, + struct sctp_nets *net); + +void +sctp_htcp_cwnd_update_after_fr_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets *net); + +#endif +#endif diff --git a/freebsd/sys/netinet/sctp_constants.h b/freebsd/sys/netinet/sctp_constants.h new file mode 100644 index 00000000..c4f4be23 --- /dev/null +++ b/freebsd/sys/netinet/sctp_constants.h @@ -0,0 +1,1051 @@ +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_constants.h,v 1.17 2005/03/06 16:04:17 itojun Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_constants_h__ +#define __sctp_constants_h__ + +/* IANA assigned port number for SCTP over UDP encapsulation */ +/* For FreeBSD we cannot bind the port at + * startup. Otherwise what will happen is + * we really won't be bound. The user must + * put it into the sysctl... or we need + * to build a special timer for this to allow + * us to wait 1 second or so after the system + * comes up. + */ +#define SCTP_OVER_UDP_TUNNELING_PORT 0 +/* Number of packets to get before sack sent by default */ +#define SCTP_DEFAULT_SACK_FREQ 2 + +/* Address limit - This variable is calculated + * based on a 65535 byte max ip packet. We take out 100 bytes + * for the cookie, 40 bytes for a v6 header and 32 + * bytes for the init structure. A second init structure + * for the init-ack and then finally a third one for the + * embedded init. This yields 100+40+(3 * 32) = 236 bytes. + * This leaves 65299 bytes for addresses. We throw out the 299 bytes. + * Now whatever we send in the INIT() we need to allow to get back in the + * INIT-ACK plus all the values from INIT and INIT-ACK + * listed in the cookie. Plus we need some overhead for + * maybe copied parameters in the COOKIE. If we + * allow 1080 addresses, and each side has 1080 V6 addresses + * that will be 21600 bytes. In the INIT-ACK we will + * see the INIT-ACK 21600 + 43200 in the cookie. This leaves + * about 500 bytes slack for misc things in the cookie. + */ +#define SCTP_ADDRESS_LIMIT 1080 + +/* We need at least 2k of space for us; inits + * larger than that, let's abort. + */ +#define SCTP_LARGEST_INIT_ACCEPTED (65535 - 2048) + +/* Number of addresses where we just skip the counting */ +#define SCTP_COUNT_LIMIT 40 + +#define SCTP_ZERO_COPY_TICK_DELAY (((100 * hz) + 999) / 1000) +#define SCTP_ZERO_COPY_SENDQ_TICK_DELAY (((100 * hz) + 999) / 1000) + +/* Number of ticks to delay before running + * iterator on an address change. + */ +#define SCTP_ADDRESS_TICK_DELAY 2 + +#define SCTP_VERSION_STRING "KAME-BSD 1.1" +/* #define SCTP_AUDITING_ENABLED 1 used for debug/auditing */ +#define SCTP_AUDIT_SIZE 256 + + +#define SCTP_KTRHEAD_NAME "sctp_iterator" +#define SCTP_KTHREAD_PAGES 0 + + +/* If you support Multi-VRF, how big to + * make the initial array of VRFs.
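The SCTP_ADDRESS_LIMIT arithmetic above can be checked directly: an IPv6 address parameter is 20 bytes (a 4-byte parameter header plus the 16-byte address), so 1080 of them take 21600 bytes; the INIT-ACK then carries its own 21600 plus both lists (43200) echoed inside the cookie, i.e. 64800 of the 65299 usable bytes, leaving 499 bytes of slack, matching the "about 500 bytes" in the comment.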
+ */ +#define SCTP_DEFAULT_VRF_SIZE 4 + +/* constants for rto calc */ +#define sctp_align_safe_nocopy 0 +#define sctp_align_unsafe_makecopy 1 + +/* JRS - Values defined for the HTCP algorithm */ +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +/* Places that CWND log can happen from */ +#define SCTP_CWND_LOG_FROM_FR 1 +#define SCTP_CWND_LOG_FROM_RTX 2 +#define SCTP_CWND_LOG_FROM_BRST 3 +#define SCTP_CWND_LOG_FROM_SS 4 +#define SCTP_CWND_LOG_FROM_CA 5 +#define SCTP_CWND_LOG_FROM_SAT 6 +#define SCTP_BLOCK_LOG_INTO_BLK 7 +#define SCTP_BLOCK_LOG_OUTOF_BLK 8 +#define SCTP_BLOCK_LOG_CHECK 9 +#define SCTP_STR_LOG_FROM_INTO_STRD 10 +#define SCTP_STR_LOG_FROM_IMMED_DEL 11 +#define SCTP_STR_LOG_FROM_INSERT_HD 12 +#define SCTP_STR_LOG_FROM_INSERT_MD 13 +#define SCTP_STR_LOG_FROM_INSERT_TL 14 +#define SCTP_STR_LOG_FROM_MARK_TSN 15 +#define SCTP_STR_LOG_FROM_EXPRS_DEL 16 +#define SCTP_FR_LOG_BIGGEST_TSNS 17 +#define SCTP_FR_LOG_STRIKE_TEST 18 +#define SCTP_FR_LOG_STRIKE_CHUNK 19 +#define SCTP_FR_T3_TIMEOUT 20 +#define SCTP_MAP_PREPARE_SLIDE 21 +#define SCTP_MAP_SLIDE_FROM 22 +#define SCTP_MAP_SLIDE_RESULT 23 +#define SCTP_MAP_SLIDE_CLEARED 24 +#define SCTP_MAP_SLIDE_NONE 25 +#define SCTP_FR_T3_MARK_TIME 26 +#define SCTP_FR_T3_MARKED 27 +#define SCTP_FR_T3_STOPPED 28 +#define SCTP_FR_MARKED 30 +#define SCTP_CWND_LOG_NOADV_SS 31 +#define SCTP_CWND_LOG_NOADV_CA 32 +#define SCTP_MAX_BURST_APPLIED 33 +#define SCTP_MAX_IFP_APPLIED 34 +#define SCTP_MAX_BURST_ERROR_STOP 35 +#define SCTP_INCREASE_PEER_RWND 36 +#define SCTP_DECREASE_PEER_RWND 37 +#define SCTP_SET_PEER_RWND_VIA_SACK 38 +#define SCTP_LOG_MBCNT_INCREASE 39 +#define SCTP_LOG_MBCNT_DECREASE 40 +#define SCTP_LOG_MBCNT_CHKSET 41 +#define SCTP_LOG_NEW_SACK 42 +#define SCTP_LOG_TSN_ACKED 43 +#define SCTP_LOG_TSN_REVOKED 44 +#define SCTP_LOG_LOCK_TCB 45 +#define SCTP_LOG_LOCK_INP 46 +#define SCTP_LOG_LOCK_SOCK 47 +#define SCTP_LOG_LOCK_SOCKBUF_R 48 +#define SCTP_LOG_LOCK_SOCKBUF_S 49 +#define SCTP_LOG_LOCK_CREATE 50 +#define SCTP_LOG_INITIAL_RTT 51 +#define SCTP_LOG_RTTVAR 52 +#define SCTP_LOG_SBALLOC 53 +#define SCTP_LOG_SBFREE 54 +#define SCTP_LOG_SBRESULT 55 +#define SCTP_FR_DUPED 56 +#define SCTP_FR_MARKED_EARLY 57 +#define SCTP_FR_CWND_REPORT 58 +#define SCTP_FR_CWND_REPORT_START 59 +#define SCTP_FR_CWND_REPORT_STOP 60 +#define SCTP_CWND_LOG_FROM_SEND 61 +#define SCTP_CWND_INITIALIZATION 62 +#define SCTP_CWND_LOG_FROM_T3 63 +#define SCTP_CWND_LOG_FROM_SACK 64 +#define SCTP_CWND_LOG_NO_CUMACK 65 +#define SCTP_CWND_LOG_FROM_RESEND 66 +#define SCTP_FR_LOG_CHECK_STRIKE 67 +#define SCTP_SEND_NOW_COMPLETES 68 +#define SCTP_CWND_LOG_FILL_OUTQ_CALLED 69 +#define SCTP_CWND_LOG_FILL_OUTQ_FILLS 70 +#define SCTP_LOG_FREE_SENT 71 +#define SCTP_NAGLE_APPLIED 72 +#define SCTP_NAGLE_SKIPPED 73 +#define SCTP_WAKESND_FROM_SACK 74 +#define SCTP_WAKESND_FROM_FWDTSN 75 +#define SCTP_NOWAKE_FROM_SACK 76 +#define SCTP_CWNDLOG_PRESEND 77 +#define SCTP_CWNDLOG_ENDSEND 78 +#define SCTP_AT_END_OF_SACK 79 +#define SCTP_REASON_FOR_SC 80 +#define SCTP_BLOCK_LOG_INTO_BLKA 81 +#define SCTP_ENTER_USER_RECV 82 +#define SCTP_USER_RECV_SACKS 83 +#define SCTP_SORECV_BLOCKSA 84 +#define SCTP_SORECV_BLOCKSB 85 +#define SCTP_SORECV_DONE 86 +#define SCTP_SACK_RWND_UPDATE 87 +#define SCTP_SORECV_ENTER 88 +#define SCTP_SORECV_ENTERPL 89 +#define SCTP_MBUF_INPUT 90 +#define SCTP_MBUF_IALLOC 91 +#define SCTP_MBUF_IFREE 92 +#define SCTP_MBUF_ICOPY 93 +#define SCTP_MBUF_SPLIT 94 +#define 
SCTP_SORCV_FREECTL 95 +#define SCTP_SORCV_DOESCPY 96 +#define SCTP_SORCV_DOESLCK 97 +#define SCTP_SORCV_DOESADJ 98 +#define SCTP_SORCV_BOTWHILE 99 +#define SCTP_SORCV_PASSBF 100 +#define SCTP_SORCV_ADJD 101 +#define SCTP_UNKNOWN_MAX 102 +#define SCTP_RANDY_STUFF 103 +#define SCTP_RANDY_STUFF1 104 +#define SCTP_STRMOUT_LOG_ASSIGN 105 +#define SCTP_STRMOUT_LOG_SEND 106 +#define SCTP_FLIGHT_LOG_DOWN_CA 107 +#define SCTP_FLIGHT_LOG_UP 108 +#define SCTP_FLIGHT_LOG_DOWN_GAP 109 +#define SCTP_FLIGHT_LOG_DOWN_RSND 110 +#define SCTP_FLIGHT_LOG_UP_RSND 111 +#define SCTP_FLIGHT_LOG_DOWN_RSND_TO 112 +#define SCTP_FLIGHT_LOG_DOWN_WP 113 +#define SCTP_FLIGHT_LOG_UP_REVOKE 114 +#define SCTP_FLIGHT_LOG_DOWN_PDRP 115 +#define SCTP_FLIGHT_LOG_DOWN_PMTU 116 +#define SCTP_SACK_LOG_NORMAL 117 +#define SCTP_SACK_LOG_EXPRESS 118 +#define SCTP_MAP_TSN_ENTERS 119 +#define SCTP_THRESHOLD_CLEAR 120 +#define SCTP_THRESHOLD_INCR 121 +#define SCTP_FLIGHT_LOG_DWN_WP_FWD 122 +#define SCTP_FWD_TSN_CHECK 123 +#define SCTP_LOG_MAX_TYPES 124 +/* + * To turn on various logging, you must first enable 'options KTR' and + * you might want to bump the entries 'options KTR_ENTRIES=80000'. + * To get something to log you define one of the logging defines. + * (see LINT). + * + * This gets the compile in place, but you still need to turn the + * logging flag on too in the sysctl (see in sctp.h). + */ + +#define SCTP_LOG_EVENT_UNKNOWN 0 +#define SCTP_LOG_EVENT_CWND 1 +#define SCTP_LOG_EVENT_BLOCK 2 +#define SCTP_LOG_EVENT_STRM 3 +#define SCTP_LOG_EVENT_FR 4 +#define SCTP_LOG_EVENT_MAP 5 +#define SCTP_LOG_EVENT_MAXBURST 6 +#define SCTP_LOG_EVENT_RWND 7 +#define SCTP_LOG_EVENT_MBCNT 8 +#define SCTP_LOG_EVENT_SACK 9 +#define SCTP_LOG_LOCK_EVENT 10 +#define SCTP_LOG_EVENT_RTT 11 +#define SCTP_LOG_EVENT_SB 12 +#define SCTP_LOG_EVENT_NAGLE 13 +#define SCTP_LOG_EVENT_WAKE 14 +#define SCTP_LOG_MISC_EVENT 15 +#define SCTP_LOG_EVENT_CLOSE 16 +#define SCTP_LOG_EVENT_MBUF 17 +#define SCTP_LOG_CHUNK_PROC 18 +#define SCTP_LOG_ERROR_RET 19 + +#define SCTP_LOG_MAX_EVENT 20 + +#define SCTP_LOCK_UNKNOWN 2 + + +/* number of associations by default for zone allocation */ +#define SCTP_MAX_NUM_OF_ASOC 40000 +/* how many addresses per assoc remote and local */ +#define SCTP_SCALE_FOR_ADDR 2 + +/* default AUTO_ASCONF mode enable(1)/disable(0) value (sysctl) */ +#define SCTP_DEFAULT_AUTO_ASCONF 1 + +/* default MULTIPLE_ASCONF mode enable(1)/disable(0) value (sysctl) */ +#define SCTP_DEFAULT_MULTIPLE_ASCONFS 0 + +/* default MOBILITY_BASE mode enable(1)/disable(0) value (sysctl) */ +#define SCTP_DEFAULT_MOBILITY_BASE 0 + +/* default MOBILITY_FASTHANDOFF mode enable(1)/disable(0) value (sysctl) */ +#define SCTP_DEFAULT_MOBILITY_FASTHANDOFF 0 + +/* + * Threshold for rwnd updates, we have to read (sb_hiwat >> + * SCTP_RWND_HIWAT_SHIFT) before we will look to see if we need to send a + * window update sack. When we look, we compare the last rwnd we sent vs the + * current rwnd. It too must be greater than this value. Using 3 divides the + * hiwat by 8, so for 200k rwnd we need to read 24k. For a 64k rwnd we need + * to read 8k. This seems about right.. I hope :-D.. we do set a + * min of an MTU on it so if the rwnd is real small we will insist + * on a full MTU of 1500 bytes. + */ +#define SCTP_RWND_HIWAT_SHIFT 3 + +/* How much of the rwnd must the + * message be taking up to start partial delivery. + * We calculate this by shifting the hi_water (recv_win) + * by the following: set to 1, when a message holds + * 1/2 the rwnd.
If we set it to 2 when a message holds + * 1/4 the rwnd...etc.. + */ + +#define SCTP_PARTIAL_DELIVERY_SHIFT 1 + +/* + * default HMAC for cookies, etc... use one of the AUTH HMAC id's + * SCTP_HMAC is the HMAC_ID to use + * SCTP_SIGNATURE_SIZE is the digest length + */ +#define SCTP_HMAC SCTP_AUTH_HMAC_ID_SHA1 +#define SCTP_SIGNATURE_SIZE SCTP_AUTH_DIGEST_LEN_SHA1 +#define SCTP_SIGNATURE_ALOC_SIZE SCTP_SIGNATURE_SIZE + +/* + * the SCTP protocol signature this includes the version number encoded in + * the last 4 bits of the signature. + */ +#define PROTO_SIGNATURE_A 0x30000000 +#define SCTP_VERSION_NUMBER 0x3 + +#define MAX_TSN 0xffffffff +#define MAX_SEQ 0xffff + +/* how many executions every N tick's */ +#define SCTP_ITERATOR_MAX_AT_ONCE 20 + +/* number of clock ticks between iterator executions */ +#define SCTP_ITERATOR_TICKS 1 + +/* + * option: If you comment out the following you will receive the old behavior + * of obeying cwnd for the fast retransmit algorithm. With this defined a FR + * happens right away with-out waiting for the flightsize to drop below the + * cwnd value (which is reduced by the FR to 1/2 the inflight packets). + */ +#define SCTP_IGNORE_CWND_ON_FR 1 + +/* + * Adds implementors guide behavior to only use newest highest update in SACK + * gap ack's to figure out if you need to stroke a chunk for FR. + */ +#define SCTP_NO_FR_UNLESS_SEGMENT_SMALLER 1 + +/* default max I can burst out after a fast retransmit */ +#define SCTP_DEF_MAX_BURST 4 +/* IP hdr (20/40) + 12+2+2 (enet) + sctp common 12 */ +#define SCTP_FIRST_MBUF_RESV 68 +/* Packet transmit states in the sent field */ +#define SCTP_DATAGRAM_UNSENT 0 +#define SCTP_DATAGRAM_SENT 1 +#define SCTP_DATAGRAM_RESEND1 2 /* not used (in code, but may + * hit this value) */ +#define SCTP_DATAGRAM_RESEND2 3 /* not used (in code, but may + * hit this value) */ +#define SCTP_DATAGRAM_RESEND 4 +#define SCTP_DATAGRAM_ACKED 10010 +#define SCTP_DATAGRAM_MARKED 20010 +#define SCTP_FORWARD_TSN_SKIP 30010 + +/* chunk output send from locations */ +#define SCTP_OUTPUT_FROM_USR_SEND 0 +#define SCTP_OUTPUT_FROM_T3 1 +#define SCTP_OUTPUT_FROM_INPUT_ERROR 2 +#define SCTP_OUTPUT_FROM_CONTROL_PROC 3 +#define SCTP_OUTPUT_FROM_SACK_TMR 4 +#define SCTP_OUTPUT_FROM_SHUT_TMR 5 +#define SCTP_OUTPUT_FROM_HB_TMR 6 +#define SCTP_OUTPUT_FROM_SHUT_ACK_TMR 7 +#define SCTP_OUTPUT_FROM_ASCONF_TMR 8 +#define SCTP_OUTPUT_FROM_STRRST_TMR 9 +#define SCTP_OUTPUT_FROM_AUTOCLOSE_TMR 10 +#define SCTP_OUTPUT_FROM_EARLY_FR_TMR 11 +#define SCTP_OUTPUT_FROM_STRRST_REQ 12 +#define SCTP_OUTPUT_FROM_USR_RCVD 13 +#define SCTP_OUTPUT_FROM_COOKIE_ACK 14 +#define SCTP_OUTPUT_FROM_DRAIN 15 +#define SCTP_OUTPUT_FROM_CLOSING 16 +/* SCTP chunk types are moved sctp.h for application (NAT, FW) use */ + +/* align to 32-bit sizes */ +#define SCTP_SIZE32(x) ((((x)+3) >> 2) << 2) + +#define IS_SCTP_CONTROL(a) ((a)->chunk_type != SCTP_DATA) +#define IS_SCTP_DATA(a) ((a)->chunk_type == SCTP_DATA) + + +/* SCTP parameter types */ +/*************0x0000 series*************/ +#define SCTP_HEARTBEAT_INFO 0x0001 +#define SCTP_IPV4_ADDRESS 0x0005 +#define SCTP_IPV6_ADDRESS 0x0006 +#define SCTP_STATE_COOKIE 0x0007 +#define SCTP_UNRECOG_PARAM 0x0008 +#define SCTP_COOKIE_PRESERVE 0x0009 +#define SCTP_HOSTNAME_ADDRESS 0x000b +#define SCTP_SUPPORTED_ADDRTYPE 0x000c + +/* draft-ietf-stewart-tsvwg-strreset-xxx */ +#define SCTP_STR_RESET_OUT_REQUEST 0x000d +#define SCTP_STR_RESET_IN_REQUEST 0x000e +#define SCTP_STR_RESET_TSN_REQUEST 0x000f +#define SCTP_STR_RESET_RESPONSE 0x0010 +#define 
SCTP_STR_RESET_ADD_STREAMS 0x0011 + +#define SCTP_MAX_RESET_PARAMS 2 +#define SCTP_STREAM_RESET_TSN_DELTA 0x1000 + +/*************0x4000 series*************/ + +/*************0x8000 series*************/ +#define SCTP_ECN_CAPABLE 0x8000 +/* ECN Nonce: draft-ladha-sctp-ecn-nonce */ +#define SCTP_ECN_NONCE_SUPPORTED 0x8001 +/* draft-ietf-tsvwg-auth-xxx */ +#define SCTP_RANDOM 0x8002 +#define SCTP_CHUNK_LIST 0x8003 +#define SCTP_HMAC_LIST 0x8004 +/* + * draft-ietf-tsvwg-addip-sctp-xx param=0x8008 len=0xNNNN Byte | Byte | Byte + * | Byte Byte | Byte ... + * + * Where each byte is a chunk type extension supported. For example, to support + * all chunks one would have (in hex): + * + * 80 01 00 09 C0 C1 80 81 82 00 00 00 + * + * Has the parameter. C0 = PR-SCTP (RFC3758) C1, 80 = ASCONF (addip draft) 81 + * = Packet Drop 82 = Stream Reset 83 = Authentication + */ +#define SCTP_SUPPORTED_CHUNK_EXT 0x8008 + +/*************0xC000 series*************/ +#define SCTP_PRSCTP_SUPPORTED 0xc000 +/* draft-ietf-tsvwg-addip-sctp */ +#define SCTP_ADD_IP_ADDRESS 0xc001 +#define SCTP_DEL_IP_ADDRESS 0xc002 +#define SCTP_ERROR_CAUSE_IND 0xc003 +#define SCTP_SET_PRIM_ADDR 0xc004 +#define SCTP_SUCCESS_REPORT 0xc005 +#define SCTP_ULP_ADAPTATION 0xc006 +/* behave-nat-draft */ +#define SCTP_HAS_NAT_SUPPORT 0xc007 +#define SCTP_NAT_VTAGS 0xc008 + +/* Notification error codes */ +#define SCTP_NOTIFY_DATAGRAM_UNSENT 0x0001 +#define SCTP_NOTIFY_DATAGRAM_SENT 0x0002 +#define SCTP_FAILED_THRESHOLD 0x0004 +#define SCTP_HEARTBEAT_SUCCESS 0x0008 +#define SCTP_RESPONSE_TO_USER_REQ 0x0010 +#define SCTP_INTERNAL_ERROR 0x0020 +#define SCTP_SHUTDOWN_GUARD_EXPIRES 0x0040 +#define SCTP_RECEIVED_SACK 0x0080 +#define SCTP_PEER_FAULTY 0x0100 +#define SCTP_ICMP_REFUSED 0x0200 + +/* bits for TOS field */ +#define SCTP_ECT0_BIT 0x02 +#define SCTP_ECT1_BIT 0x01 +#define SCTP_CE_BITS 0x03 + +/* below turns off above */ +#define SCTP_FLEXIBLE_ADDRESS 0x20 +#define SCTP_NO_HEARTBEAT 0x40 + +/* mask to get sticky */ +#define SCTP_STICKY_OPTIONS_MASK 0x0c + + +/* + * SCTP states for internal state machine XXX (should match "user" values) + */ +#define SCTP_STATE_EMPTY 0x0000 +#define SCTP_STATE_INUSE 0x0001 +#define SCTP_STATE_COOKIE_WAIT 0x0002 +#define SCTP_STATE_COOKIE_ECHOED 0x0004 +#define SCTP_STATE_OPEN 0x0008 +#define SCTP_STATE_SHUTDOWN_SENT 0x0010 +#define SCTP_STATE_SHUTDOWN_RECEIVED 0x0020 +#define SCTP_STATE_SHUTDOWN_ACK_SENT 0x0040 +#define SCTP_STATE_SHUTDOWN_PENDING 0x0080 +#define SCTP_STATE_CLOSED_SOCKET 0x0100 +#define SCTP_STATE_ABOUT_TO_BE_FREED 0x0200 +#define SCTP_STATE_PARTIAL_MSG_LEFT 0x0400 +#define SCTP_STATE_WAS_ABORTED 0x0800 +#define SCTP_STATE_IN_ACCEPT_QUEUE 0x1000 +#define SCTP_STATE_MASK 0x007f + +#define SCTP_GET_STATE(asoc) ((asoc)->state & SCTP_STATE_MASK) +#define SCTP_SET_STATE(asoc, newstate) ((asoc)->state = ((asoc)->state & ~SCTP_STATE_MASK) | newstate) +#define SCTP_CLEAR_SUBSTATE(asoc, substate) ((asoc)->state &= ~substate) +#define SCTP_ADD_SUBSTATE(asoc, substate) ((asoc)->state |= substate) + +/* SCTP reachability state for each address */ +#define SCTP_ADDR_REACHABLE 0x001 +#define SCTP_ADDR_NOT_REACHABLE 0x002 +#define SCTP_ADDR_NOHB 0x004 +#define SCTP_ADDR_BEING_DELETED 0x008 +#define SCTP_ADDR_NOT_IN_ASSOC 0x010 +#define SCTP_ADDR_WAS_PRIMARY 0x020 +#define SCTP_ADDR_SWITCH_PRIMARY 0x040 +#define SCTP_ADDR_OUT_OF_SCOPE 0x080 +#define SCTP_ADDR_DOUBLE_SWITCH 0x100 +#define SCTP_ADDR_UNCONFIRMED 0x200 +#define SCTP_ADDR_REQ_PRIMARY 0x400 +/* JRS 5/13/07 - Added potentially failed state for CMT PF 
*/ +#define SCTP_ADDR_PF 0x800 +#define SCTP_REACHABLE_MASK 0x203 + +/* bound address types (e.g. valid address types to allow) */ +#define SCTP_BOUND_V6 0x01 +#define SCTP_BOUND_V4 0x02 + +/* + * what is the default number of mbufs in a chain I allow before switching to + * a cluster + */ +#define SCTP_DEFAULT_MBUFS_IN_CHAIN 5 + +/* How long a cookie lives in milliseconds */ +#define SCTP_DEFAULT_COOKIE_LIFE 60000 + +/* resource limit of streams */ +#define MAX_SCTP_STREAMS 2048 + +/* Maximum the mapping array will grow to (TSN mapping array) */ +#define SCTP_MAPPING_ARRAY 512 + +/* size of the initial malloc on the mapping array */ +#define SCTP_INITIAL_MAPPING_ARRAY 16 +/* how much we grow the mapping array each call */ +#define SCTP_MAPPING_ARRAY_INCR 32 + +/* + * Here we define the timer types used by the implementation as arguments in + * the set/get timer type calls. + */ +#define SCTP_TIMER_INIT 0 +#define SCTP_TIMER_RECV 1 +#define SCTP_TIMER_SEND 2 +#define SCTP_TIMER_HEARTBEAT 3 +#define SCTP_TIMER_PMTU 4 +#define SCTP_TIMER_MAXSHUTDOWN 5 +#define SCTP_TIMER_SIGNATURE 6 +/* + * number of timer types in the base SCTP structure used in the set/get and + * has the base default. + */ +#define SCTP_NUM_TMRS 7 + +/* timer types */ +#define SCTP_TIMER_TYPE_NONE 0 +#define SCTP_TIMER_TYPE_SEND 1 +#define SCTP_TIMER_TYPE_INIT 2 +#define SCTP_TIMER_TYPE_RECV 3 +#define SCTP_TIMER_TYPE_SHUTDOWN 4 +#define SCTP_TIMER_TYPE_HEARTBEAT 5 +#define SCTP_TIMER_TYPE_COOKIE 6 +#define SCTP_TIMER_TYPE_NEWCOOKIE 7 +#define SCTP_TIMER_TYPE_PATHMTURAISE 8 +#define SCTP_TIMER_TYPE_SHUTDOWNACK 9 +#define SCTP_TIMER_TYPE_ASCONF 10 +#define SCTP_TIMER_TYPE_SHUTDOWNGUARD 11 +#define SCTP_TIMER_TYPE_AUTOCLOSE 12 +#define SCTP_TIMER_TYPE_EVENTWAKE 13 +#define SCTP_TIMER_TYPE_STRRESET 14 +#define SCTP_TIMER_TYPE_INPKILL 15 +#define SCTP_TIMER_TYPE_EARLYFR 17 +#define SCTP_TIMER_TYPE_ASOCKILL 18 +#define SCTP_TIMER_TYPE_ADDR_WQ 19 +#define SCTP_TIMER_TYPE_ZERO_COPY 20 +#define SCTP_TIMER_TYPE_ZCOPY_SENDQ 21 +#define SCTP_TIMER_TYPE_PRIM_DELETED 22 +/* add new timers here - and increment LAST */ +#define SCTP_TIMER_TYPE_LAST 23 + +#define SCTP_IS_TIMER_TYPE_VALID(t) (((t) > SCTP_TIMER_TYPE_NONE) && \ + ((t) < SCTP_TIMER_TYPE_LAST)) + + + +/* max number of TSN's dup'd that I will hold */ +#define SCTP_MAX_DUP_TSNS 20 + +/* + * Here we define the types used when setting the retry amounts. + */ +/* How many drop re-attempts we make on INIT/COOKIE-ECHO */ +#define SCTP_RETRY_DROPPED_THRESH 4 + +/* + * Maximum number of chunks a single association can have on it. Note that + * this is a squishy number since the count can run over this if the user + * sends a large message down... the fragmented chunks don't count until + * AFTER the message is on queue; it would be the next send that blocks + * things. This number will get tuned up at boot in the sctp_init and use the + * number of clusters as a base. This way high bandwidth environments will + * not get impacted by the lower bandwidth sending a bunch of 1 byte chunks + */ +#define SCTP_ASOC_MAX_CHUNKS_ON_QUEUE 512 + + +/* The conversion from time to ticks and vice versa is done by rounding + * upwards. This way we can test in the code the time to be positive and + * know that this corresponds to a positive number of ticks. + */ +#define MSEC_TO_TICKS(x) ((hz == 1000) ? x : ((((x) * hz) + 999) / 1000)) +#define TICKS_TO_MSEC(x) ((hz == 1000) ?
x : ((((x) * 1000) + (hz - 1)) / hz)) + +#define SEC_TO_TICKS(x) ((x) * hz) +#define TICKS_TO_SEC(x) (((x) + (hz - 1)) / hz) + +/* + * Basically the minimum amount of time before I do an early FR. Making this + * value too low will cause duplicate retransmissions. + */ +#define SCTP_MINFR_MSEC_TIMER 250 +/* The floor this value is allowed to fall to when starting a timer. */ +#define SCTP_MINFR_MSEC_FLOOR 20 + +/* init timer def = 1 sec */ +#define SCTP_INIT_SEC 1 + +/* send timer def = 1 second */ +#define SCTP_SEND_SEC 1 + +/* recv timer def = 200ms */ +#define SCTP_RECV_MSEC 200 + +/* 30 seconds + RTO (in ms) */ +#define SCTP_HB_DEFAULT_MSEC 30000 + +/* Max time I will wait for Shutdown to complete */ +#define SCTP_DEF_MAX_SHUTDOWN_SEC 180 + + +/* + * This is how long a secret lives, NOT how long a cookie lives: how many + * ticks the current secret will live. + */ +#define SCTP_DEFAULT_SECRET_LIFE_SEC 3600 + +#define SCTP_RTO_UPPER_BOUND (60000) /* 60 sec in ms */ +#define SCTP_RTO_UPPER_BOUND_SEC 60 /* for the init timer */ +#define SCTP_RTO_LOWER_BOUND (1000) /* 1 sec in ms */ +#define SCTP_RTO_INITIAL (3000) /* 3 sec in ms */ + + +#define SCTP_INP_KILL_TIMEOUT 20 /* number of ms to retry kill of inpcb */ +#define SCTP_ASOC_KILL_TIMEOUT 10 /* number of ms to retry kill of asoc */ + +#define SCTP_DEF_MAX_INIT 8 +#define SCTP_DEF_MAX_SEND 10 +#define SCTP_DEF_MAX_PATH_RTX 5 + +#define SCTP_DEF_PMTU_RAISE_SEC 600 /* 10 min between raise attempts */ + + +/* How many streams I request initially by default */ +#define SCTP_OSTREAM_INITIAL 10 + +/* + * How many smallest_mtu's need to increase before a window update sack is + * sent (should be a power of 2). + */ +/* Send window update (incr * this > hiwat). Should be a power of 2 */ +#define SCTP_MINIMAL_RWND (4096) /* minimal rwnd */ + +#define SCTP_ADDRMAX 24 + +/* SCTP DEBUG Switch parameters */ +#define SCTP_DEBUG_TIMER1 0x00000001 +#define SCTP_DEBUG_TIMER2 0x00000002 /* unused */ +#define SCTP_DEBUG_TIMER3 0x00000004 /* unused */ +#define SCTP_DEBUG_TIMER4 0x00000008 +#define SCTP_DEBUG_OUTPUT1 0x00000010 +#define SCTP_DEBUG_OUTPUT2 0x00000020 +#define SCTP_DEBUG_OUTPUT3 0x00000040 +#define SCTP_DEBUG_OUTPUT4 0x00000080 +#define SCTP_DEBUG_UTIL1 0x00000100 +#define SCTP_DEBUG_UTIL2 0x00000200 /* unused */ +#define SCTP_DEBUG_AUTH1 0x00000400 +#define SCTP_DEBUG_AUTH2 0x00000800 /* unused */ +#define SCTP_DEBUG_INPUT1 0x00001000 +#define SCTP_DEBUG_INPUT2 0x00002000 +#define SCTP_DEBUG_INPUT3 0x00004000 +#define SCTP_DEBUG_INPUT4 0x00008000 /* unused */ +#define SCTP_DEBUG_ASCONF1 0x00010000 +#define SCTP_DEBUG_ASCONF2 0x00020000 +#define SCTP_DEBUG_OUTPUT5 0x00040000 /* unused */ +#define SCTP_DEBUG_XXX 0x00080000 /* unused */ +#define SCTP_DEBUG_PCB1 0x00100000 +#define SCTP_DEBUG_PCB2 0x00200000 /* unused */ +#define SCTP_DEBUG_PCB3 0x00400000 +#define SCTP_DEBUG_PCB4 0x00800000 +#define SCTP_DEBUG_INDATA1 0x01000000 +#define SCTP_DEBUG_INDATA2 0x02000000 /* unused */ +#define SCTP_DEBUG_INDATA3 0x04000000 /* unused */ +#define SCTP_DEBUG_CRCOFFLOAD 0x08000000 /* unused */ +#define SCTP_DEBUG_USRREQ1 0x10000000 /* unused */ +#define SCTP_DEBUG_USRREQ2 0x20000000 /* unused */ +#define SCTP_DEBUG_PEEL1 0x40000000 +#define SCTP_DEBUG_XXXXX 0x80000000 /* unused */ +#define SCTP_DEBUG_ALL 0x7ff3ffff +#define SCTP_DEBUG_NOISY 0x00040000 + +/* What sender needs to see to avoid SWS or we consider peer's rwnd 0 */ +#define SCTP_SWS_SENDER_DEF 1420 + +/* + * SWS is scaled to the sb_hiwat of the socket.
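Returning to the tick conversions defined above: the round-up guarantees that a positive interval always maps to a positive tick count. With hz = 100 (10 ms ticks), MSEC_TO_TICKS(15) = (15 * 100 + 999) / 1000 = 2, so a 15 ms request becomes 20 ms rather than being truncated, and MSEC_TO_TICKS(1) = 1 rather than 0; in the other direction, TICKS_TO_MSEC(2) = (2 * 1000 + 99) / 100 = 20.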
A value of 2 is hiwat/4, 1 + * would be hiwat/2 etc. + */ +/* What receiver needs to see in sockbuf or we tell peer its 1 */ +#define SCTP_SWS_RECEIVER_DEF 3000 + +#define SCTP_INITIAL_CWND 4380 + +#define SCTP_DEFAULT_MTU 1500 /* emergency default MTU */ +/* amount peer is obligated to have in rwnd or I will abort */ +#define SCTP_MIN_RWND 1500 + +#define SCTP_DEFAULT_MAXSEGMENT 65535 + +#define SCTP_CHUNK_BUFFER_SIZE 512 +#define SCTP_PARAM_BUFFER_SIZE 512 + +/* small chunk store for looking at chunk_list in auth */ +#define SCTP_SMALL_CHUNK_STORE 260 + +#define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ +#define SCTP_HOW_MANY_SECRETS 2 /* how many secrets I keep */ + +#define SCTP_NUMBER_OF_SECRETS 8 /* or 8 * 4 = 32 octets */ +#define SCTP_SECRET_SIZE 32 /* number of octets in a 256 bits */ + + +/* + * SCTP upper layer notifications + */ +#define SCTP_NOTIFY_ASSOC_UP 1 +#define SCTP_NOTIFY_ASSOC_DOWN 2 +#define SCTP_NOTIFY_INTERFACE_DOWN 3 +#define SCTP_NOTIFY_INTERFACE_UP 4 +#define SCTP_NOTIFY_DG_FAIL 5 +#define SCTP_NOTIFY_STRDATA_ERR 6 +#define SCTP_NOTIFY_ASSOC_ABORTED 7 +#define SCTP_NOTIFY_PEER_OPENED_STREAM 8 +#define SCTP_NOTIFY_STREAM_OPENED_OK 9 +#define SCTP_NOTIFY_ASSOC_RESTART 10 +#define SCTP_NOTIFY_HB_RESP 11 +#define SCTP_NOTIFY_ASCONF_SUCCESS 12 +#define SCTP_NOTIFY_ASCONF_FAILED 13 +#define SCTP_NOTIFY_PEER_SHUTDOWN 14 +#define SCTP_NOTIFY_ASCONF_ADD_IP 15 +#define SCTP_NOTIFY_ASCONF_DELETE_IP 16 +#define SCTP_NOTIFY_ASCONF_SET_PRIMARY 17 +#define SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION 18 +#define SCTP_NOTIFY_INTERFACE_CONFIRMED 20 +#define SCTP_NOTIFY_STR_RESET_RECV 21 +#define SCTP_NOTIFY_STR_RESET_SEND 22 +#define SCTP_NOTIFY_STR_RESET_FAILED_OUT 23 +#define SCTP_NOTIFY_STR_RESET_FAILED_IN 24 +#define SCTP_NOTIFY_AUTH_NEW_KEY 25 +#define SCTP_NOTIFY_AUTH_FREE_KEY 26 +#define SCTP_NOTIFY_SPECIAL_SP_FAIL 27 +#define SCTP_NOTIFY_NO_PEER_AUTH 28 +#define SCTP_NOTIFY_SENDER_DRY 29 +#define SCTP_NOTIFY_STR_RESET_ADD_OK 30 +#define SCTP_NOTIFY_STR_RESET_ADD_FAIL 31 +#define SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK 32 +#define SCTP_NOTIFY_MAX 32 + + +/* This is the value for messages that are NOT completely + * copied down where we will start to split the message. + * So, with our default, we split only if the piece we + * want to take will fill up a full MTU (assuming + * a 1500 byte MTU). + */ +#define SCTP_DEFAULT_SPLIT_POINT_MIN 2904 + +/* ABORT CODES and other tell-tale location + * codes are generated by adding the below + * to the instance id. 
+ */
+
+/* File defines */
+#define SCTP_FROM_SCTP_INPUT 0x10000000
+#define SCTP_FROM_SCTP_PCB 0x20000000
+#define SCTP_FROM_SCTP_INDATA 0x30000000
+#define SCTP_FROM_SCTP_TIMER 0x40000000
+#define SCTP_FROM_SCTP_USRREQ 0x50000000
+#define SCTP_FROM_SCTPUTIL 0x60000000
+#define SCTP_FROM_SCTP6_USRREQ 0x70000000
+#define SCTP_FROM_SCTP_ASCONF 0x80000000
+#define SCTP_FROM_SCTP_OUTPUT 0x90000000
+#define SCTP_FROM_SCTP_PEELOFF 0xa0000000
+#define SCTP_FROM_SCTP_PANDA 0xb0000000
+#define SCTP_FROM_SCTP_SYSCTL 0xc0000000
+
+/* Location IDs */
+#define SCTP_LOC_1 0x00000001
+#define SCTP_LOC_2 0x00000002
+#define SCTP_LOC_3 0x00000003
+#define SCTP_LOC_4 0x00000004
+#define SCTP_LOC_5 0x00000005
+#define SCTP_LOC_6 0x00000006
+#define SCTP_LOC_7 0x00000007
+#define SCTP_LOC_8 0x00000008
+#define SCTP_LOC_9 0x00000009
+#define SCTP_LOC_10 0x0000000a
+#define SCTP_LOC_11 0x0000000b
+#define SCTP_LOC_12 0x0000000c
+#define SCTP_LOC_13 0x0000000d
+#define SCTP_LOC_14 0x0000000e
+#define SCTP_LOC_15 0x0000000f
+#define SCTP_LOC_16 0x00000010
+#define SCTP_LOC_17 0x00000011
+#define SCTP_LOC_18 0x00000012
+#define SCTP_LOC_19 0x00000013
+#define SCTP_LOC_20 0x00000014
+#define SCTP_LOC_21 0x00000015
+#define SCTP_LOC_22 0x00000016
+#define SCTP_LOC_23 0x00000017
+#define SCTP_LOC_24 0x00000018
+#define SCTP_LOC_25 0x00000019
+#define SCTP_LOC_26 0x0000001a
+#define SCTP_LOC_27 0x0000001b
+#define SCTP_LOC_28 0x0000001c
+#define SCTP_LOC_29 0x0000001d
+#define SCTP_LOC_30 0x0000001e
+#define SCTP_LOC_31 0x0000001f
+#define SCTP_LOC_32 0x00000020
+#define SCTP_LOC_33 0x00000021
+
+
+/* Free assoc codes */
+#define SCTP_NORMAL_PROC 0
+#define SCTP_PCBFREE_NOFORCE 1
+#define SCTP_PCBFREE_FORCE 2
+
+/* From codes for adding addresses */
+#define SCTP_ADDR_IS_CONFIRMED 8
+#define SCTP_ADDR_DYNAMIC_ADDED 6
+#define SCTP_IN_COOKIE_PROC 100
+#define SCTP_ALLOC_ASOC 1
+#define SCTP_LOAD_ADDR_2 2
+#define SCTP_LOAD_ADDR_3 3
+#define SCTP_LOAD_ADDR_4 4
+#define SCTP_LOAD_ADDR_5 5
+
+#define SCTP_DONOT_SETSCOPE 0
+#define SCTP_DO_SETSCOPE 1
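+
+/*
+ * Example (editor's illustration, using only macros defined above): a file
+ * code and a location ID combine into the tell-tale abort codes stored in,
+ * e.g., last_abort_code:
+ *
+ *	stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1;
+ *
+ * When reading such a code back, (code & 0xf0000000) identifies the file
+ * (here SCTP_FROM_SCTP_INDATA) and (code & 0x0fffffff) the location within
+ * it (here SCTP_LOC_1).
+ */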
+
+
+/* This value determines the default for when
+ * we try to add more to the send queue, if
+ * there is room. This prevents us from cycling
+ * into the copy_resume routine too often if
+ * we have not got enough space to add a decently
+ * sized message. Note that if we have enough
+ * space to complete the message copy we will always
+ * add to the message, no matter what the size. It's
+ * only when we reach the point that some is left
+ * to add and there is only room for part of it that
+ * we use this threshold. It's also a sysctl.
+ */
+#define SCTP_DEFAULT_ADD_MORE 1452
+
+#ifndef SCTP_PCBHASHSIZE
+/* default number of association hash buckets in each endpoint */
+#define SCTP_PCBHASHSIZE 256
+#endif
+#ifndef SCTP_TCBHASHSIZE
+#define SCTP_TCBHASHSIZE 1024
+#endif
+
+#ifndef SCTP_CHUNKQUEUE_SCALE
+#define SCTP_CHUNKQUEUE_SCALE 10
+#endif
+
+/* clock variance is 1 ms */
+#define SCTP_CLOCK_GRANULARITY 1
+#define IP_HDR_SIZE 40 /* we use the size of an IPv6 header here; this
+ * overstates things slightly for IPv4 but it
+ * simplifies the IPv6 handling */
+
+/* Argument magic numbers for sctp_inpcb_free() */
+
+/* third argument */
+#define SCTP_CALLED_DIRECTLY_NOCMPSET 0
+#define SCTP_CALLED_AFTER_CMPSET_OFCLOSE 1
+#define SCTP_CALLED_FROM_INPKILL_TIMER 2
+/* second argument */
+#define SCTP_FREE_SHOULD_USE_ABORT 1
+#define SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE 0
+
+#ifndef IPPROTO_SCTP
+#define IPPROTO_SCTP 132 /* the Official IANA number :-) */
+#endif /* !IPPROTO_SCTP */
+
+#define SCTP_MAX_DATA_BUNDLING 256
+
+/* modular comparison */
+/* True if a > b (mod M) */
+#define compare_with_wrap(a, b, M) (((a > b) && ((a - b) < ((M >> 1) + 1))) || \
+ ((b > a) && ((b - a) > ((M >> 1) + 1))))
+
+
+/* Mapping array manipulation routines */
+#define SCTP_IS_TSN_PRESENT(arry, gap) ((arry[(gap >> 3)] >> (gap & 0x07)) & 0x01)
+#define SCTP_SET_TSN_PRESENT(arry, gap) (arry[(gap >> 3)] |= (0x01 << ((gap & 0x07))))
+#define SCTP_UNSET_TSN_PRESENT(arry, gap) (arry[(gap >> 3)] &= ((~(0x01 << ((gap & 0x07)))) & 0xff))
+#define SCTP_CALC_TSN_TO_GAP(gap, tsn, mapping_tsn) do { \
+ if (tsn >= mapping_tsn) { \
+ gap = tsn - mapping_tsn; \
+ } else { \
+ gap = (MAX_TSN - mapping_tsn) + tsn + 1; \
+ } \
+ } while(0)
+
+
+#define SCTP_RETRAN_DONE -1
+#define SCTP_RETRAN_EXIT -2
+
+/*
+ * This value defines the number of vtag time-wait entries per list
+ * element. Each entry takes two 4-byte ints (plus, of course, the
+ * overhead of the next pointer). Using 15 as an example yields
+ * ((8 * 15) + 8) or 128 bytes of overhead for each timewait block that
+ * gets initialized. Increasing it to 31 would yield 256 bytes per block.
+ */
+#define SCTP_NUMBER_IN_VTAG_BLOCK 15
+/*
+ * If we use the STACK option, we have an array of head pointers of this
+ * size. The tag is taken mod the array size to find the bucket, and then
+ * all entries in that bucket must be searched to see if the tag is in
+ * timed wait. If so, we reject it.
+ */
+#define SCTP_STACK_VTAG_HASH_SIZE 32
+
+/*
+ * Number of seconds of time wait for a vtag.
+ */
+#define SCTP_TIME_WAIT 60
+
+#define SCTP_SEND_BUFFER_SPLITTING 0x00000001
+#define SCTP_RECV_BUFFER_SPLITTING 0x00000002
+
+/* The system retains a cache of free chunks to
+ * cut down on calls to the memory allocation
+ * system. There is a per-association limit of
+ * free items and an overall system limit. If
+ * either one is hit, the resource stops being
+ * cached.
+ */
+
+#define SCTP_DEF_ASOC_RESC_LIMIT 10
+#define SCTP_DEF_SYSTEM_RESC_LIMIT 1000
+
+/*-
+ * defines for socket lock states.
+ * Used by __APPLE__ and SCTP_SO_LOCK_TESTING
+ */
+#define SCTP_SO_LOCKED 1
+#define SCTP_SO_NOT_LOCKED 0
+
+
+#define SCTP_HOLDS_LOCK 1
+#define SCTP_NOT_LOCKED 0
+
+/*-
+ * For address locks, do we hold the lock?
+ */
+#define SCTP_ADDR_LOCKED 1
+#define SCTP_ADDR_NOT_LOCKED 0
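+
+/*
+ * Worked example (editor's illustration): compare_with_wrap() implements
+ * serial-number comparison. With M == MAX_TSN (0xffffffff),
+ * compare_with_wrap(2, 0xfffffffe, MAX_TSN) is true, since
+ * (0xfffffffe - 2) == 0xfffffffc > 0x80000000: TSN 2 lies four TSNs
+ * "ahead" of 0xfffffffe across the wrap. Likewise SCTP_CALC_TSN_TO_GAP()
+ * with mapping_tsn == 0xfffffff0 and tsn == 5 takes the wrap branch and
+ * yields gap == (0xffffffff - 0xfffffff0) + 5 + 1 == 21, so
+ * SCTP_IS_TSN_PRESENT(array, gap) then tests bit 5 of array[2].
+ */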
+
+/* RFC 1918 private blocks: 10/8, 172.16/12 (second octet 16-31), 192.168/16 */
+#define IN4_ISPRIVATE_ADDRESS(a) \
+ ((((uint8_t *)&(a)->s_addr)[0] == 10) || \
+ ((((uint8_t *)&(a)->s_addr)[0] == 172) && \
+ (((uint8_t *)&(a)->s_addr)[1] >= 16) && \
+ (((uint8_t *)&(a)->s_addr)[1] <= 31)) || \
+ ((((uint8_t *)&(a)->s_addr)[0] == 192) && \
+ (((uint8_t *)&(a)->s_addr)[1] == 168)))
+
+#define IN4_ISLOOPBACK_ADDRESS(a) \
+ ((((uint8_t *)&(a)->s_addr)[0] == 127) && \
+ (((uint8_t *)&(a)->s_addr)[1] == 0) && \
+ (((uint8_t *)&(a)->s_addr)[2] == 0) && \
+ (((uint8_t *)&(a)->s_addr)[3] == 1))
+
+
+#if defined(_KERNEL)
+
+#define SCTP_GETTIME_TIMEVAL(x) (getmicrouptime(x))
+#define SCTP_GETPTIME_TIMEVAL(x) (microuptime(x))
+#endif
+/*#if defined(__FreeBSD__) || defined(__APPLE__)*/
+/*#define SCTP_GETTIME_TIMEVAL(x) { \*/
+/* (x)->tv_sec = ticks / 1000; \*/
+/* (x)->tv_usec = (ticks % 1000) * 1000; \*/
+/*}*/
+
+/*#else*/
+/*#define SCTP_GETTIME_TIMEVAL(x) (microtime(x))*/
+/*#endif __FreeBSD__ */
+
+#if defined(_KERNEL) || defined(__Userspace__)
+#define sctp_sowwakeup(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEOUTPUT; \
+ } else { \
+ sowwakeup(so); \
+ } \
+} while (0)
+
+#define sctp_sowwakeup_locked(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ SOCKBUF_UNLOCK(&((so)->so_snd)); \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEOUTPUT; \
+ } else { \
+ sowwakeup_locked(so); \
+ } \
+} while (0)
+
+#define sctp_sorwakeup(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEINPUT; \
+ } else { \
+ sorwakeup(so); \
+ } \
+} while (0)
+
+#define sctp_sorwakeup_locked(inp, so) \
+do { \
+ if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { \
+ inp->sctp_flags |= SCTP_PCB_FLAGS_WAKEINPUT; \
+ SOCKBUF_UNLOCK(&((so)->so_rcv)); \
+ } else { \
+ sorwakeup_locked(so); \
+ } \
+} while (0)
+
+#endif /* _KERNEL || __Userspace__ */
+#endif
diff --git a/freebsd/sys/netinet/sctp_crc32.c b/freebsd/sys/netinet/sctp_crc32.c
new file mode 100644
index 00000000..aa4c08cf
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_crc32.c
@@ -0,0 +1,148 @@
+#include
+
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_crc32.c,v 1.12 2005/03/06 16:04:17 itojun Exp $ */ + + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + + +#if !defined(SCTP_WITH_NO_CSUM) + +static uint32_t +sctp_finalize_crc32c(uint32_t crc32c) +{ + uint32_t result; + +#if BYTE_ORDER == BIG_ENDIAN + uint8_t byte0, byte1, byte2, byte3; + +#endif + /* Complement the result */ + result = ~crc32c; +#if BYTE_ORDER == BIG_ENDIAN + /* + * For BIG-ENDIAN.. aka Motorola byte order the result is in + * little-endian form. So we must manually swap the bytes. Then we + * can call htonl() which does nothing... + */ + byte0 = result & 0x000000ff; + byte1 = (result >> 8) & 0x000000ff; + byte2 = (result >> 16) & 0x000000ff; + byte3 = (result >> 24) & 0x000000ff; + crc32c = ((byte0 << 24) | (byte1 << 16) | (byte2 << 8) | byte3); +#else + /* + * For INTEL platforms the result comes out in network order. No + * htonl is required or the swap above. So we optimize out both the + * htonl and the manual swap above. + */ + crc32c = result; +#endif + return (crc32c); +} + +uint32_t +sctp_calculate_cksum(struct mbuf *m, uint32_t offset) +{ + /* + * given a mbuf chain with a packetheader offset by 'offset' + * pointing at a sctphdr (with csum set to 0) go through the chain + * of SCTP_BUF_NEXT()'s and calculate the SCTP checksum. This also + * has a side bonus as it will calculate the total length of the + * mbuf chain. Note: if offset is greater than the total mbuf + * length, checksum=1, pktlen=0 is returned (ie. no real error code) + */ + uint32_t base = 0xffffffff; + struct mbuf *at; + + at = m; + /* find the correct mbuf and offset into mbuf */ + while ((at != NULL) && (offset > (uint32_t) SCTP_BUF_LEN(at))) { + offset -= SCTP_BUF_LEN(at); /* update remaining offset + * left */ + at = SCTP_BUF_NEXT(at); + } + while (at != NULL) { + if ((SCTP_BUF_LEN(at) - offset) > 0) { + base = calculate_crc32c(base, + (unsigned char *)(SCTP_BUF_AT(at, offset)), + (unsigned int)(SCTP_BUF_LEN(at) - offset)); + } + if (offset) { + /* we only offset once into the first mbuf */ + if (offset < (uint32_t) SCTP_BUF_LEN(at)) + offset = 0; + else + offset -= SCTP_BUF_LEN(at); + } + at = SCTP_BUF_NEXT(at); + } + base = sctp_finalize_crc32c(base); + return (base); +} + +#endif /* !defined(SCTP_WITH_NO_CSUM) */ + + +void +sctp_delayed_cksum(struct mbuf *m, uint32_t offset) +{ +#if defined(SCTP_WITH_NO_CSUM) + panic("sctp_delayed_cksum() called when using no SCTP CRC."); +#else + uint32_t checksum; + + checksum = sctp_calculate_cksum(m, offset); + SCTP_STAT_DECR(sctps_sendhwcrc); + SCTP_STAT_INCR(sctps_sendswcrc); + offset += offsetof(struct sctphdr, checksum); + + if (offset + sizeof(uint32_t) > (uint32_t) (m->m_len)) { + printf("sctp_delayed_cksum(): m->len: %d, off: %d.\n", + (uint32_t) m->m_len, offset); + /* + * XXX this shouldn't happen, but if it does, the correct + * behavior may be to insert the checksum in the appropriate + * next mbuf in the chain. 
+ */ + return; + } + *(uint32_t *) (m->m_data + offset) = checksum; +#endif +} diff --git a/freebsd/sys/netinet/sctp_crc32.h b/freebsd/sys/netinet/sctp_crc32.h new file mode 100644 index 00000000..768b25d5 --- /dev/null +++ b/freebsd/sys/netinet/sctp_crc32.h @@ -0,0 +1,47 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_crc32.h,v 1.5 2004/08/17 04:06:16 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __crc32c_h__ +#define __crc32c_h__ + +#if defined(_KERNEL) +#if !defined(SCTP_WITH_NO_CSUM) +uint32_t sctp_calculate_cksum(struct mbuf *, uint32_t); + +#endif +void sctp_delayed_cksum(struct mbuf *, uint32_t offset); + +#endif /* _KERNEL */ +#endif /* __crc32c_h__ */ diff --git a/freebsd/sys/netinet/sctp_header.h b/freebsd/sys/netinet/sctp_header.h new file mode 100644 index 00000000..141bfcda --- /dev/null +++ b/freebsd/sys/netinet/sctp_header.h @@ -0,0 +1,624 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_header.h,v 1.14 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_header_h__ +#define __sctp_header_h__ + +#include +#include +#include + +#define SCTP_PACKED __attribute__((packed)) + +/* + * Parameter structures + */ +struct sctp_ipv4addr_param { + struct sctp_paramhdr ph;/* type=SCTP_IPV4_PARAM_TYPE, len=8 */ + uint32_t addr; /* IPV4 address */ +} SCTP_PACKED; + +#define SCTP_V6_ADDR_BYTES 16 + + +struct sctp_ipv6addr_param { + struct sctp_paramhdr ph;/* type=SCTP_IPV6_PARAM_TYPE, len=20 */ + uint8_t addr[SCTP_V6_ADDR_BYTES]; /* IPV6 address */ +} SCTP_PACKED; + +/* Cookie Preservative */ +struct sctp_cookie_perserve_param { + struct sctp_paramhdr ph;/* type=SCTP_COOKIE_PRESERVE, len=8 */ + uint32_t time; /* time in ms to extend cookie */ +} SCTP_PACKED; + +#define SCTP_ARRAY_MIN_LEN 1 +/* Host Name Address */ +struct sctp_host_name_param { + struct sctp_paramhdr ph;/* type=SCTP_HOSTNAME_ADDRESS */ + char name[SCTP_ARRAY_MIN_LEN]; /* host name */ +} SCTP_PACKED; + +/* + * This is the maximum padded size of a s-a-p + * so paramheadr + 3 address types (6 bytes) + 2 byte pad = 12 + */ +#define SCTP_MAX_ADDR_PARAMS_SIZE 12 +/* supported address type */ +struct sctp_supported_addr_param { + struct sctp_paramhdr ph;/* type=SCTP_SUPPORTED_ADDRTYPE */ + uint16_t addr_type[SCTP_ARRAY_MIN_LEN]; /* array of supported address + * types */ +} SCTP_PACKED; + +/* ECN parameter */ +struct sctp_ecn_supported_param { + struct sctp_paramhdr ph;/* type=SCTP_ECN_CAPABLE */ +} SCTP_PACKED; + + +/* heartbeat info parameter */ +struct sctp_heartbeat_info_param { + struct sctp_paramhdr ph; + uint32_t time_value_1; + uint32_t time_value_2; + uint32_t random_value1; + uint32_t random_value2; + uint16_t user_req; + uint8_t addr_family; + uint8_t addr_len; + char address[SCTP_ADDRMAX]; +} SCTP_PACKED; + + +/* draft-ietf-tsvwg-prsctp */ +/* PR-SCTP supported parameter */ +struct sctp_prsctp_supported_param { + struct sctp_paramhdr ph; +} SCTP_PACKED; + + +/* draft-ietf-tsvwg-addip-sctp */ +struct sctp_asconf_paramhdr { /* an ASCONF "parameter" */ + struct sctp_paramhdr ph;/* a SCTP parameter header */ + uint32_t correlation_id;/* correlation id for this param */ +} SCTP_PACKED; + +struct sctp_asconf_addr_param { /* an ASCONF address parameter */ + struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ + struct sctp_ipv6addr_param addrp; /* max storage size */ +} SCTP_PACKED; + + +struct sctp_asconf_tag_param { /* an ASCONF NAT-Vtag parameter */ + struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ + uint32_t local_vtag; + uint32_t remote_vtag; +} SCTP_PACKED; + + +struct sctp_asconf_addrv4_param { /* an ASCONF address (v4) parameter */ + struct sctp_asconf_paramhdr aph; /* asconf "parameter" */ + struct sctp_ipv4addr_param addrp; /* max storage size */ +} SCTP_PACKED; + +#define SCTP_MAX_SUPPORTED_EXT 256 + +struct sctp_supported_chunk_types_param { + struct sctp_paramhdr ph;/* type = 0x8008 len = x */ + uint8_t 
chunk_types[]; +} SCTP_PACKED; + + +/* ECN Nonce: draft-ladha-sctp-ecn-nonce */ +struct sctp_ecn_nonce_supported_param { + struct sctp_paramhdr ph;/* type = 0x8001 len = 4 */ +} SCTP_PACKED; + + +/* + * Structures for DATA chunks + */ +struct sctp_data { + uint32_t tsn; + uint16_t stream_id; + uint16_t stream_sequence; + uint32_t protocol_id; + /* user data follows */ +} SCTP_PACKED; + +struct sctp_data_chunk { + struct sctp_chunkhdr ch; + struct sctp_data dp; +} SCTP_PACKED; + +/* + * Structures for the control chunks + */ + +/* Initiate (INIT)/Initiate Ack (INIT ACK) */ +struct sctp_init { + uint32_t initiate_tag; /* initiate tag */ + uint32_t a_rwnd; /* a_rwnd */ + uint16_t num_outbound_streams; /* OS */ + uint16_t num_inbound_streams; /* MIS */ + uint32_t initial_tsn; /* I-TSN */ + /* optional param's follow */ +} SCTP_PACKED; + +#define SCTP_IDENTIFICATION_SIZE 16 +#define SCTP_ADDRESS_SIZE 4 +#define SCTP_RESERVE_SPACE 6 +/* state cookie header */ +struct sctp_state_cookie { /* this is our definition... */ + uint8_t identification[SCTP_IDENTIFICATION_SIZE]; /* id of who we are */ + struct timeval time_entered; /* the time I built cookie */ + uint32_t cookie_life; /* life I will award this cookie */ + uint32_t tie_tag_my_vtag; /* my tag in old association */ + + uint32_t tie_tag_peer_vtag; /* peers tag in old association */ + uint32_t peers_vtag; /* peers tag in INIT (for quick ref) */ + + uint32_t my_vtag; /* my tag in INIT-ACK (for quick ref) */ + uint32_t address[SCTP_ADDRESS_SIZE]; /* 4 ints/128 bits */ + uint32_t addr_type; /* address type */ + uint32_t laddress[SCTP_ADDRESS_SIZE]; /* my local from address */ + uint32_t laddr_type; /* my local from address type */ + uint32_t scope_id; /* v6 scope id for link-locals */ + + uint16_t peerport; /* port address of the peer in the INIT */ + uint16_t myport; /* my port address used in the INIT */ + uint8_t ipv4_addr_legal;/* Are V4 addr legal? */ + uint8_t ipv6_addr_legal;/* Are V6 addr legal? */ + uint8_t local_scope; /* IPv6 local scope flag */ + uint8_t site_scope; /* IPv6 site scope flag */ + + uint8_t ipv4_scope; /* IPv4 private addr scope */ + uint8_t loopback_scope; /* loopback scope information */ + uint8_t reserved[SCTP_RESERVE_SPACE]; /* Align to 64 bits */ + /* + * at the end is tacked on the INIT chunk and the INIT-ACK chunk + * (minus the cookie). + */ +} SCTP_PACKED; + + +/* Used for NAT state error cause */ +struct sctp_missing_nat_state { + uint16_t cause; + uint16_t length; + uint8_t data[]; +} SCTP_PACKED; + + +struct sctp_inv_mandatory_param { + uint16_t cause; + uint16_t length; + uint32_t num_param; + uint16_t param; + /* + * We include this to 0 it since only a missing cookie will cause + * this error. + */ + uint16_t resv; +} SCTP_PACKED; + +struct sctp_unresolv_addr { + uint16_t cause; + uint16_t length; + uint16_t addr_type; + uint16_t reserved; /* Only one invalid addr type */ +} SCTP_PACKED; + +/* state cookie parameter */ +struct sctp_state_cookie_param { + struct sctp_paramhdr ph; + struct sctp_state_cookie cookie; +} SCTP_PACKED; + +struct sctp_init_chunk { + struct sctp_chunkhdr ch; + struct sctp_init init; +} SCTP_PACKED; + +struct sctp_init_msg { + struct sctphdr sh; + struct sctp_init_chunk msg; +} SCTP_PACKED; + +/* ... 
used for both INIT and INIT ACK */ +#define sctp_init_ack sctp_init +#define sctp_init_ack_chunk sctp_init_chunk +#define sctp_init_ack_msg sctp_init_msg + + +/* Selective Ack (SACK) */ +struct sctp_gap_ack_block { + uint16_t start; /* Gap Ack block start */ + uint16_t end; /* Gap Ack block end */ +} SCTP_PACKED; + +struct sctp_sack { + uint32_t cum_tsn_ack; /* cumulative TSN Ack */ + uint32_t a_rwnd; /* updated a_rwnd of sender */ + uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */ + uint16_t num_dup_tsns; /* number of duplicate TSNs */ + /* struct sctp_gap_ack_block's follow */ + /* uint32_t duplicate_tsn's follow */ +} SCTP_PACKED; + +struct sctp_sack_chunk { + struct sctp_chunkhdr ch; + struct sctp_sack sack; +} SCTP_PACKED; + +struct sctp_nr_sack { + uint32_t cum_tsn_ack; /* cumulative TSN Ack */ + uint32_t a_rwnd; /* updated a_rwnd of sender */ + uint16_t num_gap_ack_blks; /* number of Gap Ack blocks */ + uint16_t num_nr_gap_ack_blks; /* number of NR Gap Ack blocks */ + uint16_t num_dup_tsns; /* number of duplicate TSNs */ + uint16_t reserved; /* not currently used */ + /* struct sctp_gap_ack_block's follow */ + /* uint32_t duplicate_tsn's follow */ +} SCTP_PACKED; + +struct sctp_nr_sack_chunk { + struct sctp_chunkhdr ch; + struct sctp_nr_sack nr_sack; +} SCTP_PACKED; + + +/* Heartbeat Request (HEARTBEAT) */ +struct sctp_heartbeat { + struct sctp_heartbeat_info_param hb_info; +} SCTP_PACKED; + +struct sctp_heartbeat_chunk { + struct sctp_chunkhdr ch; + struct sctp_heartbeat heartbeat; +} SCTP_PACKED; + +/* ... used for Heartbeat Ack (HEARTBEAT ACK) */ +#define sctp_heartbeat_ack sctp_heartbeat +#define sctp_heartbeat_ack_chunk sctp_heartbeat_chunk + + +/* Abort Asssociation (ABORT) */ +struct sctp_abort_chunk { + struct sctp_chunkhdr ch; + /* optional error cause may follow */ +} SCTP_PACKED; + +struct sctp_abort_msg { + struct sctphdr sh; + struct sctp_abort_chunk msg; +} SCTP_PACKED; + + +/* Shutdown Association (SHUTDOWN) */ +struct sctp_shutdown_chunk { + struct sctp_chunkhdr ch; + uint32_t cumulative_tsn_ack; +} SCTP_PACKED; + + +/* Shutdown Acknowledgment (SHUTDOWN ACK) */ +struct sctp_shutdown_ack_chunk { + struct sctp_chunkhdr ch; +} SCTP_PACKED; + + +/* Operation Error (ERROR) */ +struct sctp_error_chunk { + struct sctp_chunkhdr ch; + /* optional error causes follow */ +} SCTP_PACKED; + + +/* Cookie Echo (COOKIE ECHO) */ +struct sctp_cookie_echo_chunk { + struct sctp_chunkhdr ch; + struct sctp_state_cookie cookie; +} SCTP_PACKED; + +/* Cookie Acknowledgment (COOKIE ACK) */ +struct sctp_cookie_ack_chunk { + struct sctp_chunkhdr ch; +} SCTP_PACKED; + +/* Explicit Congestion Notification Echo (ECNE) */ +struct sctp_ecne_chunk { + struct sctp_chunkhdr ch; + uint32_t tsn; +} SCTP_PACKED; + +/* Congestion Window Reduced (CWR) */ +struct sctp_cwr_chunk { + struct sctp_chunkhdr ch; + uint32_t tsn; +} SCTP_PACKED; + +/* Shutdown Complete (SHUTDOWN COMPLETE) */ +struct sctp_shutdown_complete_chunk { + struct sctp_chunkhdr ch; +} SCTP_PACKED; + +/* Oper error holding a stale cookie */ +struct sctp_stale_cookie_msg { + struct sctp_paramhdr ph;/* really an error cause */ + uint32_t time_usec; +} SCTP_PACKED; + +struct sctp_adaptation_layer_indication { + struct sctp_paramhdr ph; + uint32_t indication; +} SCTP_PACKED; + +struct sctp_cookie_while_shutting_down { + struct sctphdr sh; + struct sctp_chunkhdr ch; + struct sctp_paramhdr ph;/* really an error cause */ +} SCTP_PACKED; + +struct sctp_shutdown_complete_msg { + struct sctphdr sh; + struct sctp_shutdown_complete_chunk 
shut_cmp; +} SCTP_PACKED; + +/* + * draft-ietf-tsvwg-addip-sctp + */ +/* Address/Stream Configuration Change (ASCONF) */ +struct sctp_asconf_chunk { + struct sctp_chunkhdr ch; + uint32_t serial_number; + /* lookup address parameter (mandatory) */ + /* asconf parameters follow */ +} SCTP_PACKED; + +/* Address/Stream Configuration Acknowledge (ASCONF ACK) */ +struct sctp_asconf_ack_chunk { + struct sctp_chunkhdr ch; + uint32_t serial_number; + /* asconf parameters follow */ +} SCTP_PACKED; + +/* draft-ietf-tsvwg-prsctp */ +/* Forward Cumulative TSN (FORWARD TSN) */ +struct sctp_forward_tsn_chunk { + struct sctp_chunkhdr ch; + uint32_t new_cumulative_tsn; + /* stream/sequence pairs (sctp_strseq) follow */ +} SCTP_PACKED; + +struct sctp_strseq { + uint16_t stream; + uint16_t sequence; +} SCTP_PACKED; + +struct sctp_forward_tsn_msg { + struct sctphdr sh; + struct sctp_forward_tsn_chunk msg; +} SCTP_PACKED; + +/* should be a multiple of 4 - 1 aka 3/7/11 etc. */ + +#define SCTP_NUM_DB_TO_VERIFY 31 + +struct sctp_chunk_desc { + uint8_t chunk_type; + uint8_t data_bytes[SCTP_NUM_DB_TO_VERIFY]; + uint32_t tsn_ifany; +} SCTP_PACKED; + + +struct sctp_pktdrop_chunk { + struct sctp_chunkhdr ch; + uint32_t bottle_bw; + uint32_t current_onq; + uint16_t trunc_len; + uint16_t reserved; + uint8_t data[]; +} SCTP_PACKED; + +/**********STREAM RESET STUFF ******************/ + +struct sctp_stream_reset_out_request { + struct sctp_paramhdr ph; + uint32_t request_seq; /* monotonically increasing seq no */ + uint32_t response_seq; /* if a response, the resp seq no */ + uint32_t send_reset_at_tsn; /* last TSN I assigned outbound */ + uint16_t list_of_streams[]; /* if not all list of streams */ +} SCTP_PACKED; + +struct sctp_stream_reset_in_request { + struct sctp_paramhdr ph; + uint32_t request_seq; + uint16_t list_of_streams[]; /* if not all list of streams */ +} SCTP_PACKED; + + +struct sctp_stream_reset_tsn_request { + struct sctp_paramhdr ph; + uint32_t request_seq; +} SCTP_PACKED; + +struct sctp_stream_reset_response { + struct sctp_paramhdr ph; + uint32_t response_seq; /* if a response, the resp seq no */ + uint32_t result; +} SCTP_PACKED; + +struct sctp_stream_reset_response_tsn { + struct sctp_paramhdr ph; + uint32_t response_seq; /* if a response, the resp seq no */ + uint32_t result; + uint32_t senders_next_tsn; + uint32_t receivers_next_tsn; +} SCTP_PACKED; + +struct sctp_stream_reset_add_strm { + struct sctp_paramhdr ph; + uint32_t request_seq; + uint16_t number_of_streams; + uint16_t reserved; +} SCTP_PACKED; + +#define SCTP_STREAM_RESET_NOTHING 0x00000000 /* Nothing for me to do */ +#define SCTP_STREAM_RESET_PERFORMED 0x00000001 /* Did it */ +#define SCTP_STREAM_RESET_DENIED 0x00000002 /* refused to do it */ +#define SCTP_STREAM_RESET_ERROR_STR 0x00000003 /* bad Stream no */ +#define SCTP_STREAM_RESET_TRY_LATER 0x00000004 /* collision, try again */ +#define SCTP_STREAM_RESET_BAD_SEQNO 0x00000005 /* bad str-reset seq no */ + +/* + * convience structures, note that if you are making a request for specific + * streams then the request will need to be an overlay structure. 
+ */ + +struct sctp_stream_reset_out_req { + struct sctp_chunkhdr ch; + struct sctp_stream_reset_out_request sr_req; +} SCTP_PACKED; + +struct sctp_stream_reset_in_req { + struct sctp_chunkhdr ch; + struct sctp_stream_reset_in_request sr_req; +} SCTP_PACKED; + +struct sctp_stream_reset_tsn_req { + struct sctp_chunkhdr ch; + struct sctp_stream_reset_tsn_request sr_req; +} SCTP_PACKED; + +struct sctp_stream_reset_resp { + struct sctp_chunkhdr ch; + struct sctp_stream_reset_response sr_resp; +} SCTP_PACKED; + +/* respone only valid with a TSN request */ +struct sctp_stream_reset_resp_tsn { + struct sctp_chunkhdr ch; + struct sctp_stream_reset_response_tsn sr_resp; +} SCTP_PACKED; + +/****************************************************/ + +/* + * Authenticated chunks support draft-ietf-tsvwg-sctp-auth + */ + +/* Should we make the max be 32? */ +#define SCTP_RANDOM_MAX_SIZE 256 +struct sctp_auth_random { + struct sctp_paramhdr ph;/* type = 0x8002 */ + uint8_t random_data[]; +} SCTP_PACKED; + +struct sctp_auth_chunk_list { + struct sctp_paramhdr ph;/* type = 0x8003 */ + uint8_t chunk_types[]; +} SCTP_PACKED; + +struct sctp_auth_hmac_algo { + struct sctp_paramhdr ph;/* type = 0x8004 */ + uint16_t hmac_ids[]; +} SCTP_PACKED; + +struct sctp_auth_chunk { + struct sctp_chunkhdr ch; + uint16_t shared_key_id; + uint16_t hmac_id; + uint8_t hmac[]; +} SCTP_PACKED; + +struct sctp_auth_invalid_hmac { + struct sctp_paramhdr ph; + uint16_t hmac_id; + uint16_t padding; +} SCTP_PACKED; + +/* + * we pre-reserve enough room for a ECNE or CWR AND a SACK with no missing + * pieces. If ENCE is missing we could have a couple of blocks. This way we + * optimize so we MOST likely can bundle a SACK/ECN with the smallest size + * data chunk I will split into. We could increase throughput slightly by + * taking out these two but the 24-sack/8-CWR i.e. 32 bytes I pre-reserve I + * feel is worth it for now. + */ +#ifndef SCTP_MAX_OVERHEAD +#ifdef INET6 +#define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \ + sizeof(struct sctphdr) + \ + sizeof(struct sctp_ecne_chunk) + \ + sizeof(struct sctp_sack_chunk) + \ + sizeof(struct ip6_hdr)) + +#define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \ + sizeof(struct sctphdr) + \ + sizeof(struct ip6_hdr)) + + +#define SCTP_MIN_OVERHEAD (sizeof(struct ip6_hdr) + \ + sizeof(struct sctphdr)) + +#else +#define SCTP_MAX_OVERHEAD (sizeof(struct sctp_data_chunk) + \ + sizeof(struct sctphdr) + \ + sizeof(struct sctp_ecne_chunk) + \ + sizeof(struct sctp_sack_chunk) + \ + sizeof(struct ip)) + +#define SCTP_MED_OVERHEAD (sizeof(struct sctp_data_chunk) + \ + sizeof(struct sctphdr) + \ + sizeof(struct ip)) + + +#define SCTP_MIN_OVERHEAD (sizeof(struct ip) + \ + sizeof(struct sctphdr)) + +#endif /* INET6 */ +#endif /* !SCTP_MAX_OVERHEAD */ + +#define SCTP_MED_V4_OVERHEAD (sizeof(struct sctp_data_chunk) + \ + sizeof(struct sctphdr) + \ + sizeof(struct ip)) + +#define SCTP_MIN_V4_OVERHEAD (sizeof(struct ip) + \ + sizeof(struct sctphdr)) + +#undef SCTP_PACKED +#endif /* !__sctp_header_h__ */ diff --git a/freebsd/sys/netinet/sctp_indata.c b/freebsd/sys/netinet/sctp_indata.c new file mode 100644 index 00000000..963b3205 --- /dev/null +++ b/freebsd/sys/netinet/sctp_indata.c @@ -0,0 +1,5800 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_indata.c,v 1.36 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * NOTES: On the outbound side of things I need to check the sack timer to + * see if I should generate a sack into the chunk queue (if I have data to + * send that is and will be sending it .. for bundling. + * + * The callback in sctp_usrreq.c will get called when the socket is read from. + * This will cause sctp_service_queues() to get called on the top entry in + * the list. + */ + +void +sctp_set_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + asoc->my_rwnd = sctp_calc_rwnd(stcb, asoc); +} + +/* Calculate what the rwnd would be */ +uint32_t +sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + uint32_t calc = 0; + + /* + * This is really set wrong with respect to a 1-2-m socket. Since + * the sb_cc is the count that everyone as put up. When we re-write + * sctp_soreceive then we will fix this so that ONLY this + * associations data is taken into account. + */ + if (stcb->sctp_socket == NULL) + return (calc); + + if (stcb->asoc.sb_cc == 0 && + asoc->size_on_reasm_queue == 0 && + asoc->size_on_all_streams == 0) { + /* Full rwnd granted */ + calc = max(SCTP_SB_LIMIT_RCV(stcb->sctp_socket), SCTP_MINIMAL_RWND); + return (calc); + } + /* get actual space */ + calc = (uint32_t) sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv); + + /* + * take out what has NOT been put on socket queue and we yet hold + * for putting up. 
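+	 * Each queued chunk is additionally charged MSIZE bytes of mbuf
+	 * overhead in the subtraction below, so bookkeeping space is not
+	 * advertised to the peer as receive window.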
+ */ + calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_reasm_queue + + asoc->cnt_on_reasm_queue * MSIZE)); + calc = sctp_sbspace_sub(calc, (uint32_t) (asoc->size_on_all_streams + + asoc->cnt_on_all_streams * MSIZE)); + + if (calc == 0) { + /* out of space */ + return (calc); + } + /* what is the overhead of all these rwnd's */ + calc = sctp_sbspace_sub(calc, stcb->asoc.my_rwnd_control_len); + /* + * If the window gets too small due to ctrl-stuff, reduce it to 1, + * even it is 0. SWS engaged + */ + if (calc < stcb->asoc.my_rwnd_control_len) { + calc = 1; + } + return (calc); +} + + + +/* + * Build out our readq entry based on the incoming packet. + */ +struct sctp_queued_to_read * +sctp_build_readq_entry(struct sctp_tcb *stcb, + struct sctp_nets *net, + uint32_t tsn, uint32_t ppid, + uint32_t context, uint16_t stream_no, + uint16_t stream_seq, uint8_t flags, + struct mbuf *dm) +{ + struct sctp_queued_to_read *read_queue_e = NULL; + + sctp_alloc_a_readq(stcb, read_queue_e); + if (read_queue_e == NULL) { + goto failed_build; + } + read_queue_e->sinfo_stream = stream_no; + read_queue_e->sinfo_ssn = stream_seq; + read_queue_e->sinfo_flags = (flags << 8); + read_queue_e->sinfo_ppid = ppid; + read_queue_e->sinfo_context = stcb->asoc.context; + read_queue_e->sinfo_timetolive = 0; + read_queue_e->sinfo_tsn = tsn; + read_queue_e->sinfo_cumtsn = tsn; + read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); + read_queue_e->whoFrom = net; + read_queue_e->length = 0; + atomic_add_int(&net->ref_count, 1); + read_queue_e->data = dm; + read_queue_e->spec_flags = 0; + read_queue_e->tail_mbuf = NULL; + read_queue_e->aux_data = NULL; + read_queue_e->stcb = stcb; + read_queue_e->port_from = stcb->rport; + read_queue_e->do_not_ref_stcb = 0; + read_queue_e->end_added = 0; + read_queue_e->some_taken = 0; + read_queue_e->pdapi_aborted = 0; +failed_build: + return (read_queue_e); +} + + +/* + * Build out our readq entry based on the incoming packet. 
+ */ +static struct sctp_queued_to_read * +sctp_build_readq_entry_chk(struct sctp_tcb *stcb, + struct sctp_tmit_chunk *chk) +{ + struct sctp_queued_to_read *read_queue_e = NULL; + + sctp_alloc_a_readq(stcb, read_queue_e); + if (read_queue_e == NULL) { + goto failed_build; + } + read_queue_e->sinfo_stream = chk->rec.data.stream_number; + read_queue_e->sinfo_ssn = chk->rec.data.stream_seq; + read_queue_e->sinfo_flags = (chk->rec.data.rcv_flags << 8); + read_queue_e->sinfo_ppid = chk->rec.data.payloadtype; + read_queue_e->sinfo_context = stcb->asoc.context; + read_queue_e->sinfo_timetolive = 0; + read_queue_e->sinfo_tsn = chk->rec.data.TSN_seq; + read_queue_e->sinfo_cumtsn = chk->rec.data.TSN_seq; + read_queue_e->sinfo_assoc_id = sctp_get_associd(stcb); + read_queue_e->whoFrom = chk->whoTo; + read_queue_e->aux_data = NULL; + read_queue_e->length = 0; + atomic_add_int(&chk->whoTo->ref_count, 1); + read_queue_e->data = chk->data; + read_queue_e->tail_mbuf = NULL; + read_queue_e->stcb = stcb; + read_queue_e->port_from = stcb->rport; + read_queue_e->spec_flags = 0; + read_queue_e->do_not_ref_stcb = 0; + read_queue_e->end_added = 0; + read_queue_e->some_taken = 0; + read_queue_e->pdapi_aborted = 0; +failed_build: + return (read_queue_e); +} + + +struct mbuf * +sctp_build_ctl_nchunk(struct sctp_inpcb *inp, + struct sctp_sndrcvinfo *sinfo) +{ + struct sctp_sndrcvinfo *outinfo; + struct cmsghdr *cmh; + struct mbuf *ret; + int len; + int use_extended = 0; + + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + /* user does not want the sndrcv ctl */ + return (NULL); + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + use_extended = 1; + len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + } else { + len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + } + + + ret = sctp_get_mbuf_for_msg(len, + 0, M_DONTWAIT, 1, MT_DATA); + + if (ret == NULL) { + /* No space */ + return (ret); + } + /* We need a CMSG header followed by the struct */ + cmh = mtod(ret, struct cmsghdr *); + outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); + cmh->cmsg_level = IPPROTO_SCTP; + if (use_extended) { + cmh->cmsg_type = SCTP_EXTRCV; + cmh->cmsg_len = len; + memcpy(outinfo, sinfo, len); + } else { + cmh->cmsg_type = SCTP_SNDRCV; + cmh->cmsg_len = len; + *outinfo = *sinfo; + } + SCTP_BUF_LEN(ret) = cmh->cmsg_len; + return (ret); +} + + +char * +sctp_build_ctl_cchunk(struct sctp_inpcb *inp, + int *control_len, + struct sctp_sndrcvinfo *sinfo) +{ + struct sctp_sndrcvinfo *outinfo; + struct cmsghdr *cmh; + char *buf; + int len; + int use_extended = 0; + + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) { + /* user does not want the sndrcv ctl */ + return (NULL); + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + use_extended = 1; + len = CMSG_LEN(sizeof(struct sctp_extrcvinfo)); + } else { + len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + } + SCTP_MALLOC(buf, char *, len, SCTP_M_CMSG); + if (buf == NULL) { + /* No space */ + return (buf); + } + /* We need a CMSG header followed by the struct */ + cmh = (struct cmsghdr *)buf; + outinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmh); + cmh->cmsg_level = IPPROTO_SCTP; + if (use_extended) { + cmh->cmsg_type = SCTP_EXTRCV; + cmh->cmsg_len = len; + memcpy(outinfo, sinfo, len); + } else { + cmh->cmsg_type = SCTP_SNDRCV; + cmh->cmsg_len = len; + *outinfo = *sinfo; + } + *control_len = len; + return (buf); +} + +static void +sctp_mark_non_revokable(struct sctp_association *asoc, uint32_t tsn) +{ + uint32_t gap, i, cumackp1; + int fnd = 0; + + 
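+	/*
+	 * Move a delivered TSN from the renegable mapping_array over to
+	 * nr_mapping_array, so that later SACKs report it inside a
+	 * non-renegable (NR) gap block. If reneging is disabled
+	 * (sctp_do_drain == 0) data is never taken back, so the
+	 * distinction is moot and we do nothing.
+	 */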
if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { + return; + } + cumackp1 = asoc->cumulative_tsn + 1; + if (compare_with_wrap(cumackp1, tsn, MAX_TSN)) { + /* + * this tsn is behind the cum ack and thus we don't need to + * worry about it being moved from one to the other. + */ + return; + } + SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn); + if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { + printf("gap:%x tsn:%x\n", gap, tsn); + sctp_print_mapping_array(asoc); +#ifdef INVARIANTS + panic("Things are really messed up now!!"); +#endif + } + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = tsn; + } + if (tsn == asoc->highest_tsn_inside_map) { + /* We must back down to see what the new highest is */ + for (i = tsn - 1; (compare_with_wrap(i, asoc->mapping_array_base_tsn, MAX_TSN) || + (i == asoc->mapping_array_base_tsn)); i--) { + SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); + if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { + asoc->highest_tsn_inside_map = i; + fnd = 1; + break; + } + } + if (!fnd) { + asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; + } + } +} + + +/* + * We are delivering currently from the reassembly queue. We must continue to + * deliver until we either: 1) run out of space. 2) run out of sequential + * TSN's 3) hit the SCTP_DATA_LAST_FRAG flag. + */ +static void +sctp_service_reassembly(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + uint16_t nxt_todel; + uint16_t stream_no; + int end = 0; + int cntDel; + + struct sctp_queued_to_read *control, *ctl, *ctlat; + + if (stcb == NULL) + return; + + cntDel = stream_no = 0; + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { + /* socket above is long gone or going.. */ +abandon: + asoc->fragmented_delivery_inprogress = 0; + chk = TAILQ_FIRST(&asoc->reasmqueue); + while (chk) { + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + asoc->size_on_reasm_queue -= chk->send_size; + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + /* + * Lose the data pointer, since its in the socket + * buffer + */ + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + /* Now free the address and data */ + sctp_free_a_chunk(stcb, chk); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + } + return; + } + SCTP_TCB_LOCK_ASSERT(stcb); + do { + chk = TAILQ_FIRST(&asoc->reasmqueue); + if (chk == NULL) { + return; + } + if (chk->rec.data.TSN_seq != (asoc->tsn_last_delivered + 1)) { + /* Can't deliver more :< */ + return; + } + stream_no = chk->rec.data.stream_number; + nxt_todel = asoc->strmin[stream_no].last_sequence_delivered + 1; + if (nxt_todel != chk->rec.data.stream_seq && + (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { + /* + * Not the next sequence to deliver in its stream OR + * unordered + */ + return; + } + if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + + control = sctp_build_readq_entry_chk(stcb, chk); + if (control == NULL) { + /* out of memory? 
*/ + return; + } + /* save it off for our future deliveries */ + stcb->asoc.control_pdapi = control; + if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) + end = 1; + else + end = 0; + sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); + sctp_add_to_readq(stcb->sctp_ep, + stcb, control, &stcb->sctp_socket->so_rcv, end, + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + cntDel++; + } else { + if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) + end = 1; + else + end = 0; + sctp_mark_non_revokable(asoc, chk->rec.data.TSN_seq); + if (sctp_append_to_readq(stcb->sctp_ep, stcb, + stcb->asoc.control_pdapi, + chk->data, end, chk->rec.data.TSN_seq, + &stcb->sctp_socket->so_rcv)) { + /* + * something is very wrong, either + * control_pdapi is NULL, or the tail_mbuf + * is corrupt, or there is a EOM already on + * the mbuf chain. + */ + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + goto abandon; + } else { +#ifdef INVARIANTS + if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { + panic("This should not happen control_pdapi NULL?"); + } + /* if we did not panic, it was a EOM */ + panic("Bad chunking ??"); +#else + if ((stcb->asoc.control_pdapi == NULL) || (stcb->asoc.control_pdapi->tail_mbuf == NULL)) { + SCTP_PRINTF("This should not happen control_pdapi NULL?\n"); + } + SCTP_PRINTF("Bad chunking ??\n"); + SCTP_PRINTF("Dumping re-assembly queue this will probably hose the association\n"); + +#endif + goto abandon; + } + } + cntDel++; + } + /* pull it we did it */ + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { + asoc->fragmented_delivery_inprogress = 0; + if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { + asoc->strmin[stream_no].last_sequence_delivered++; + } + if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { + SCTP_STAT_INCR_COUNTER64(sctps_reasmusrmsgs); + } + } else if (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + /* + * turn the flag back on since we just delivered + * yet another one. + */ + asoc->fragmented_delivery_inprogress = 1; + } + asoc->tsn_of_pdapi_last_delivered = chk->rec.data.TSN_seq; + asoc->last_flags_delivered = chk->rec.data.rcv_flags; + asoc->last_strm_seq_delivered = chk->rec.data.stream_seq; + asoc->last_strm_no_delivered = chk->rec.data.stream_number; + + asoc->tsn_last_delivered = chk->rec.data.TSN_seq; + asoc->size_on_reasm_queue -= chk->send_size; + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + /* free up the chk */ + chk->data = NULL; + sctp_free_a_chunk(stcb, chk); + + if (asoc->fragmented_delivery_inprogress == 0) { + /* + * Now lets see if we can deliver the next one on + * the stream + */ + struct sctp_stream_in *strm; + + strm = &asoc->strmin[stream_no]; + nxt_todel = strm->last_sequence_delivered + 1; + ctl = TAILQ_FIRST(&strm->inqueue); + if (ctl && (nxt_todel == ctl->sinfo_ssn)) { + while (ctl != NULL) { + /* Deliver more if we can. 
*/ + if (nxt_todel == ctl->sinfo_ssn) { + ctlat = TAILQ_NEXT(ctl, next); + TAILQ_REMOVE(&strm->inqueue, ctl, next); + asoc->size_on_all_streams -= ctl->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + strm->last_sequence_delivered++; + sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + ctl, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + ctl = ctlat; + } else { + break; + } + nxt_todel = strm->last_sequence_delivered + 1; + } + } + break; + } + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + } while (chk); +} + +/* + * Queue the chunk either right into the socket buffer if it is the next one + * to go OR put it in the correct place in the delivery queue. If we do + * append to the so_buf, keep doing so until we are out of order. One big + * question still remains, what to do when the socket buffer is FULL?? + */ +static void +sctp_queue_data_to_stream(struct sctp_tcb *stcb, struct sctp_association *asoc, + struct sctp_queued_to_read *control, int *abort_flag) +{ + /* + * FIX-ME maybe? What happens when the ssn wraps? If we are getting + * all the data in one stream this could happen quite rapidly. One + * could use the TSN to keep track of things, but this scheme breaks + * down in the other type of stream useage that could occur. Send a + * single msg to stream 0, send 4Billion messages to stream 1, now + * send a message to stream 0. You have a situation where the TSN + * has wrapped but not in the stream. Is this worth worrying about + * or should we just change our queue sort at the bottom to be by + * TSN. + * + * Could it also be legal for a peer to send ssn 1 with TSN 2 and ssn 2 + * with TSN 1? If the peer is doing some sort of funky TSN/SSN + * assignment this could happen... and I don't see how this would be + * a violation. So for now I am undecided an will leave the sort by + * SSN alone. Maybe a hybred approach is the answer + * + */ + struct sctp_stream_in *strm; + struct sctp_queued_to_read *at; + int queue_needed; + uint16_t nxt_todel; + struct mbuf *oper; + + queue_needed = 1; + asoc->size_on_all_streams += control->length; + sctp_ucount_incr(asoc->cnt_on_all_streams); + strm = &asoc->strmin[control->sinfo_stream]; + nxt_todel = strm->last_sequence_delivered + 1; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INTO_STRD); + } + SCTPDBG(SCTP_DEBUG_INDATA1, + "queue to stream called for ssn:%u lastdel:%u nxt:%u\n", + (uint32_t) control->sinfo_stream, + (uint32_t) strm->last_sequence_delivered, + (uint32_t) nxt_todel); + if (compare_with_wrap(strm->last_sequence_delivered, + control->sinfo_ssn, MAX_SEQ) || + (strm->last_sequence_delivered == control->sinfo_ssn)) { + /* The incoming sseq is behind where we last delivered? 
*/ + SCTPDBG(SCTP_DEBUG_INDATA1, "Duplicate S-SEQ:%d delivered:%d from peer, Abort association\n", + control->sinfo_ssn, strm->last_sequence_delivered); +protocol_error: + /* + * throw it in the stream so it gets cleaned up in + * association destruction + */ + TAILQ_INSERT_HEAD(&strm->inqueue, control, next); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + (sizeof(uint32_t) * 3); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_1); + ippp++; + *ippp = control->sinfo_tsn; + ippp++; + *ippp = ((control->sinfo_stream << 16) | control->sinfo_ssn); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_1; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + + } + if (nxt_todel == control->sinfo_ssn) { + /* can be delivered right away? */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_IMMED_DEL); + } + /* EY it wont be queued if it could be delivered directly */ + queue_needed = 0; + asoc->size_on_all_streams -= control->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + strm->last_sequence_delivered++; + + sctp_mark_non_revokable(asoc, control->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + control = TAILQ_FIRST(&strm->inqueue); + while (control != NULL) { + /* all delivered */ + nxt_todel = strm->last_sequence_delivered + 1; + if (nxt_todel == control->sinfo_ssn) { + at = TAILQ_NEXT(control, next); + TAILQ_REMOVE(&strm->inqueue, control, next); + asoc->size_on_all_streams -= control->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + strm->last_sequence_delivered++; + /* + * We ignore the return of deliver_data here + * since we always can hold the chunk on the + * d-queue. And we have a finite number that + * can be delivered from the strq. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, NULL, + SCTP_STR_LOG_FROM_IMMED_DEL); + } + sctp_mark_non_revokable(asoc, control->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, + SCTP_SO_NOT_LOCKED); + control = at; + continue; + } + break; + } + } + if (queue_needed) { + /* + * Ok, we did not deliver this guy, find the correct place + * to put it on the queue. 
+ */ + if ((compare_with_wrap(asoc->cumulative_tsn, + control->sinfo_tsn, MAX_TSN)) || + (control->sinfo_tsn == asoc->cumulative_tsn)) { + goto protocol_error; + } + if (TAILQ_EMPTY(&strm->inqueue)) { + /* Empty queue */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, NULL, SCTP_STR_LOG_FROM_INSERT_HD); + } + TAILQ_INSERT_HEAD(&strm->inqueue, control, next); + } else { + TAILQ_FOREACH(at, &strm->inqueue, next) { + if (compare_with_wrap(at->sinfo_ssn, + control->sinfo_ssn, MAX_SEQ)) { + /* + * one in queue is bigger than the + * new one, insert before this one + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, at, + SCTP_STR_LOG_FROM_INSERT_MD); + } + TAILQ_INSERT_BEFORE(at, control, next); + break; + } else if (at->sinfo_ssn == control->sinfo_ssn) { + /* + * Gak, He sent me a duplicate str + * seq number + */ + /* + * foo bar, I guess I will just free + * this new guy, should we abort + * too? FIX ME MAYBE? Or it COULD be + * that the SSN's have wrapped. + * Maybe I should compare to TSN + * somehow... sigh for now just blow + * away the chunk! + */ + + if (control->data) + sctp_m_freem(control->data); + control->data = NULL; + asoc->size_on_all_streams -= control->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + if (control->whoFrom) { + sctp_free_remote_addr(control->whoFrom); + control->whoFrom = NULL; + } + sctp_free_a_readq(stcb, control); + return; + } else { + if (TAILQ_NEXT(at, next) == NULL) { + /* + * We are at the end, insert + * it after this one + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del(control, at, + SCTP_STR_LOG_FROM_INSERT_TL); + } + TAILQ_INSERT_AFTER(&strm->inqueue, + at, control, next); + break; + } + } + } + } + } +} + +/* + * Returns two things: You get the total size of the deliverable parts of the + * first fragmented message on the reassembly queue. And you get a 1 back if + * all of the message is ready or a 0 back if the message is still incomplete + */ +static int +sctp_is_all_msg_on_reasm(struct sctp_association *asoc, uint32_t * t_size) +{ + struct sctp_tmit_chunk *chk; + uint32_t tsn; + + *t_size = 0; + chk = TAILQ_FIRST(&asoc->reasmqueue); + if (chk == NULL) { + /* nothing on the queue */ + return (0); + } + if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0) { + /* Not a first on the queue */ + return (0); + } + tsn = chk->rec.data.TSN_seq; + while (chk) { + if (tsn != chk->rec.data.TSN_seq) { + return (0); + } + *t_size += chk->send_size; + if (chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { + return (1); + } + tsn++; + chk = TAILQ_NEXT(chk, sctp_next); + } + return (0); +} + +static void +sctp_deliver_reasm_check(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + uint16_t nxt_todel; + uint32_t tsize, pd_point; + +doit_again: + chk = TAILQ_FIRST(&asoc->reasmqueue); + if (chk == NULL) { + /* Huh? */ + asoc->size_on_reasm_queue = 0; + asoc->cnt_on_reasm_queue = 0; + return; + } + if (asoc->fragmented_delivery_inprogress == 0) { + nxt_todel = + asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; + if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && + (nxt_todel == chk->rec.data.stream_seq || + (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { + /* + * Yep the first one is here and its ok to deliver + * but should we? 
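+			 * (We start a partial delivery only if the whole
+			 * message is on the reassembly queue, or at least
+			 * pd_point bytes of it, per the checks below.)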
+			 */
+			if (stcb->sctp_socket) {
+				pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket),
+				    stcb->sctp_ep->partial_delivery_point);
+			} else {
+				pd_point = stcb->sctp_ep->partial_delivery_point;
+			}
+			if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
+				/*
+				 * Yes, we setup to start reception, by
+				 * backing down the TSN just in case we
+				 * can't deliver.
+				 */
+				asoc->fragmented_delivery_inprogress = 1;
+				asoc->tsn_last_delivered =
+				    chk->rec.data.TSN_seq - 1;
+				asoc->str_of_pdapi =
+				    chk->rec.data.stream_number;
+				asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
+				asoc->pdapi_ppid = chk->rec.data.payloadtype;
+				asoc->fragment_flags = chk->rec.data.rcv_flags;
+				sctp_service_reassembly(stcb, asoc);
+			}
+		}
+	} else {
+		/*
+		 * Service re-assembly will deliver stream data queued at
+		 * the end of fragmented delivery... but it won't know to go
+		 * back and call itself again... we do that here with the
+		 * goto doit_again.
+		 */
+		sctp_service_reassembly(stcb, asoc);
+		if (asoc->fragmented_delivery_inprogress == 0) {
+			/*
+			 * finished our fragmented delivery, could be more
+			 * waiting?
+			 */
+			goto doit_again;
+		}
+	}
+}
+
+/*
+ * Dump onto the re-assembly queue, in its proper place. After dumping on the
+ * queue, see if anything can be delivered. If so pull it off (or as much as
+ * we can). If we run out of space then we must dump what we can and set the
+ * appropriate flag to say we queued what we could.
+ */
+static void
+sctp_queue_data_for_reasm(struct sctp_tcb *stcb, struct sctp_association *asoc,
+    struct sctp_tmit_chunk *chk, int *abort_flag)
+{
+	struct mbuf *oper;
+	uint32_t cum_ackp1, last_tsn, prev_tsn, post_tsn;
+	u_char last_flags;
+	struct sctp_tmit_chunk *at, *prev, *next;
+
+	prev = next = NULL;
+	cum_ackp1 = asoc->tsn_last_delivered + 1;
+	if (TAILQ_EMPTY(&asoc->reasmqueue)) {
+		/* This is the first one on the queue */
+		TAILQ_INSERT_HEAD(&asoc->reasmqueue, chk, sctp_next);
+		/*
+		 * we do not check for delivery of anything when only one
+		 * fragment is here
+		 */
+		asoc->size_on_reasm_queue = chk->send_size;
+		sctp_ucount_incr(asoc->cnt_on_reasm_queue);
+		if (chk->rec.data.TSN_seq == cum_ackp1) {
+			if (asoc->fragmented_delivery_inprogress == 0 &&
+			    (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) !=
+			    SCTP_DATA_FIRST_FRAG) {
+				/*
+				 * An empty queue, no delivery inprogress,
+				 * we hit the next one and it does NOT have
+				 * a FIRST fragment mark.
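Each protocol-violation abort in this function builds the same operational-error mbuf: a parameter header carrying cause code 13 (Protocol Violation, RFC 4960) followed by three 32-bit words holding an internal location code, the offending TSN, and the stream number/SSN packed together. The payload layout as a standalone sketch over a flat buffer (struct and function names invented):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>	/* htons/htonl; the kernel gets these elsewhere */

struct param_hdr {	/* mirrors struct sctp_paramhdr */
	uint16_t param_type;
	uint16_t param_length;
};

#define CAUSE_PROTOCOL_VIOLATION 13

/* Fill buf with the cause used by the aborts above; returns its size. */
static size_t
build_protocol_violation_cause(uint8_t *buf, uint32_t location,
    uint32_t tsn, uint16_t strm_no, uint16_t strm_seq)
{
	struct param_hdr ph;
	uint32_t words[3];
	size_t len = sizeof(ph) + sizeof(words);

	ph.param_type = htons(CAUSE_PROTOCOL_VIOLATION);
	ph.param_length = htons((uint16_t)len);
	words[0] = htonl(location);	/* SCTP_FROM_SCTP_INDATA + SCTP_LOC_x */
	words[1] = tsn;			/* left in host order, as above */
	words[2] = ((uint32_t)strm_no << 16) | strm_seq;
	memcpy(buf, &ph, sizeof(ph));
	memcpy(buf + sizeof(ph), words, sizeof(words));
	return (len);
}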
+ */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not first, no fragmented delivery in progress\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (sizeof(uint32_t) * 3); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_2); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_2; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + } else if (asoc->fragmented_delivery_inprogress && + (chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { + /* + * We are doing a partial delivery and the + * NEXT chunk MUST be either the LAST or + * MIDDLE fragment NOT a FIRST + */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS a first and fragmented delivery in progress\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_3); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_3; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + } else if (asoc->fragmented_delivery_inprogress) { + /* + * Here we are ok with a MIDDLE or LAST + * piece + */ + if (chk->rec.data.stream_number != + asoc->str_of_pdapi) { + /* Got to be the right STR No */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream number %d vs %d\n", + chk->rec.data.stream_number, + asoc->str_of_pdapi); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (sizeof(uint32_t) * 3); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_4); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_4; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + } else if ((asoc->fragment_flags & SCTP_DATA_UNORDERED) != + SCTP_DATA_UNORDERED && + chk->rec.data.stream_seq != asoc->ssn_of_pdapi) { + /* Got to be the right STR Seq */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it IS not same stream seq %d vs %d\n", + 
chk->rec.data.stream_seq, + asoc->ssn_of_pdapi); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_5); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_5; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + } + } + } + return; + } + /* Find its place */ + TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { + if (compare_with_wrap(at->rec.data.TSN_seq, + chk->rec.data.TSN_seq, MAX_TSN)) { + /* + * one in queue is bigger than the new one, insert + * before this one + */ + /* A check */ + asoc->size_on_reasm_queue += chk->send_size; + sctp_ucount_incr(asoc->cnt_on_reasm_queue); + next = at; + TAILQ_INSERT_BEFORE(at, chk, sctp_next); + break; + } else if (at->rec.data.TSN_seq == chk->rec.data.TSN_seq) { + /* Gak, He sent me a duplicate str seq number */ + /* + * foo bar, I guess I will just free this new guy, + * should we abort too? FIX ME MAYBE? Or it COULD be + * that the SSN's have wrapped. Maybe I should + * compare to TSN somehow... sigh for now just blow + * away the chunk! + */ + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk); + return; + } else { + last_flags = at->rec.data.rcv_flags; + last_tsn = at->rec.data.TSN_seq; + prev = at; + if (TAILQ_NEXT(at, sctp_next) == NULL) { + /* + * We are at the end, insert it after this + * one + */ + /* check it first */ + asoc->size_on_reasm_queue += chk->send_size; + sctp_ucount_incr(asoc->cnt_on_reasm_queue); + TAILQ_INSERT_AFTER(&asoc->reasmqueue, at, chk, sctp_next); + break; + } + } + } + /* Now the audits */ + if (prev) { + prev_tsn = chk->rec.data.TSN_seq - 1; + if (prev_tsn == prev->rec.data.TSN_seq) { + /* + * Ok the one I am dropping onto the end is the + * NEXT. A bit of valdiation here. 
+ */ + if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_FIRST_FRAG || + (prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_MIDDLE_FRAG) { + /* + * Insert chk MUST be a MIDDLE or LAST + * fragment + */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_FIRST_FRAG) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - It can be a midlle or last but not a first\n"); + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, it's a FIRST!\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_6); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_6; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + return; + } + if (chk->rec.data.stream_number != + prev->rec.data.stream_number) { + /* + * Huh, need the correct STR here, + * they must be the same. + */ + SCTP_PRINTF("Prev check - Gak, Evil plot, ssn:%d not the same as at:%d\n", + chk->rec.data.stream_number, + prev->rec.data.stream_number); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_7); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_7; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + if ((prev->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 && + chk->rec.data.stream_seq != + prev->rec.data.stream_seq) { + /* + * Huh, need the correct STR here, + * they must be the same. 
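The prev-neighbor audit above reduces to a small rule on the B (first) and E (last) fragment bits, together with the requirement that stream number and, for ordered data, stream sequence agree across FIRST/MIDDLE boundaries. The bit rule in isolation, with flag values chosen to match the usual DATA-chunk B/E encoding (a sketch, not the kernel's macros):

#define FRAG_MASK	0x03
#define FRAG_LAST	0x01	/* E: ends a message */
#define FRAG_FIRST	0x02	/* B: begins a message */
#define FRAG_MIDDLE	0x00	/* neither bit: interior piece */

/* May a fragment with flags `cur` sit at the TSN directly after one
 * with flags `prev`?  This is what the prev/next audits enforce. */
static int
frag_may_follow(int prev, int cur)
{
	switch (prev & FRAG_MASK) {
	case FRAG_FIRST:
	case FRAG_MIDDLE:
		/* message continues: MIDDLE or LAST, never a new FIRST */
		return ((cur & FRAG_MASK) == FRAG_MIDDLE ||
		    (cur & FRAG_MASK) == FRAG_LAST);
	case FRAG_LAST:
		/* message just ended: the neighbor must start a new one */
		return ((cur & FRAG_MASK) == FRAG_FIRST);
	default:
		/* B|E (unfragmented) never sits on the reassembly queue */
		return (0);
	}
}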
+ */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, Evil plot, sseq:%d not the same as at:%d\n", + chk->rec.data.stream_seq, + prev->rec.data.stream_seq); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_8); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_8; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + } else if ((prev->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_LAST_FRAG) { + /* Insert chk MUST be a FIRST */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != + SCTP_DATA_FIRST_FRAG) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Prev check - Gak, evil plot, its not FIRST and it must be!\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_9); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_9; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + } + } + } + if (next) { + post_tsn = chk->rec.data.TSN_seq + 1; + if (post_tsn == next->rec.data.TSN_seq) { + /* + * Ok the one I am inserting ahead of is my NEXT + * one. A bit of valdiation here. 
+ */ + if (next->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) { + /* Insert chk MUST be a last fragment */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) + != SCTP_DATA_LAST_FRAG) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is FIRST, we must be LAST\n"); + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, its not a last!\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_10); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_10; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + } else if ((next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_MIDDLE_FRAG || + (next->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_LAST_FRAG) { + /* + * Insert chk CAN be MIDDLE or FIRST NOT + * LAST + */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) == + SCTP_DATA_LAST_FRAG) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Next is a MIDDLE/LAST\n"); + SCTPDBG(SCTP_DEBUG_INDATA1, "Gak, Evil plot, new prev chunk is a LAST\n"); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_11); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_11; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + if (chk->rec.data.stream_number != + next->rec.data.stream_number) { + /* + * Huh, need the correct STR here, + * they must be the same. 
+ */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, ssn:%d not the same as at:%d\n", + chk->rec.data.stream_number, + next->rec.data.stream_number); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_12); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_12; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + if ((next->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0 && + chk->rec.data.stream_seq != + next->rec.data.stream_seq) { + /* + * Huh, need the correct STR here, + * they must be the same. + */ + SCTPDBG(SCTP_DEBUG_INDATA1, "Next chk - Gak, Evil plot, sseq:%d not the same as at:%d\n", + chk->rec.data.stream_seq, + next->rec.data.stream_seq); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_13); + ippp++; + *ippp = chk->rec.data.TSN_seq; + ippp++; + *ippp = ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_13; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return; + } + } + } + } + /* Do we need to do some delivery? check */ + sctp_deliver_reasm_check(stcb, asoc); +} + +/* + * This is an unfortunate routine. It checks to make sure a evil guy is not + * stuffing us full of bad packet fragments. A broken peer could also do this + * but this is doubtful. It is to bad I must worry about evil crackers sigh + * :< more cycles. + */ +static int +sctp_does_tsn_belong_to_reasm(struct sctp_association *asoc, + uint32_t TSN_seq) +{ + struct sctp_tmit_chunk *at; + uint32_t tsn_est; + + TAILQ_FOREACH(at, &asoc->reasmqueue, sctp_next) { + if (compare_with_wrap(TSN_seq, + at->rec.data.TSN_seq, MAX_TSN)) { + /* is it one bigger? */ + tsn_est = at->rec.data.TSN_seq + 1; + if (tsn_est == TSN_seq) { + /* yep. It better be a last then */ + if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != + SCTP_DATA_LAST_FRAG) { + /* + * Ok this guy belongs next to a guy + * that is NOT last, it should be a + * middle/last, not a complete + * chunk. + */ + return (1); + } else { + /* + * This guy is ok since its a LAST + * and the new chunk is a fully + * self- contained one. + */ + return (0); + } + } + } else if (TSN_seq == at->rec.data.TSN_seq) { + /* Software error since I have a dup? */ + return (1); + } else { + /* + * Ok, 'at' is larger than new chunk but does it + * need to be right before it. 
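Every ordering test in this file funnels through compare_with_wrap(), a strictly-greater-than on a circular sequence space (MAX_TSN for 32-bit TSNs, MAX_SEQ for 16-bit SSNs). One conventional way to write such a comparison, in the RFC 1982 serial-number style (a sketch, not the kernel's exact definition):

#include <stdint.h>

/* Is a strictly "after" b on the 32-bit circular TSN space? */
static int
tsn_after(uint32_t a, uint32_t b)
{
	return (a != b) && ((uint32_t)(a - b) < ((uint32_t)1 << 31));
}

/* 16-bit variant for stream sequence numbers. */
static int
ssn_after(uint16_t a, uint16_t b)
{
	return (a != b) && ((uint16_t)(a - b) < (uint16_t)(1 << 15));
}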
+ */ + tsn_est = TSN_seq + 1; + if (tsn_est == at->rec.data.TSN_seq) { + /* Yep, It better be a first */ + if ((at->rec.data.rcv_flags & SCTP_DATA_FRAG_MASK) != + SCTP_DATA_FIRST_FRAG) { + return (1); + } else { + return (0); + } + } + } + } + return (0); +} + + +static int +sctp_process_a_data_chunk(struct sctp_tcb *stcb, struct sctp_association *asoc, + struct mbuf **m, int offset, struct sctp_data_chunk *ch, int chk_length, + struct sctp_nets *net, uint32_t * high_tsn, int *abort_flag, + int *break_flag, int last_chunk) +{ + /* Process a data chunk */ + /* struct sctp_tmit_chunk *chk; */ + struct sctp_tmit_chunk *chk; + uint32_t tsn, gap; + struct mbuf *dmbuf; + int indx, the_len; + int need_reasm_check = 0; + uint16_t strmno, strmseq; + struct mbuf *oper; + struct sctp_queued_to_read *control; + int ordered; + uint32_t protocol_id; + uint8_t chunk_flags; + struct sctp_stream_reset_list *liste; + + chk = NULL; + tsn = ntohl(ch->dp.tsn); + chunk_flags = ch->ch.chunk_flags; + if ((chunk_flags & SCTP_DATA_SACK_IMMEDIATELY) == SCTP_DATA_SACK_IMMEDIATELY) { + asoc->send_sack = 1; + } + protocol_id = ch->dp.protocol_id; + ordered = ((chunk_flags & SCTP_DATA_UNORDERED) == 0); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, SCTP_MAP_TSN_ENTERS); + } + if (stcb == NULL) { + return (0); + } + SCTP_LTRACE_CHK(stcb->sctp_ep, stcb, ch->ch.chunk_type, tsn); + if (compare_with_wrap(asoc->cumulative_tsn, tsn, MAX_TSN) || + asoc->cumulative_tsn == tsn) { + /* It is a duplicate */ + SCTP_STAT_INCR(sctps_recvdupdata); + if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) { + /* Record a dup for the next outbound sack */ + asoc->dup_tsns[asoc->numduptsns] = tsn; + asoc->numduptsns++; + } + asoc->send_sack = 1; + return (0); + } + /* Calculate the number of TSN's between the base and this TSN */ + SCTP_CALC_TSN_TO_GAP(gap, tsn, asoc->mapping_array_base_tsn); + if (gap >= (SCTP_MAPPING_ARRAY << 3)) { + /* Can't hold the bit in the mapping at max array, toss it */ + return (0); + } + if (gap >= (uint32_t) (asoc->mapping_array_size << 3)) { + SCTP_TCB_LOCK_ASSERT(stcb); + if (sctp_expand_mapping_array(asoc, gap)) { + /* Can't expand, drop it */ + return (0); + } + } + if (compare_with_wrap(tsn, *high_tsn, MAX_TSN)) { + *high_tsn = tsn; + } + /* See if we have received this one already */ + if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap) || + SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, gap)) { + SCTP_STAT_INCR(sctps_recvdupdata); + if (asoc->numduptsns < SCTP_MAX_DUP_TSNS) { + /* Record a dup for the next outbound sack */ + asoc->dup_tsns[asoc->numduptsns] = tsn; + asoc->numduptsns++; + } + asoc->send_sack = 1; + return (0); + } + /* + * Check to see about the GONE flag, duplicates would cause a sack + * to be sent up above + */ + if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) + ) { + /* + * wait a minute, this guy is gone, there is no longer a + * receiver. Send peer an ABORT! + */ + struct mbuf *op_err; + + op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC); + sctp_abort_an_association(stcb->sctp_ep, stcb, 0, op_err, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + return (0); + } + /* + * Now before going further we see if there is room. If NOT then we + * MAY let one through only IF this TSN is the one we are waiting + * for on a partial delivery API. 
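The duplicate test above turns a TSN into a bit offset: gap = tsn - mapping_array_base_tsn (unsigned subtraction handles wrap), which then indexes a byte array at eight TSNs per byte; a gap at or beyond size * 8 means the TSN cannot be tracked and the chunk is dropped. The bit bookkeeping with hypothetical helper names:

#include <stdint.h>

/* distance from the array base; uint32_t arithmetic wraps correctly */
static uint32_t
tsn_to_gap(uint32_t tsn, uint32_t base)
{
	return (tsn - base);
}

static int
map_is_set(const uint8_t *map, uint32_t gap)
{
	return ((map[gap >> 3] >> (gap & 7)) & 1);
}

static void
map_set(uint8_t *map, uint32_t gap)
{
	map[gap >> 3] |= (uint8_t)(1 << (gap & 7));
}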
+ */ + + /* now do the tests */ + if (((asoc->cnt_on_all_streams + + asoc->cnt_on_reasm_queue + + asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) || + (((int)asoc->my_rwnd) <= 0)) { + /* + * When we have NO room in the rwnd we check to make sure + * the reader is doing its job... + */ + if (stcb->sctp_socket->so_rcv.sb_cc) { + /* some to read, wake-up */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return (0); + } +#endif + sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + /* now is it in the mapping array of what we have accepted? */ + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_map, MAX_TSN) && + compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + /* Nope not in the valid range dump it */ + sctp_set_rwnd(stcb, asoc); + if ((asoc->cnt_on_all_streams + + asoc->cnt_on_reasm_queue + + asoc->cnt_msg_on_sb) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)) { + SCTP_STAT_INCR(sctps_datadropchklmt); + } else { + SCTP_STAT_INCR(sctps_datadroprwnd); + } + indx = *break_flag; + *break_flag = 1; + return (0); + } + } + strmno = ntohs(ch->dp.stream_id); + if (strmno >= asoc->streamincnt) { + struct sctp_paramhdr *phdr; + struct mbuf *mb; + + mb = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) * 2), + 0, M_DONTWAIT, 1, MT_DATA); + if (mb != NULL) { + /* add some space up front so prepend will work well */ + SCTP_BUF_RESV_UF(mb, sizeof(struct sctp_chunkhdr)); + phdr = mtod(mb, struct sctp_paramhdr *); + /* + * Error causes are just param's and this one has + * two back to back phdr, one with the error type + * and size, the other with the streamid and a rsvd + */ + SCTP_BUF_LEN(mb) = (sizeof(struct sctp_paramhdr) * 2); + phdr->param_type = htons(SCTP_CAUSE_INVALID_STREAM); + phdr->param_length = + htons(sizeof(struct sctp_paramhdr) * 2); + phdr++; + /* We insert the stream in the type field */ + phdr->param_type = ch->dp.stream_id; + /* And set the length to 0 for the rsvd field */ + phdr->param_length = 0; + sctp_queue_op_err(stcb, mb); + } + SCTP_STAT_INCR(sctps_badsid); + SCTP_TCB_LOCK_ASSERT(stcb); + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = tsn; + } + if (tsn == (asoc->cumulative_tsn + 1)) { + /* Update cum-ack */ + asoc->cumulative_tsn = tsn; + } + return (0); + } + /* + * Before we continue lets validate that we are not being fooled by + * an evil attacker. We can only have 4k chunks based on our TSN + * spread allowed by the mapping array 512 * 8 bits, so there is no + * way our stream sequence numbers could have wrapped. We of course + * only validate the FIRST fragment so the bit must be set. 
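For an out-of-range stream id the code queues error cause 1 (Invalid Stream Identifier) built from two back-to-back parameter headers: the first carries the cause code and total length, the second reuses its two 16-bit fields for the stream id and a reserved zero. The same eight bytes as a plain struct (names invented):

#include <stdint.h>
#include <arpa/inet.h>

struct invalid_stream_cause {
	uint16_t cause_code;	/* htons(1): Invalid Stream Identifier */
	uint16_t cause_length;	/* htons(8): covers all four fields */
	uint16_t stream_id;	/* copied straight from the wire */
	uint16_t reserved;	/* must be zero */
};

static void
fill_invalid_stream(struct invalid_stream_cause *c, uint16_t wire_sid)
{
	c->cause_code = htons(1);
	c->cause_length = htons((uint16_t)sizeof(*c));
	c->stream_id = wire_sid;	/* not byte-swapped, as above */
	c->reserved = 0;
}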
+ */ + strmseq = ntohs(ch->dp.stream_sequence); +#ifdef SCTP_ASOCLOG_OF_TSNS + SCTP_TCB_LOCK_ASSERT(stcb); + if (asoc->tsn_in_at >= SCTP_TSN_LOG_SIZE) { + asoc->tsn_in_at = 0; + asoc->tsn_in_wrapped = 1; + } + asoc->in_tsnlog[asoc->tsn_in_at].tsn = tsn; + asoc->in_tsnlog[asoc->tsn_in_at].strm = strmno; + asoc->in_tsnlog[asoc->tsn_in_at].seq = strmseq; + asoc->in_tsnlog[asoc->tsn_in_at].sz = chk_length; + asoc->in_tsnlog[asoc->tsn_in_at].flgs = chunk_flags; + asoc->in_tsnlog[asoc->tsn_in_at].stcb = (void *)stcb; + asoc->in_tsnlog[asoc->tsn_in_at].in_pos = asoc->tsn_in_at; + asoc->in_tsnlog[asoc->tsn_in_at].in_out = 1; + asoc->tsn_in_at++; +#endif + if ((chunk_flags & SCTP_DATA_FIRST_FRAG) && + (TAILQ_EMPTY(&asoc->resetHead)) && + (chunk_flags & SCTP_DATA_UNORDERED) == 0 && + (compare_with_wrap(asoc->strmin[strmno].last_sequence_delivered, + strmseq, MAX_SEQ) || + asoc->strmin[strmno].last_sequence_delivered == strmseq)) { + /* The incoming sseq is behind where we last delivered? */ + SCTPDBG(SCTP_DEBUG_INDATA1, "EVIL/Broken-Dup S-SEQ:%d delivered:%d from peer, Abort!\n", + strmseq, asoc->strmin[strmno].last_sequence_delivered); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_14); + ippp++; + *ippp = tsn; + ippp++; + *ippp = ((strmno << 16) | strmseq); + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_14; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + *abort_flag = 1; + return (0); + } + /************************************ + * From here down we may find ch-> invalid + * so its a good idea NOT to use it. 
+ *************************************/ + + the_len = (chk_length - sizeof(struct sctp_data_chunk)); + if (last_chunk == 0) { + dmbuf = SCTP_M_COPYM(*m, + (offset + sizeof(struct sctp_data_chunk)), + the_len, M_DONTWAIT); +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = dmbuf; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + } else { + /* We can steal the last chunk */ + int l_len; + + dmbuf = *m; + /* lop off the top part */ + m_adj(dmbuf, (offset + sizeof(struct sctp_data_chunk))); + if (SCTP_BUF_NEXT(dmbuf) == NULL) { + l_len = SCTP_BUF_LEN(dmbuf); + } else { + /* + * need to count up the size hopefully does not hit + * this to often :-0 + */ + struct mbuf *lat; + + l_len = 0; + lat = dmbuf; + while (lat) { + l_len += SCTP_BUF_LEN(lat); + lat = SCTP_BUF_NEXT(lat); + } + } + if (l_len > the_len) { + /* Trim the end round bytes off too */ + m_adj(dmbuf, -(l_len - the_len)); + } + } + if (dmbuf == NULL) { + SCTP_STAT_INCR(sctps_nomem); + return (0); + } + if ((chunk_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG && + asoc->fragmented_delivery_inprogress == 0 && + TAILQ_EMPTY(&asoc->resetHead) && + ((ordered == 0) || + ((uint16_t) (asoc->strmin[strmno].last_sequence_delivered + 1) == strmseq && + TAILQ_EMPTY(&asoc->strmin[strmno].inqueue)))) { + /* Candidate for express delivery */ + /* + * Its not fragmented, No PD-API is up, Nothing in the + * delivery queue, Its un-ordered OR ordered and the next to + * deliver AND nothing else is stuck on the stream queue, + * And there is room for it in the socket buffer. Lets just + * stuff it up the buffer.... + */ + + /* It would be nice to avoid this copy if we could :< */ + sctp_alloc_a_readq(stcb, control); + sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, + protocol_id, + stcb->asoc.context, + strmno, strmseq, + chunk_flags, + dmbuf); + if (control == NULL) { + goto failed_express_del; + } + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = tsn; + } + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, &stcb->sctp_socket->so_rcv, + 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + + if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { + /* for ordered, bump what we delivered */ + asoc->strmin[strmno].last_sequence_delivered++; + } + SCTP_STAT_INCR(sctps_recvexpress); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, + SCTP_STR_LOG_FROM_EXPRS_DEL); + } + control = NULL; + + goto finish_express_del; + } +failed_express_del: + /* If we reach here this is a new chunk */ + chk = NULL; + control = NULL; + /* Express for fragmented delivery? */ + if ((asoc->fragmented_delivery_inprogress) && + (stcb->asoc.control_pdapi) && + (asoc->str_of_pdapi == strmno) && + (asoc->ssn_of_pdapi == strmseq) + ) { + control = stcb->asoc.control_pdapi; + if ((chunk_flags & SCTP_DATA_FIRST_FRAG) == SCTP_DATA_FIRST_FRAG) { + /* Can't be another first? 
*/ + goto failed_pdapi_express_del; + } + if (tsn == (control->sinfo_tsn + 1)) { + /* Yep, we can add it on */ + int end = 0; + uint32_t cumack; + + if (chunk_flags & SCTP_DATA_LAST_FRAG) { + end = 1; + } + cumack = asoc->cumulative_tsn; + if ((cumack + 1) == tsn) + cumack = tsn; + + if (sctp_append_to_readq(stcb->sctp_ep, stcb, control, dmbuf, end, + tsn, + &stcb->sctp_socket->so_rcv)) { + SCTP_PRINTF("Append fails end:%d\n", end); + goto failed_pdapi_express_del; + } + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = tsn; + } + SCTP_STAT_INCR(sctps_recvexpressm); + control->sinfo_tsn = tsn; + asoc->tsn_last_delivered = tsn; + asoc->fragment_flags = chunk_flags; + asoc->tsn_of_pdapi_last_delivered = tsn; + asoc->last_flags_delivered = chunk_flags; + asoc->last_strm_seq_delivered = strmseq; + asoc->last_strm_no_delivered = strmno; + if (end) { + /* clean up the flags and such */ + asoc->fragmented_delivery_inprogress = 0; + if ((chunk_flags & SCTP_DATA_UNORDERED) == 0) { + asoc->strmin[strmno].last_sequence_delivered++; + } + stcb->asoc.control_pdapi = NULL; + if (TAILQ_EMPTY(&asoc->reasmqueue) == 0) { + /* + * There could be another message + * ready + */ + need_reasm_check = 1; + } + } + control = NULL; + goto finish_express_del; + } + } +failed_pdapi_express_del: + control = NULL; + if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = tsn; + } + } else { + SCTP_SET_TSN_PRESENT(asoc->mapping_array, gap); + if (compare_with_wrap(tsn, asoc->highest_tsn_inside_map, MAX_TSN)) { + asoc->highest_tsn_inside_map = tsn; + } + } + if ((chunk_flags & SCTP_DATA_NOT_FRAG) != SCTP_DATA_NOT_FRAG) { + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* No memory so we drop the chunk */ + SCTP_STAT_INCR(sctps_nomem); + if (last_chunk == 0) { + /* we copied it, free the copy */ + sctp_m_freem(dmbuf); + } + return (0); + } + chk->rec.data.TSN_seq = tsn; + chk->no_fr_allowed = 0; + chk->rec.data.stream_seq = strmseq; + chk->rec.data.stream_number = strmno; + chk->rec.data.payloadtype = protocol_id; + chk->rec.data.context = stcb->asoc.context; + chk->rec.data.doing_fast_retransmit = 0; + chk->rec.data.rcv_flags = chunk_flags; + chk->asoc = asoc; + chk->send_size = the_len; + chk->whoTo = net; + atomic_add_int(&net->ref_count, 1); + chk->data = dmbuf; + } else { + sctp_alloc_a_readq(stcb, control); + sctp_build_readq_entry_mac(control, stcb, asoc->context, net, tsn, + protocol_id, + stcb->asoc.context, + strmno, strmseq, + chunk_flags, + dmbuf); + if (control == NULL) { + /* No memory so we drop the chunk */ + SCTP_STAT_INCR(sctps_nomem); + if (last_chunk == 0) { + /* we copied it, free the copy */ + sctp_m_freem(dmbuf); + } + return (0); + } + control->length = the_len; + } + + /* Mark it as received */ + /* Now queue it where it belongs */ + if (control != NULL) { + /* First a sanity check */ + if (asoc->fragmented_delivery_inprogress) { + /* + * Ok, we have a fragmented delivery in progress if + * this chunk is next to deliver OR belongs in our + * view to the reassembly, the peer is evil or + * broken. 
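While a partial delivery is in progress, a fresh fragment can be appended straight onto the read entry the application is already consuming, provided it is the very next TSN for the same stream and SSN; a LAST flag then retires the PD-API state. The decision in miniature (a sketch with invented state):

#include <stdint.h>

struct pdapi_state {
	int in_progress;
	uint16_t strm, ssn;	/* message being handed to the reader */
	uint32_t last_tsn;	/* TSN most recently appended */
};

/* Mirrors the express PD-API append test above. */
static int
can_express_append(const struct pdapi_state *pd, uint16_t strm,
    uint16_t ssn, uint32_t tsn, int is_first_frag)
{
	if (!pd->in_progress || is_first_frag)
		return (0);	/* a FIRST can never continue a message */
	if (pd->strm != strm || pd->ssn != ssn)
		return (0);
	return (tsn == pd->last_tsn + 1);
}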
+ */ + uint32_t estimate_tsn; + + estimate_tsn = asoc->tsn_last_delivered + 1; + if (TAILQ_EMPTY(&asoc->reasmqueue) && + (estimate_tsn == control->sinfo_tsn)) { + /* Evil/Broke peer */ + sctp_m_freem(control->data); + control->data = NULL; + if (control->whoFrom) { + sctp_free_remote_addr(control->whoFrom); + control->whoFrom = NULL; + } + sctp_free_a_readq(stcb, control); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_15); + ippp++; + *ippp = tsn; + ippp++; + *ippp = ((strmno << 16) | strmseq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_15; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return (0); + } else { + if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { + sctp_m_freem(control->data); + control->data = NULL; + if (control->whoFrom) { + sctp_free_remote_addr(control->whoFrom); + control->whoFrom = NULL; + } + sctp_free_a_readq(stcb, control); + + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_16); + ippp++; + *ippp = tsn; + ippp++; + *ippp = ((strmno << 16) | strmseq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_16; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return (0); + } + } + } else { + /* No PDAPI running */ + if (!TAILQ_EMPTY(&asoc->reasmqueue)) { + /* + * Reassembly queue is NOT empty validate + * that this tsn does not need to be in + * reasembly queue. If it does then our peer + * is broken or evil. 
+ */ + if (sctp_does_tsn_belong_to_reasm(asoc, control->sinfo_tsn)) { + sctp_m_freem(control->data); + control->data = NULL; + if (control->whoFrom) { + sctp_free_remote_addr(control->whoFrom); + control->whoFrom = NULL; + } + sctp_free_a_readq(stcb, control); + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr) + + (3 * sizeof(uint32_t)); + ph = mtod(oper, + struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = + htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_17); + ippp++; + *ippp = tsn; + ippp++; + *ippp = ((strmno << 16) | strmseq); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_17; + sctp_abort_an_association(stcb->sctp_ep, + stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + + *abort_flag = 1; + return (0); + } + } + } + /* ok, if we reach here we have passed the sanity checks */ + if (chunk_flags & SCTP_DATA_UNORDERED) { + /* queue directly into socket buffer */ + sctp_mark_non_revokable(asoc, control->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); + } else { + /* + * Special check for when streams are resetting. We + * could be more smart about this and check the + * actual stream to see if it is not being reset.. + * that way we would not create a HOLB when amongst + * streams being reset and those not being reset. + * + * We take complete messages that have a stream reset + * intervening (aka the TSN is after where our + * cum-ack needs to be) off and put them on a + * pending_reply_queue. The reassembly ones we do + * not have to worry about since they are all sorted + * and proceessed by TSN order. It is only the + * singletons I must worry about. + */ + if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && + ((compare_with_wrap(tsn, liste->tsn, MAX_TSN))) + ) { + /* + * yep its past where we need to reset... go + * ahead and queue it. + */ + if (TAILQ_EMPTY(&asoc->pending_reply_queue)) { + /* first one on */ + TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); + } else { + struct sctp_queued_to_read *ctlOn; + unsigned char inserted = 0; + + ctlOn = TAILQ_FIRST(&asoc->pending_reply_queue); + while (ctlOn) { + if (compare_with_wrap(control->sinfo_tsn, + ctlOn->sinfo_tsn, MAX_TSN)) { + ctlOn = TAILQ_NEXT(ctlOn, next); + } else { + /* found it */ + TAILQ_INSERT_BEFORE(ctlOn, control, next); + inserted = 1; + break; + } + } + if (inserted == 0) { + /* + * must be put at end, use + * prevP (all setup from + * loop) to setup nextP. + */ + TAILQ_INSERT_TAIL(&asoc->pending_reply_queue, control, next); + } + } + } else { + sctp_queue_data_to_stream(stcb, asoc, control, abort_flag); + if (*abort_flag) { + return (0); + } + } + } + } else { + /* Into the re-assembly queue */ + sctp_queue_data_for_reasm(stcb, asoc, chk, abort_flag); + if (*abort_flag) { + /* + * the assoc is now gone and chk was put onto the + * reasm queue, which has all been freed. 
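When an incoming stream reset is pending, completed messages whose TSN lies beyond the reset point are parked on pending_reply_queue, kept sorted by TSN so they can be released in order once the reset takes effect. The parking discipline in isolation (the wrap-aware compare is repeated so the sketch stands alone):

#include <stdint.h>
#include <stddef.h>

static int
tsn_after(uint32_t a, uint32_t b)
{
	return (a != b) && ((uint32_t)(a - b) < ((uint32_t)1 << 31));
}

struct pending_msg {
	uint32_t tsn;
	struct pending_msg *next;
};

/* Insert in ascending TSN order, as the pending_reply_queue walk does. */
static void
park_sorted(struct pending_msg **head, struct pending_msg *m)
{
	struct pending_msg **pp = head;

	while (*pp != NULL && tsn_after(m->tsn, (*pp)->tsn))
		pp = &(*pp)->next;
	m->next = *pp;
	*pp = m;
}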
+ */ + *m = NULL; + return (0); + } + } +finish_express_del: + if (tsn == (asoc->cumulative_tsn + 1)) { + /* Update cum-ack */ + asoc->cumulative_tsn = tsn; + } + if (last_chunk) { + *m = NULL; + } + if (ordered) { + SCTP_STAT_INCR_COUNTER64(sctps_inorderchunks); + } else { + SCTP_STAT_INCR_COUNTER64(sctps_inunorderchunks); + } + SCTP_STAT_INCR(sctps_recvdata); + /* Set it present please */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_STR_LOGGING_ENABLE) { + sctp_log_strm_del_alt(stcb, tsn, strmseq, strmno, SCTP_STR_LOG_FROM_MARK_TSN); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, + asoc->highest_tsn_inside_map, SCTP_MAP_PREPARE_SLIDE); + } + /* check the special flag for stream resets */ + if (((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) && + ((compare_with_wrap(asoc->cumulative_tsn, liste->tsn, MAX_TSN)) || + (asoc->cumulative_tsn == liste->tsn)) + ) { + /* + * we have finished working through the backlogged TSN's now + * time to reset streams. 1: call reset function. 2: free + * pending_reply space 3: distribute any chunks in + * pending_reply_queue. + */ + struct sctp_queued_to_read *ctl; + + sctp_reset_in_stream(stcb, liste->number_entries, liste->req.list_of_streams); + TAILQ_REMOVE(&asoc->resetHead, liste, next_resp); + SCTP_FREE(liste, SCTP_M_STRESET); + /* sa_ignore FREED_MEMORY */ + liste = TAILQ_FIRST(&asoc->resetHead); + ctl = TAILQ_FIRST(&asoc->pending_reply_queue); + if (ctl && (liste == NULL)) { + /* All can be removed */ + while (ctl) { + TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); + sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); + if (*abort_flag) { + return (0); + } + ctl = TAILQ_FIRST(&asoc->pending_reply_queue); + } + } else if (ctl) { + /* more than one in queue */ + while (!compare_with_wrap(ctl->sinfo_tsn, liste->tsn, MAX_TSN)) { + /* + * if ctl->sinfo_tsn is <= liste->tsn we can + * process it which is the NOT of + * ctl->sinfo_tsn > liste->tsn + */ + TAILQ_REMOVE(&asoc->pending_reply_queue, ctl, next); + sctp_queue_data_to_stream(stcb, asoc, ctl, abort_flag); + if (*abort_flag) { + return (0); + } + ctl = TAILQ_FIRST(&asoc->pending_reply_queue); + } + } + /* + * Now service re-assembly to pick up anything that has been + * held on reassembly queue? + */ + sctp_deliver_reasm_check(stcb, asoc); + need_reasm_check = 0; + } + if (need_reasm_check) { + /* Another one waits ? */ + sctp_deliver_reasm_check(stcb, asoc); + } + return (1); +} + +int8_t sctp_map_lookup_tab[256] = { + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 5, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 6, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 5, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 7, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 5, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 6, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 5, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, + 0, 1, 0, 2, 0, 1, 0, 8 +}; + + +void +sctp_slide_mapping_arrays(struct sctp_tcb *stcb) +{ + /* + * Now we also need to check the mapping array in a couple of ways. 
+ * 1) Did we move the cum-ack point? + * + * When you first glance at this you might think that all entries that + * make up the postion of the cum-ack would be in the nr-mapping + * array only.. i.e. things up to the cum-ack are always + * deliverable. Thats true with one exception, when its a fragmented + * message we may not deliver the data until some threshold (or all + * of it) is in place. So we must OR the nr_mapping_array and + * mapping_array to get a true picture of the cum-ack. + */ + struct sctp_association *asoc; + int at; + uint8_t val; + int slide_from, slide_end, lgap, distance; + uint32_t old_cumack, old_base, old_highest, highest_tsn; + + asoc = &stcb->asoc; + at = 0; + + old_cumack = asoc->cumulative_tsn; + old_base = asoc->mapping_array_base_tsn; + old_highest = asoc->highest_tsn_inside_map; + /* + * We could probably improve this a small bit by calculating the + * offset of the current cum-ack as the starting point. + */ + at = 0; + for (slide_from = 0; slide_from < stcb->asoc.mapping_array_size; slide_from++) { + val = asoc->nr_mapping_array[slide_from] | asoc->mapping_array[slide_from]; + if (val == 0xff) { + at += 8; + } else { + /* there is a 0 bit */ + at += sctp_map_lookup_tab[val]; + break; + } + } + asoc->cumulative_tsn = asoc->mapping_array_base_tsn + (at - 1); + + if (compare_with_wrap(asoc->cumulative_tsn, asoc->highest_tsn_inside_map, MAX_TSN) && + compare_with_wrap(asoc->cumulative_tsn, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { +#ifdef INVARIANTS + panic("huh, cumack 0x%x greater than high-tsn 0x%x in map", + asoc->cumulative_tsn, asoc->highest_tsn_inside_map); +#else + SCTP_PRINTF("huh, cumack 0x%x greater than high-tsn 0x%x in map - should panic?\n", + asoc->cumulative_tsn, asoc->highest_tsn_inside_map); + sctp_print_mapping_array(asoc); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(0, 6, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); + } + asoc->highest_tsn_inside_map = asoc->cumulative_tsn; + asoc->highest_tsn_inside_nr_map = asoc->cumulative_tsn; +#endif + } + if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, + asoc->highest_tsn_inside_map, + MAX_TSN)) { + highest_tsn = asoc->highest_tsn_inside_nr_map; + } else { + highest_tsn = asoc->highest_tsn_inside_map; + } + if ((asoc->cumulative_tsn == highest_tsn) && (at >= 8)) { + /* The complete array was completed by a single FR */ + /* highest becomes the cum-ack */ + int clr; + +#ifdef INVARIANTS + unsigned int i; + +#endif + + /* clear the array */ + clr = ((at + 7) >> 3); + if (clr > asoc->mapping_array_size) { + clr = asoc->mapping_array_size; + } + memset(asoc->mapping_array, 0, clr); + memset(asoc->nr_mapping_array, 0, clr); +#ifdef INVARIANTS + for (i = 0; i < asoc->mapping_array_size; i++) { + if ((asoc->mapping_array[i]) || (asoc->nr_mapping_array[i])) { + printf("Error Mapping array's not clean at clear\n"); + sctp_print_mapping_array(asoc); + } + } +#endif + asoc->mapping_array_base_tsn = asoc->cumulative_tsn + 1; + asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map = asoc->cumulative_tsn; + } else if (at >= 8) { + /* we can slide the mapping array down */ + /* slide_from holds where we hit the first NON 0xff byte */ + + /* + * now calculate the ceiling of the move using our highest + * TSN value + */ + SCTP_CALC_TSN_TO_GAP(lgap, highest_tsn, asoc->mapping_array_base_tsn); + slide_end = (lgap >> 3); + if (slide_end < slide_from) { + sctp_print_mapping_array(asoc); +#ifdef INVARIANTS + panic("impossible slide"); +#else + 
printf("impossible slide lgap:%x slide_end:%x slide_from:%x? at:%d\n", + lgap, slide_end, slide_from, at); + return; +#endif + } + if (slide_end > asoc->mapping_array_size) { +#ifdef INVARIANTS + panic("would overrun buffer"); +#else + printf("Gak, would have overrun map end:%d slide_end:%d\n", + asoc->mapping_array_size, slide_end); + slide_end = asoc->mapping_array_size; +#endif + } + distance = (slide_end - slide_from) + 1; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(old_base, old_cumack, old_highest, + SCTP_MAP_PREPARE_SLIDE); + sctp_log_map((uint32_t) slide_from, (uint32_t) slide_end, + (uint32_t) lgap, SCTP_MAP_SLIDE_FROM); + } + if (distance + slide_from > asoc->mapping_array_size || + distance < 0) { + /* + * Here we do NOT slide forward the array so that + * hopefully when more data comes in to fill it up + * we will be able to slide it forward. Really I + * don't think this should happen :-0 + */ + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map((uint32_t) distance, (uint32_t) slide_from, + (uint32_t) asoc->mapping_array_size, + SCTP_MAP_SLIDE_NONE); + } + } else { + int ii; + + for (ii = 0; ii < distance; ii++) { + asoc->mapping_array[ii] = asoc->mapping_array[slide_from + ii]; + asoc->nr_mapping_array[ii] = asoc->nr_mapping_array[slide_from + ii]; + + } + for (ii = distance; ii < asoc->mapping_array_size; ii++) { + asoc->mapping_array[ii] = 0; + asoc->nr_mapping_array[ii] = 0; + } + if (asoc->highest_tsn_inside_map + 1 == asoc->mapping_array_base_tsn) { + asoc->highest_tsn_inside_map += (slide_from << 3); + } + if (asoc->highest_tsn_inside_nr_map + 1 == asoc->mapping_array_base_tsn) { + asoc->highest_tsn_inside_nr_map += (slide_from << 3); + } + asoc->mapping_array_base_tsn += (slide_from << 3); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(asoc->mapping_array_base_tsn, + asoc->cumulative_tsn, asoc->highest_tsn_inside_map, + SCTP_MAP_SLIDE_RESULT); + } + } + } +} + + +void +sctp_sack_check(struct sctp_tcb *stcb, int was_a_gap, int *abort_flag) +{ + struct sctp_association *asoc; + uint32_t highest_tsn; + + asoc = &stcb->asoc; + if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, + asoc->highest_tsn_inside_map, + MAX_TSN)) { + highest_tsn = asoc->highest_tsn_inside_nr_map; + } else { + highest_tsn = asoc->highest_tsn_inside_map; + } + + /* + * Now we need to see if we need to queue a sack or just start the + * timer (if allowed). + */ + if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + /* + * Ok special case, in SHUTDOWN-SENT case. here we maker + * sure SACK timer is off and instead send a SHUTDOWN and a + * SACK + */ + if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INDATA + SCTP_LOC_18); + } + sctp_send_shutdown(stcb, stcb->asoc.primary_destination); + sctp_send_sack(stcb); + } else { + int is_a_gap; + + /* is there a gap now ? 
*/ + is_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN); + + /* + * CMT DAC algorithm: increase number of packets received + * since last ack + */ + stcb->asoc.cmt_dac_pkts_rcvd++; + + if ((stcb->asoc.send_sack == 1) || /* We need to send a + * SACK */ + ((was_a_gap) && (is_a_gap == 0)) || /* was a gap, but no + * longer is one */ + (stcb->asoc.numduptsns) || /* we have dup's */ + (is_a_gap) || /* is still a gap */ + (stcb->asoc.delayed_ack == 0) || /* Delayed sack disabled */ + (stcb->asoc.data_pkts_seen >= stcb->asoc.sack_freq) /* hit limit of pkts */ + ) { + + if ((stcb->asoc.sctp_cmt_on_off == 1) && + (SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) && + (stcb->asoc.send_sack == 0) && + (stcb->asoc.numduptsns == 0) && + (stcb->asoc.delayed_ack) && + (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer))) { + + /* + * CMT DAC algorithm: With CMT, delay acks + * even in the face of + * + * reordering. Therefore, if acks that do not + * have to be sent because of the above + * reasons, will be delayed. That is, acks + * that would have been sent due to gap + * reports will be delayed with DAC. Start + * the delayed ack timer. + */ + sctp_timer_start(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL); + } else { + /* + * Ok we must build a SACK since the timer + * is pending, we got our first packet OR + * there are gaps or duplicates. + */ + (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); + sctp_send_sack(stcb); + } + } else { + if (!SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL); + } + } + } +} + +void +sctp_service_queues(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + uint32_t tsize, pd_point; + uint16_t nxt_todel; + + if (asoc->fragmented_delivery_inprogress) { + sctp_service_reassembly(stcb, asoc); + } + /* Can we proceed further, i.e. the PD-API is complete */ + if (asoc->fragmented_delivery_inprogress) { + /* no */ + return; + } + /* + * Now is there some other chunk I can deliver from the reassembly + * queue. + */ +doit_again: + chk = TAILQ_FIRST(&asoc->reasmqueue); + if (chk == NULL) { + asoc->size_on_reasm_queue = 0; + asoc->cnt_on_reasm_queue = 0; + return; + } + nxt_todel = asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered + 1; + if ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) && + ((nxt_todel == chk->rec.data.stream_seq) || + (chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED))) { + /* + * Yep the first one is here. We setup to start reception, + * by backing down the TSN just in case we can't deliver. + */ + + /* + * Before we start though either all of the message should + * be here or the socket buffer max or nothing on the + * delivery queue and something can be delivered. 
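sctp_sack_check() sends an immediate SACK when any one of the listed triggers fires, and otherwise only (re)arms the delayed-ack timer; with CMT DAC enabled, even gap-induced acks may be delayed by one packet. The trigger set as a predicate over abridged state:

#include <stdint.h>

struct sack_state {
	int send_sack;		/* something already demanded a SACK */
	int was_a_gap;		/* a gap existed before this packet */
	int is_a_gap;		/* a gap exists now */
	int numduptsns;		/* duplicate TSNs waiting to be reported */
	int delayed_ack;	/* delayed SACK enabled? */
	uint32_t pkts_seen;	/* data packets since the last SACK */
	uint32_t sack_freq;	/* ack every N packets */
};

/* Mirror of the immediate-SACK condition above. */
static int
should_sack_now(const struct sack_state *s)
{
	return (s->send_sack ||
	    (s->was_a_gap && !s->is_a_gap) ||	/* a gap just closed */
	    s->numduptsns ||
	    s->is_a_gap ||
	    !s->delayed_ack ||
	    s->pkts_seen >= s->sack_freq);
}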
+		 */
+		if (stcb->sctp_socket) {
+			pd_point = min(SCTP_SB_LIMIT_RCV(stcb->sctp_socket),
+			    stcb->sctp_ep->partial_delivery_point);
+		} else {
+			pd_point = stcb->sctp_ep->partial_delivery_point;
+		}
+		if (sctp_is_all_msg_on_reasm(asoc, &tsize) || (tsize >= pd_point)) {
+			asoc->fragmented_delivery_inprogress = 1;
+			asoc->tsn_last_delivered = chk->rec.data.TSN_seq - 1;
+			asoc->str_of_pdapi = chk->rec.data.stream_number;
+			asoc->ssn_of_pdapi = chk->rec.data.stream_seq;
+			asoc->pdapi_ppid = chk->rec.data.payloadtype;
+			asoc->fragment_flags = chk->rec.data.rcv_flags;
+			sctp_service_reassembly(stcb, asoc);
+			if (asoc->fragmented_delivery_inprogress == 0) {
+				goto doit_again;
+			}
+		}
+	}
+}
+
+int
+sctp_process_data(struct mbuf **mm, int iphlen, int *offset, int length,
+    struct sctphdr *sh, struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net, uint32_t * high_tsn)
+{
+	struct sctp_data_chunk *ch, chunk_buf;
+	struct sctp_association *asoc;
+	int num_chunks = 0;	/* number of control chunks processed */
+	int stop_proc = 0;
+	int chk_length, break_flag, last_chunk;
+	int abort_flag = 0, was_a_gap;
+	struct mbuf *m;
+	uint32_t highest_tsn;
+
+	/* set the rwnd */
+	sctp_set_rwnd(stcb, &stcb->asoc);
+
+	m = *mm;
+	SCTP_TCB_LOCK_ASSERT(stcb);
+	asoc = &stcb->asoc;
+	if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, asoc->highest_tsn_inside_map, MAX_TSN)) {
+		highest_tsn = asoc->highest_tsn_inside_nr_map;
+	} else {
+		highest_tsn = asoc->highest_tsn_inside_map;
+	}
+	was_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN);
+	/*
+	 * setup where we got the last DATA packet from for any SACK that
+	 * may need to go out. Don't bump the net. This is done ONLY when a
+	 * chunk is assigned.
+	 */
+	asoc->last_data_chunk_from = net;
+
+	/*-
+	 * Now before we proceed we must figure out if this is a wasted
+	 * cluster... i.e. it is a small packet sent in and yet the driver
+	 * underneath allocated a full cluster for it. If so we must copy it
+	 * to a smaller mbuf and free up the cluster mbuf. This will help
+	 * with cluster starvation. Note for __Panda__ we don't do this
+	 * since it has clusters all the way down to 64 bytes.
+	 */
+	if (SCTP_BUF_LEN(m) < (long)MLEN && SCTP_BUF_NEXT(m) == NULL) {
+		/* we only handle mbufs that are singletons, not chains */
+		m = sctp_get_mbuf_for_msg(SCTP_BUF_LEN(m), 0, M_DONTWAIT, 1, MT_DATA);
+		if (m) {
+			/* ok, let's see if we can copy the data up */
+			caddr_t *from, *to;
+
+			/* get the pointers and copy */
+			to = mtod(m, caddr_t *);
+			from = mtod((*mm), caddr_t *);
+			memcpy(to, from, SCTP_BUF_LEN((*mm)));
+			/* copy the length and free up the old */
+			SCTP_BUF_LEN(m) = SCTP_BUF_LEN((*mm));
+			sctp_m_freem(*mm);
+			/* success, back copy */
+			*mm = m;
+		} else {
+			/* We are in trouble in the mbuf world .. yikes */
+			m = *mm;
+		}
+	}
+	/* get pointer to the first chunk header */
+	ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset,
+	    sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf);
+	if (ch == NULL) {
+		return (1);
+	}
+	/*
+	 * process all DATA chunks...
+	 */
+	*high_tsn = asoc->cumulative_tsn;
+	break_flag = 0;
+	asoc->data_pkts_seen++;
+	while (stop_proc == 0) {
+		/* validate chunk length */
+		chk_length = ntohs(ch->ch.chunk_length);
+		if (length - *offset < chk_length) {
+			/* all done, mutilated chunk */
+			stop_proc = 1;
+			break;
+		}
+		if (ch->ch.chunk_type == SCTP_DATA) {
+			if ((size_t)chk_length < sizeof(struct sctp_data_chunk) + 1) {
+				/*
+				 * Need to send an abort since we had an
+				 * invalid data chunk.
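The loop above treats the packet as a sequence of (type, flags, length) chunk headers, advancing by the length rounded up to a 4-byte boundary (what SCTP_SIZE32 does) and bailing out when a chunk claims more bytes than remain. The traversal skeleton (a sketch; header layout per RFC 4960):

#include <stdint.h>
#include <arpa/inet.h>

struct chunk_hdr {
	uint8_t type;
	uint8_t flags;
	uint16_t length;	/* network order, includes this header */
};

#define PAD4(x) (((x) + 3u) & ~3u)	/* the SCTP_SIZE32 rounding */

/* Visit each chunk in buf[0..len); returns 0 on a clean end,
 * -1 on a truncated or impossibly short chunk. */
static int
walk_chunks(const uint8_t *buf, int len,
    void (*visit)(const struct chunk_hdr *ch, const uint8_t *p, int clen))
{
	int off = 0;

	while (off + (int)sizeof(struct chunk_hdr) <= len) {
		const struct chunk_hdr *ch = (const void *)(buf + off);
		int clen = ntohs(ch->length);

		if (clen < (int)sizeof(struct chunk_hdr) || len - off < clen)
			return (-1);	/* mutilated chunk: stop */
		visit(ch, buf + off, clen);
		off += (int)PAD4((unsigned)clen);	/* chunks are 4-byte aligned */
	}
	return (0);
}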
+ */ + struct mbuf *op_err; + + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 2 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + + if (op_err) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr) + + (2 * sizeof(uint32_t)); + ph = mtod(op_err, struct sctp_paramhdr *); + ph->param_type = + htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_19); + ippp++; + *ippp = asoc->cumulative_tsn; + + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_19; + sctp_abort_association(inp, stcb, m, iphlen, sh, + op_err, 0, net->port); + return (2); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xB1, 0); +#endif + if (SCTP_SIZE32(chk_length) == (length - *offset)) { + last_chunk = 1; + } else { + last_chunk = 0; + } + if (sctp_process_a_data_chunk(stcb, asoc, mm, *offset, ch, + chk_length, net, high_tsn, &abort_flag, &break_flag, + last_chunk)) { + num_chunks++; + } + if (abort_flag) + return (2); + + if (break_flag) { + /* + * Set because of out of rwnd space and no + * drop rep space left. + */ + stop_proc = 1; + break; + } + } else { + /* not a data chunk in the data region */ + switch (ch->ch.chunk_type) { + case SCTP_INITIATION: + case SCTP_INITIATION_ACK: + case SCTP_SELECTIVE_ACK: + case SCTP_NR_SELECTIVE_ACK: /* EY */ + case SCTP_HEARTBEAT_REQUEST: + case SCTP_HEARTBEAT_ACK: + case SCTP_ABORT_ASSOCIATION: + case SCTP_SHUTDOWN: + case SCTP_SHUTDOWN_ACK: + case SCTP_OPERATION_ERROR: + case SCTP_COOKIE_ECHO: + case SCTP_COOKIE_ACK: + case SCTP_ECN_ECHO: + case SCTP_ECN_CWR: + case SCTP_SHUTDOWN_COMPLETE: + case SCTP_AUTHENTICATION: + case SCTP_ASCONF_ACK: + case SCTP_PACKET_DROPPED: + case SCTP_STREAM_RESET: + case SCTP_FORWARD_CUM_TSN: + case SCTP_ASCONF: + /* + * Now, what do we do with KNOWN chunks that + * are NOT in the right place? + * + * For now, I do nothing but ignore them. We + * may later want to add sysctl stuff to + * switch out and do either an ABORT() or + * possibly process them. + */ + if (SCTP_BASE_SYSCTL(sctp_strict_data_order)) { + struct mbuf *op_err; + + op_err = sctp_generate_invmanparam(SCTP_CAUSE_PROTOCOL_VIOLATION); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, 0, net->port); + return (2); + } + break; + default: + /* unknown chunk type, use bit rules */ + if (ch->ch.chunk_type & 0x40) { + /* Add a error report to the queue */ + struct mbuf *merr; + struct sctp_paramhdr *phd; + + merr = sctp_get_mbuf_for_msg(sizeof(*phd), 0, M_DONTWAIT, 1, MT_DATA); + if (merr) { + phd = mtod(merr, struct sctp_paramhdr *); + /* + * We cheat and use param + * type since we did not + * bother to define a error + * cause struct. They are + * the same basic format + * with different names. + */ + phd->param_type = + htons(SCTP_CAUSE_UNRECOG_CHUNK); + phd->param_length = + htons(chk_length + sizeof(*phd)); + SCTP_BUF_LEN(merr) = sizeof(*phd); + SCTP_BUF_NEXT(merr) = SCTP_M_COPYM(m, *offset, + SCTP_SIZE32(chk_length), + M_DONTWAIT); + if (SCTP_BUF_NEXT(merr)) { + sctp_queue_op_err(stcb, merr); + } else { + sctp_m_freem(merr); + } + } + } + if ((ch->ch.chunk_type & 0x80) == 0) { + /* discard the rest of this packet */ + stop_proc = 1; + } /* else skip this bad chunk and + * continue... 
*/ + break; + }; /* switch of chunk type */ + } + *offset += SCTP_SIZE32(chk_length); + if ((*offset >= length) || stop_proc) { + /* no more data left in the mbuf chain */ + stop_proc = 1; + continue; + } + ch = (struct sctp_data_chunk *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_data_chunk), (uint8_t *) & chunk_buf); + if (ch == NULL) { + *offset = length; + stop_proc = 1; + break; + + } + } /* while */ + if (break_flag) { + /* + * we need to report rwnd overrun drops. + */ + sctp_send_packet_dropped(stcb, net, *mm, iphlen, 0); + } + if (num_chunks) { + /* + * Did we get data, if so update the time for auto-close and + * give peer credit for being alive. + */ + SCTP_STAT_INCR(sctps_recvpktwithdata); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INDATA, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_last_rcvd); + } + /* now service all of the reassm queue if needed */ + if (!(TAILQ_EMPTY(&asoc->reasmqueue))) + sctp_service_queues(stcb, asoc); + + if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + /* Assure that we ack right away */ + stcb->asoc.send_sack = 1; + } + /* Start a sack timer or QUEUE a SACK for sending */ + sctp_sack_check(stcb, was_a_gap, &abort_flag); + if (abort_flag) + return (2); + + return (0); +} + +static int +sctp_process_segment_range(struct sctp_tcb *stcb, struct sctp_tmit_chunk **p_tp1, uint32_t last_tsn, + uint16_t frag_strt, uint16_t frag_end, int nr_sacking, + int *num_frs, + uint32_t * biggest_newly_acked_tsn, + uint32_t * this_sack_lowest_newack, + int *ecn_seg_sums) +{ + struct sctp_tmit_chunk *tp1; + unsigned int theTSN; + int j, wake_him = 0, circled = 0; + + /* Recover the tp1 we last saw */ + tp1 = *p_tp1; + if (tp1 == NULL) { + tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); + } + for (j = frag_strt; j <= frag_end; j++) { + theTSN = j + last_tsn; + while (tp1) { + if (tp1->rec.data.doing_fast_retransmit) + (*num_frs) += 1; + + /*- + * CMT: CUCv2 algorithm. For each TSN being + * processed from the sent queue, track the + * next expected pseudo-cumack, or + * rtx_pseudo_cumack, if required. Separate + * cumack trackers for first transmissions, + * and retransmissions. + */ + if ((tp1->whoTo->find_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) && + (tp1->snd_count == 1)) { + tp1->whoTo->pseudo_cumack = tp1->rec.data.TSN_seq; + tp1->whoTo->find_pseudo_cumack = 0; + } + if ((tp1->whoTo->find_rtx_pseudo_cumack == 1) && (tp1->sent < SCTP_DATAGRAM_RESEND) && + (tp1->snd_count > 1)) { + tp1->whoTo->rtx_pseudo_cumack = tp1->rec.data.TSN_seq; + tp1->whoTo->find_rtx_pseudo_cumack = 0; + } + if (tp1->rec.data.TSN_seq == theTSN) { + if (tp1->sent != SCTP_DATAGRAM_UNSENT) { + /*- + * must be held until + * cum-ack passes + */ + /*- + * ECN Nonce: Add the nonce + * value to the sender's + * nonce sum + */ + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + /*- + * If it is less than RESEND, it is + * now no-longer in flight. + * Higher values may already be set + * via previous Gap Ack Blocks... + * i.e. ACKED or RESEND. + */ + if (compare_with_wrap(tp1->rec.data.TSN_seq, + *biggest_newly_acked_tsn, MAX_TSN)) { + *biggest_newly_acked_tsn = tp1->rec.data.TSN_seq; + } + /*- + * CMT: SFR algo (and HTNA) - set + * saw_newack to 1 for dest being + * newly acked. update + * this_sack_highest_newack if + * appropriate. 
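+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. Every TSN ordering in this routine goes through
+ * compare_with_wrap() so that sequence numbers that wrapped past
+ * 2^32 - 1 still compare as "newer". A common standalone form of that
+ * serial-number test (an assumption: this mirrors, rather than copies,
+ * the kernel helper and its MAX_TSN argument):]
+ */
+static int
+tsn_after(uint32_t a, uint32_t b)
+{
+	/* nonzero iff a lies "after" b on the 32-bit TSN circle */
+	return ((int32_t)(a - b) > 0);
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]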
+ */ + if (tp1->rec.data.chunk_was_revoked == 0) + tp1->whoTo->saw_newack = 1; + + if (compare_with_wrap(tp1->rec.data.TSN_seq, + tp1->whoTo->this_sack_highest_newack, + MAX_TSN)) { + tp1->whoTo->this_sack_highest_newack = + tp1->rec.data.TSN_seq; + } + /*- + * CMT DAC algo: also update + * this_sack_lowest_newack + */ + if (*this_sack_lowest_newack == 0) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(*this_sack_lowest_newack, + last_tsn, + tp1->rec.data.TSN_seq, + 0, + 0, + SCTP_LOG_TSN_ACKED); + } + *this_sack_lowest_newack = tp1->rec.data.TSN_seq; + } + /*- + * CMT: CUCv2 algorithm. If (rtx-)pseudo-cumack for corresp + * dest is being acked, then we have a new (rtx-)pseudo-cumack. Set + * new_(rtx_)pseudo_cumack to TRUE so that the cwnd for this dest can be + * updated. Also trigger search for the next expected (rtx-)pseudo-cumack. + * Separate pseudo_cumack trackers for first transmissions and + * retransmissions. + */ + if (tp1->rec.data.TSN_seq == tp1->whoTo->pseudo_cumack) { + if (tp1->rec.data.chunk_was_revoked == 0) { + tp1->whoTo->new_pseudo_cumack = 1; + } + tp1->whoTo->find_pseudo_cumack = 1; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); + } + if (tp1->rec.data.TSN_seq == tp1->whoTo->rtx_pseudo_cumack) { + if (tp1->rec.data.chunk_was_revoked == 0) { + tp1->whoTo->new_pseudo_cumack = 1; + } + tp1->whoTo->find_rtx_pseudo_cumack = 1; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(*biggest_newly_acked_tsn, + last_tsn, + tp1->rec.data.TSN_seq, + frag_strt, + frag_end, + SCTP_LOG_TSN_ACKED); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_GAP, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + sctp_flight_size_decrease(tp1); + sctp_total_flight_decrease(stcb, tp1); + + tp1->whoTo->net_ack += tp1->send_size; + if (tp1->snd_count < 2) { + /*- + * True non-retransmited chunk + */ + tp1->whoTo->net_ack2 += tp1->send_size; + + /*- + * update RTO too ? 
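+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. Per Karn's rule, only chunks with snd_count < 2 give
+ * an unambiguous RTT sample; sctp_calculate_rto() feeds such samples
+ * into an RFC 6298-style smoother. A hypothetical standalone version,
+ * with times in milliseconds and the RTO.Min/RTO.Max clamping of the
+ * real code omitted:]
+ */
+static uint32_t
+rto_update_ms(uint32_t *srtt, uint32_t *rttvar, uint32_t rtt)
+{
+	if (*srtt == 0) {
+		/* first measurement (RFC 6298, 2.2) */
+		*srtt = rtt;
+		*rttvar = rtt / 2;
+	} else {
+		/* RFC 6298, 2.3, with alpha = 1/8 and beta = 1/4 */
+		int32_t delta = (int32_t)(*srtt - rtt);
+
+		if (delta < 0)
+			delta = -delta;
+		*rttvar = *rttvar - *rttvar / 4 + (uint32_t)delta / 4;
+		*srtt = *srtt - *srtt / 8 + rtt / 8;
+	}
+	return (*srtt + 4 * *rttvar);	/* the new RTO */
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]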
+ */ + if (tp1->do_rtt) { + tp1->whoTo->RTO = + sctp_calculate_rto(stcb, + &stcb->asoc, + tp1->whoTo, + &tp1->sent_rcv_time, + sctp_align_safe_nocopy); + tp1->do_rtt = 0; + } + } + } + if (tp1->sent <= SCTP_DATAGRAM_RESEND) { + (*ecn_seg_sums) += tp1->rec.data.ect_nonce; + (*ecn_seg_sums) &= SCTP_SACK_NONCE_SUM; + if (compare_with_wrap(tp1->rec.data.TSN_seq, + stcb->asoc.this_sack_highest_gap, + MAX_TSN)) { + stcb->asoc.this_sack_highest_gap = + tp1->rec.data.TSN_seq; + } + if (tp1->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt); +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xB2, + (stcb->asoc.sent_queue_retran_cnt & 0x000000ff)); +#endif + } + } + /*- + * All chunks NOT UNSENT fall through here and are marked + * (leave PR-SCTP ones that are to skip alone though) + */ + if (tp1->sent != SCTP_FORWARD_TSN_SKIP) + tp1->sent = SCTP_DATAGRAM_MARKED; + + if (tp1->rec.data.chunk_was_revoked) { + /* deflate the cwnd */ + tp1->whoTo->cwnd -= tp1->book_size; + tp1->rec.data.chunk_was_revoked = 0; + } + /* NR Sack code here */ + if (nr_sacking) { + if (tp1->data) { + /* + * sa_ignore + * NO_NULL_CHK + */ + sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); + sctp_m_freem(tp1->data); + tp1->data = NULL; + } + wake_him++; + } + } + break; + } /* if (tp1->TSN_seq == theTSN) */ + if (compare_with_wrap(tp1->rec.data.TSN_seq, theTSN, + MAX_TSN)) + break; + + tp1 = TAILQ_NEXT(tp1, sctp_next); + if ((tp1 == NULL) && (circled == 0)) { + circled++; + tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); + } + } /* end while (tp1) */ + if (tp1 == NULL) { + circled = 0; + tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); + } + /* In case the fragments were not in order we must reset */ + } /* end for (j = fragStart */ + *p_tp1 = tp1; + return (wake_him); /* Return value only used for nr-sack */ +} + + +static int +sctp_handle_segments(struct mbuf *m, int *offset, struct sctp_tcb *stcb, struct sctp_association *asoc, + uint32_t last_tsn, uint32_t * biggest_tsn_acked, + uint32_t * biggest_newly_acked_tsn, uint32_t * this_sack_lowest_newack, + int num_seg, int num_nr_seg, int *ecn_seg_sums) +{ + struct sctp_gap_ack_block *frag, block; + struct sctp_tmit_chunk *tp1; + int i; + int num_frs = 0; + int chunk_freed; + int non_revocable; + uint16_t frag_strt, frag_end, prev_frag_end; + + tp1 = TAILQ_FIRST(&asoc->sent_queue); + prev_frag_end = 0; + chunk_freed = 0; + + for (i = 0; i < (num_seg + num_nr_seg); i++) { + if (i == num_seg) { + prev_frag_end = 0; + tp1 = TAILQ_FIRST(&asoc->sent_queue); + } + frag = (struct sctp_gap_ack_block *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_gap_ack_block), (uint8_t *) & block); + *offset += sizeof(block); + if (frag == NULL) { + return (chunk_freed); + } + frag_strt = ntohs(frag->start); + frag_end = ntohs(frag->end); + + if (frag_strt > frag_end) { + /* This gap report is malformed, skip it. */ + continue; + } + if (frag_strt <= prev_frag_end) { + /* This gap report is not in order, so restart. 
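+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. Gap ack block start/end values are 16-bit offsets
+ * relative to the cumulative TSN ack (RFC 4960, 3.3.4), so a block
+ * acks TSNs cum_ack + start through cum_ack + end inclusive. The
+ * sanity rule used above, restated on its own:]
+ */
+static int
+gap_block_to_tsns(uint32_t cum_ack, uint16_t start, uint16_t end,
+    uint32_t *first_tsn, uint32_t *last_tsn)
+{
+	if (start > end)
+		return (0);	/* malformed report: caller skips it */
+	*first_tsn = cum_ack + start;	/* 32-bit wrap is intentional */
+	*last_tsn = cum_ack + end;
+	return (1);
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]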
*/ + tp1 = TAILQ_FIRST(&asoc->sent_queue); + } + if (compare_with_wrap((last_tsn + frag_end), *biggest_tsn_acked, MAX_TSN)) { + *biggest_tsn_acked = last_tsn + frag_end; + } + if (i < num_seg) { + non_revocable = 0; + } else { + non_revocable = 1; + } + if (sctp_process_segment_range(stcb, &tp1, last_tsn, frag_strt, frag_end, + non_revocable, &num_frs, biggest_newly_acked_tsn, + this_sack_lowest_newack, ecn_seg_sums)) { + chunk_freed = 1; + } + prev_frag_end = frag_end; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + if (num_frs) + sctp_log_fr(*biggest_tsn_acked, + *biggest_newly_acked_tsn, + last_tsn, SCTP_FR_LOG_BIGGEST_TSNS); + } + return (chunk_freed); +} + +static void +sctp_check_for_revoked(struct sctp_tcb *stcb, + struct sctp_association *asoc, uint32_t cumack, + uint32_t biggest_tsn_acked) +{ + struct sctp_tmit_chunk *tp1; + int tot_revoked = 0; + + tp1 = TAILQ_FIRST(&asoc->sent_queue); + while (tp1) { + if (compare_with_wrap(tp1->rec.data.TSN_seq, cumack, + MAX_TSN)) { + /* + * ok this guy is either ACK or MARKED. If it is + * ACKED it has been previously acked but not this + * time i.e. revoked. If it is MARKED it was ACK'ed + * again. + */ + if (compare_with_wrap(tp1->rec.data.TSN_seq, biggest_tsn_acked, + MAX_TSN)) + break; + + + if (tp1->sent == SCTP_DATAGRAM_ACKED) { + /* it has been revoked */ + tp1->sent = SCTP_DATAGRAM_SENT; + tp1->rec.data.chunk_was_revoked = 1; + /* + * We must add this stuff back in to assure + * timers and such get started. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + sctp_flight_size_increase(tp1); + sctp_total_flight_increase(stcb, tp1); + /* + * We inflate the cwnd to compensate for our + * artificial inflation of the flight_size. + */ + tp1->whoTo->cwnd += tp1->book_size; + tot_revoked++; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(asoc->last_acked_seq, + cumack, + tp1->rec.data.TSN_seq, + 0, + 0, + SCTP_LOG_TSN_REVOKED); + } + } else if (tp1->sent == SCTP_DATAGRAM_MARKED) { + /* it has been re-acked in this SACK */ + tp1->sent = SCTP_DATAGRAM_ACKED; + } + } + if (tp1->sent == SCTP_DATAGRAM_UNSENT) + break; + tp1 = TAILQ_NEXT(tp1, sctp_next); + } + if (tot_revoked > 0) { + /* + * Setup the ecn nonce re-sync point. We do this since once + * data is revoked we begin to retransmit things, which do + * NOT have the ECN bits set. This means we are now out of + * sync and must wait until we get back in sync with the + * peer to check ECN bits. + */ + tp1 = TAILQ_FIRST(&asoc->send_queue); + if (tp1 == NULL) { + asoc->nonce_resync_tsn = asoc->sending_seq; + } else { + asoc->nonce_resync_tsn = tp1->rec.data.TSN_seq; + } + asoc->nonce_wait_for_ecne = 0; + asoc->nonce_sum_check = 0; + } +} + + +static void +sctp_strike_gap_ack_chunks(struct sctp_tcb *stcb, struct sctp_association *asoc, + uint32_t biggest_tsn_acked, uint32_t biggest_tsn_newly_acked, uint32_t this_sack_lowest_newack, int accum_moved) +{ + struct sctp_tmit_chunk *tp1; + int strike_flag = 0; + struct timeval now; + int tot_retrans = 0; + uint32_t sending_seq; + struct sctp_nets *net; + int num_dests_sacked = 0; + + /* + * select the sending_seq, this is either the next thing ready to be + * sent but not transmitted, OR, the next seq we assign. 
+ */
+	tp1 = TAILQ_FIRST(&stcb->asoc.send_queue);
+	if (tp1 == NULL) {
+		sending_seq = asoc->sending_seq;
+	} else {
+		sending_seq = tp1->rec.data.TSN_seq;
+	}
+
+	/* CMT DAC algo: finding out if SACK is a mixed SACK */
+	if ((asoc->sctp_cmt_on_off == 1) &&
+	    SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) {
+		TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+			if (net->saw_newack)
+				num_dests_sacked++;
+		}
+	}
+	if (stcb->asoc.peer_supports_prsctp) {
+		(void)SCTP_GETTIME_TIMEVAL(&now);
+	}
+	tp1 = TAILQ_FIRST(&asoc->sent_queue);
+	while (tp1) {
+		strike_flag = 0;
+		if (tp1->no_fr_allowed) {
+			/* this one had a timeout or something */
+			tp1 = TAILQ_NEXT(tp1, sctp_next);
+			continue;
+		}
+		if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+			if (tp1->sent < SCTP_DATAGRAM_RESEND)
+				sctp_log_fr(biggest_tsn_newly_acked,
+				    tp1->rec.data.TSN_seq,
+				    tp1->sent,
+				    SCTP_FR_LOG_CHECK_STRIKE);
+		}
+		if (compare_with_wrap(tp1->rec.data.TSN_seq, biggest_tsn_acked,
+		    MAX_TSN) ||
+		    tp1->sent == SCTP_DATAGRAM_UNSENT) {
+			/* done */
+			break;
+		}
+		if (stcb->asoc.peer_supports_prsctp) {
+			if ((PR_SCTP_TTL_ENABLED(tp1->flags)) && tp1->sent < SCTP_DATAGRAM_ACKED) {
+				/* Is it expired? */
+				if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) {
+					/* Yes so drop it */
+					if (tp1->data != NULL) {
+						(void)sctp_release_pr_sctp_chunk(stcb, tp1,
+						    (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT),
+						    SCTP_SO_NOT_LOCKED);
+					}
+					tp1 = TAILQ_NEXT(tp1, sctp_next);
+					continue;
+				}
+			}
+		}
+		if (compare_with_wrap(tp1->rec.data.TSN_seq,
+		    asoc->this_sack_highest_gap, MAX_TSN)) {
+			/* we are beyond the tsn in the sack */
+			break;
+		}
+		if (tp1->sent >= SCTP_DATAGRAM_RESEND) {
+			/* either a RESEND, ACKED, or MARKED */
+			/* skip */
+			if (tp1->sent == SCTP_FORWARD_TSN_SKIP) {
+				/* Continue striking FWD-TSN chunks */
+				tp1->rec.data.fwd_tsn_cnt++;
+			}
+			tp1 = TAILQ_NEXT(tp1, sctp_next);
+			continue;
+		}
+		/*
+		 * CMT : SFR algo (covers part of DAC and HTNA as well)
+		 */
+		if (tp1->whoTo && tp1->whoTo->saw_newack == 0) {
+			/*
+			 * No new acks were received for data sent to this
+			 * dest. Therefore, according to the SFR algo for
+			 * CMT, no data sent to this dest can be marked for
+			 * FR using this SACK.
+			 */
+			tp1 = TAILQ_NEXT(tp1, sctp_next);
+			continue;
+		} else if (tp1->whoTo && compare_with_wrap(tp1->rec.data.TSN_seq,
+		    tp1->whoTo->this_sack_highest_newack, MAX_TSN)) {
+			/*
+			 * CMT: New acks were received for data sent to
+			 * this dest. But no new acks were seen for data
+			 * sent after tp1. Therefore, according to the SFR
+			 * algo for CMT, tp1 cannot be marked for FR using
+			 * this SACK. This step covers part of the DAC algo
+			 * and the HTNA algo as well.
+			 */
+			tp1 = TAILQ_NEXT(tp1, sctp_next);
+			continue;
+		}
+		/*
+		 * Here we check to see if we have already done a FR and if
+		 * so we see if the biggest TSN we saw in the sack is
+		 * smaller than the recovery point. If so we don't strike
+		 * the tsn... otherwise we CAN strike the TSN.
+		 */
+		/*
+		 * @@@ JRI: Check for CMT if (accum_moved &&
+		 * asoc->fast_retran_loss_recovery && (sctp_cmt_on_off ==
+		 * 0)) {
+		 */
+		if (accum_moved && asoc->fast_retran_loss_recovery) {
+			/*
+			 * Strike the TSN if in fast-recovery and cum-ack
+			 * moved.
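+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. The tp1->sent field doubles as the per-chunk strike
+ * counter: each SACK that implies the TSN is missing bumps it one
+ * step toward the resend mark, and RFC 4960 (7.2.4) fast-retransmits
+ * after three miss indications. A toy model of that counter, with a
+ * hypothetical enum in place of the SCTP_DATAGRAM_* constants:]
+ */
+enum chunk_mark {
+	MK_SENT = 0, MK_MISS1, MK_MISS2, MK_RESEND	/* hypothetical */
+};
+
+static void
+strike_chunk(enum chunk_mark *mark)
+{
+	if (*mark < MK_RESEND)
+		(*mark)++;	/* third strike: fast-retransmit candidate */
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]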
+ */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(biggest_tsn_newly_acked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + tp1->sent++; + } + if ((asoc->sctp_cmt_on_off == 1) && + SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { + /* + * CMT DAC algorithm: If SACK flag is set to + * 0, then lowest_newack test will not pass + * because it would have been set to the + * cumack earlier. If not already to be + * rtx'd, If not a mixed sack and if tp1 is + * not between two sacked TSNs, then mark by + * one more. NOTE that we are marking by one + * additional time since the SACK DAC flag + * indicates that two packets have been + * received after this missing TSN. + */ + if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) && + compare_with_wrap(this_sack_lowest_newack, tp1->rec.data.TSN_seq, MAX_TSN)) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(16 + num_dests_sacked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + tp1->sent++; + } + } + } else if ((tp1->rec.data.doing_fast_retransmit) && + (asoc->sctp_cmt_on_off == 0)) { + /* + * For those that have done a FR we must take + * special consideration if we strike. I.e the + * biggest_newly_acked must be higher than the + * sending_seq at the time we did the FR. + */ + if ( +#ifdef SCTP_FR_TO_ALTERNATE + /* + * If FR's go to new networks, then we must only do + * this for singly homed asoc's. However if the FR's + * go to the same network (Armando's work) then its + * ok to FR multiple times. + */ + (asoc->numnets < 2) +#else + (1) +#endif + ) { + + if ((compare_with_wrap(biggest_tsn_newly_acked, + tp1->rec.data.fast_retran_tsn, MAX_TSN)) || + (biggest_tsn_newly_acked == + tp1->rec.data.fast_retran_tsn)) { + /* + * Strike the TSN, since this ack is + * beyond where things were when we + * did a FR. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(biggest_tsn_newly_acked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + tp1->sent++; + } + strike_flag = 1; + if ((asoc->sctp_cmt_on_off == 1) && + SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { + /* + * CMT DAC algorithm: If + * SACK flag is set to 0, + * then lowest_newack test + * will not pass because it + * would have been set to + * the cumack earlier. If + * not already to be rtx'd, + * If not a mixed sack and + * if tp1 is not between two + * sacked TSNs, then mark by + * one more. NOTE that we + * are marking by one + * additional time since the + * SACK DAC flag indicates + * that two packets have + * been received after this + * missing TSN. + */ + if ((tp1->sent < SCTP_DATAGRAM_RESEND) && + (num_dests_sacked == 1) && + compare_with_wrap(this_sack_lowest_newack, + tp1->rec.data.TSN_seq, MAX_TSN)) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(32 + num_dests_sacked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + tp1->sent++; + } + } + } + } + } + /* + * JRI: TODO: remove code for HTNA algo. CMT's SFR + * algo covers HTNA. + */ + } else if (compare_with_wrap(tp1->rec.data.TSN_seq, + biggest_tsn_newly_acked, MAX_TSN)) { + /* + * We don't strike these: This is the HTNA + * algorithm i.e. we don't strike If our TSN is + * larger than the Highest TSN Newly Acked. 
+ */ + ; + } else { + /* Strike the TSN */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(biggest_tsn_newly_acked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + tp1->sent++; + } + if ((asoc->sctp_cmt_on_off == 1) && + SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { + /* + * CMT DAC algorithm: If SACK flag is set to + * 0, then lowest_newack test will not pass + * because it would have been set to the + * cumack earlier. If not already to be + * rtx'd, If not a mixed sack and if tp1 is + * not between two sacked TSNs, then mark by + * one more. NOTE that we are marking by one + * additional time since the SACK DAC flag + * indicates that two packets have been + * received after this missing TSN. + */ + if ((tp1->sent < SCTP_DATAGRAM_RESEND) && (num_dests_sacked == 1) && + compare_with_wrap(this_sack_lowest_newack, tp1->rec.data.TSN_seq, MAX_TSN)) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(48 + num_dests_sacked, + tp1->rec.data.TSN_seq, + tp1->sent, + SCTP_FR_LOG_STRIKE_CHUNK); + } + tp1->sent++; + } + } + } + if (tp1->sent == SCTP_DATAGRAM_RESEND) { + struct sctp_nets *alt; + + /* fix counts and things */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND, + (tp1->whoTo ? (tp1->whoTo->flight_size) : 0), + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + if (tp1->whoTo) { + tp1->whoTo->net_ack++; + sctp_flight_size_decrease(tp1); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd(SCTP_INCREASE_PEER_RWND, + asoc->peers_rwnd, tp1->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); + } + /* add back to the rwnd */ + asoc->peers_rwnd += (tp1->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); + + /* remove from the total flight */ + sctp_total_flight_decrease(stcb, tp1); + + if ((stcb->asoc.peer_supports_prsctp) && + (PR_SCTP_RTX_ENABLED(tp1->flags))) { + /* + * Has it been retransmitted tv_sec times? - + * we store the retran count there. + */ + if (tp1->snd_count > tp1->rec.data.timetodrop.tv_sec) { + /* Yes, so drop it */ + if (tp1->data != NULL) { + (void)sctp_release_pr_sctp_chunk(stcb, tp1, + (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT), + SCTP_SO_NOT_LOCKED); + } + /* Make sure to flag we had a FR */ + tp1->whoTo->net_ack++; + tp1 = TAILQ_NEXT(tp1, sctp_next); + continue; + } + } + /* printf("OK, we are now ready to FR this guy\n"); */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) { + sctp_log_fr(tp1->rec.data.TSN_seq, tp1->snd_count, + 0, SCTP_FR_MARKED); + } + if (strike_flag) { + /* This is a subsequent FR */ + SCTP_STAT_INCR(sctps_sendmultfastretrans); + } + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + if (asoc->sctp_cmt_on_off == 1) { + /* + * CMT: Using RTX_SSTHRESH policy for CMT. + * If CMT is being used, then pick dest with + * largest ssthresh for any retransmission. 
+ */ + tp1->no_fr_allowed = 1; + alt = tp1->whoTo; + /* sa_ignore NO_NULL_CHK */ + if (asoc->sctp_cmt_pf > 0) { + /* + * JRS 5/18/07 - If CMT PF is on, + * use the PF version of + * find_alt_net() + */ + alt = sctp_find_alternate_net(stcb, alt, 2); + } else { + /* + * JRS 5/18/07 - If only CMT is on, + * use the CMT version of + * find_alt_net() + */ + /* sa_ignore NO_NULL_CHK */ + alt = sctp_find_alternate_net(stcb, alt, 1); + } + if (alt == NULL) { + alt = tp1->whoTo; + } + /* + * CUCv2: If a different dest is picked for + * the retransmission, then new + * (rtx-)pseudo_cumack needs to be tracked + * for orig dest. Let CUCv2 track new (rtx-) + * pseudo-cumack always. + */ + if (tp1->whoTo) { + tp1->whoTo->find_pseudo_cumack = 1; + tp1->whoTo->find_rtx_pseudo_cumack = 1; + } + } else {/* CMT is OFF */ + +#ifdef SCTP_FR_TO_ALTERNATE + /* Can we find an alternate? */ + alt = sctp_find_alternate_net(stcb, tp1->whoTo, 0); +#else + /* + * default behavior is to NOT retransmit + * FR's to an alternate. Armando Caro's + * paper details why. + */ + alt = tp1->whoTo; +#endif + } + + tp1->rec.data.doing_fast_retransmit = 1; + tot_retrans++; + /* mark the sending seq for possible subsequent FR's */ + /* + * printf("Marking TSN for FR new value %x\n", + * (uint32_t)tpi->rec.data.TSN_seq); + */ + if (TAILQ_EMPTY(&asoc->send_queue)) { + /* + * If the queue of send is empty then its + * the next sequence number that will be + * assigned so we subtract one from this to + * get the one we last sent. + */ + tp1->rec.data.fast_retran_tsn = sending_seq; + } else { + /* + * If there are chunks on the send queue + * (unsent data that has made it from the + * stream queues but not out the door, we + * take the first one (which will have the + * lowest TSN) and subtract one to get the + * one we last sent. + */ + struct sctp_tmit_chunk *ttt; + + ttt = TAILQ_FIRST(&asoc->send_queue); + tp1->rec.data.fast_retran_tsn = + ttt->rec.data.TSN_seq; + } + + if (tp1->do_rtt) { + /* + * this guy had a RTO calculation pending on + * it, cancel it + */ + tp1->do_rtt = 0; + } + if (alt != tp1->whoTo) { + /* yes, there is an alternate. */ + sctp_free_remote_addr(tp1->whoTo); + /* sa_ignore FREED_MEMORY */ + tp1->whoTo = alt; + atomic_add_int(&alt->ref_count, 1); + } + } + tp1 = TAILQ_NEXT(tp1, sctp_next); + } /* while (tp1) */ + + if (tot_retrans > 0) { + /* + * Setup the ecn nonce re-sync point. We do this since once + * we go to FR something we introduce a Karn's rule scenario + * and won't know the totals for the ECN bits. + */ + asoc->nonce_resync_tsn = sending_seq; + asoc->nonce_wait_for_ecne = 0; + asoc->nonce_sum_check = 0; + } +} + +struct sctp_tmit_chunk * +sctp_try_advance_peer_ack_point(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *tp1, *tp2, *a_adv = NULL; + struct timeval now; + int now_filled = 0; + + if (asoc->peer_supports_prsctp == 0) { + return (NULL); + } + tp1 = TAILQ_FIRST(&asoc->sent_queue); + while (tp1) { + if (tp1->sent != SCTP_FORWARD_TSN_SKIP && + tp1->sent != SCTP_DATAGRAM_RESEND) { + /* no chance to advance, out of here */ + break; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + if (tp1->sent == SCTP_FORWARD_TSN_SKIP) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + asoc->advanced_peer_ack_point, + tp1->rec.data.TSN_seq, 0, 0); + } + } + if (!PR_SCTP_ENABLED(tp1->flags)) { + /* + * We can't fwd-tsn past any that are reliable aka + * retransmitted until the asoc fails. 
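+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. This routine advances the PR-SCTP peer ack point only
+ * across abandoned (FORWARD-TSN-skippable) chunks and must stop at
+ * the first reliable one (RFC 3758). Distilled to a walk over an
+ * array of per-chunk marks, all names hypothetical:]
+ */
+#define MARK_SKIP 1	/* hypothetical: chunk was abandoned */
+
+static uint32_t
+advance_ack_point(uint32_t adv_point, const int *marks, int n)
+{
+	int i;
+
+	for (i = 0; i < n && marks[i] == MARK_SKIP; i++)
+		adv_point++;	/* consecutive TSNs from the queue head */
+	return (adv_point);
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]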
+ */ + break; + } + if (!now_filled) { + (void)SCTP_GETTIME_TIMEVAL(&now); + now_filled = 1; + } + tp2 = TAILQ_NEXT(tp1, sctp_next); + /* + * now we got a chunk which is marked for another + * retransmission to a PR-stream but has run out its chances + * already maybe OR has been marked to skip now. Can we skip + * it if its a resend? + */ + if (tp1->sent == SCTP_DATAGRAM_RESEND && + (PR_SCTP_TTL_ENABLED(tp1->flags))) { + /* + * Now is this one marked for resend and its time is + * now up? + */ + if (timevalcmp(&now, &tp1->rec.data.timetodrop, >)) { + /* Yes so drop it */ + if (tp1->data) { + (void)sctp_release_pr_sctp_chunk(stcb, tp1, + (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT), + SCTP_SO_NOT_LOCKED); + } + } else { + /* + * No, we are done when hit one for resend + * whos time as not expired. + */ + break; + } + } + /* + * Ok now if this chunk is marked to drop it we can clean up + * the chunk, advance our peer ack point and we can check + * the next chunk. + */ + if (tp1->sent == SCTP_FORWARD_TSN_SKIP) { + /* advance PeerAckPoint goes forward */ + if (compare_with_wrap(tp1->rec.data.TSN_seq, + asoc->advanced_peer_ack_point, + MAX_TSN)) { + + asoc->advanced_peer_ack_point = tp1->rec.data.TSN_seq; + a_adv = tp1; + } else if (tp1->rec.data.TSN_seq == asoc->advanced_peer_ack_point) { + /* No update but we do save the chk */ + a_adv = tp1; + } + } else { + /* + * If it is still in RESEND we can advance no + * further + */ + break; + } + /* + * If we hit here we just dumped tp1, move to next tsn on + * sent queue. + */ + tp1 = tp2; + } + return (a_adv); +} + +static int +sctp_fs_audit(struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + int inflight = 0, resend = 0, inbetween = 0, acked = 0, above = 0; + int entry_flight, entry_cnt, ret; + + entry_flight = asoc->total_flight; + entry_cnt = asoc->total_flight_count; + ret = 0; + + if (asoc->pr_sctp_cnt >= asoc->sent_queue_cnt) + return (0); + + TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { + if (chk->sent < SCTP_DATAGRAM_RESEND) { + printf("Chk TSN:%u size:%d inflight cnt:%d\n", + chk->rec.data.TSN_seq, + chk->send_size, + chk->snd_count + ); + inflight++; + } else if (chk->sent == SCTP_DATAGRAM_RESEND) { + resend++; + } else if (chk->sent < SCTP_DATAGRAM_ACKED) { + inbetween++; + } else if (chk->sent > SCTP_DATAGRAM_ACKED) { + above++; + } else { + acked++; + } + } + + if ((inflight > 0) || (inbetween > 0)) { +#ifdef INVARIANTS + panic("Flight size-express incorrect? \n"); +#else + printf("asoc->total_flight:%d cnt:%d\n", + entry_flight, entry_cnt); + + SCTP_PRINTF("Flight size-express incorrect F:%d I:%d R:%d Ab:%d ACK:%d\n", + inflight, inbetween, resend, above, acked); + ret = 1; +#endif + } + return (ret); +} + + +static void +sctp_window_probe_recovery(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_nets *net, + struct sctp_tmit_chunk *tp1) +{ + tp1->window_probe = 0; + if ((tp1->sent >= SCTP_DATAGRAM_ACKED) || (tp1->data == NULL)) { + /* TSN's skipped we do NOT move back. 
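+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. sctp_fs_audit() above cross-checks the cached flight
+ * size against the sent queue; the invariant it enforces boils down
+ * to "total_flight equals the sum of book_size over chunks still in
+ * flight". A self-contained restatement with hypothetical arrays in
+ * place of the chunk list:]
+ */
+static int
+flight_size_consistent(const uint32_t *book_size, const int *in_flight,
+    int n, uint32_t total_flight)
+{
+	uint32_t sum = 0;
+	int i;
+
+	for (i = 0; i < n; i++)
+		if (in_flight[i])	/* sent < RESEND in the real code */
+			sum += book_size[i];
+	return (sum == total_flight);
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]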
*/ + sctp_misc_ints(SCTP_FLIGHT_LOG_DWN_WP_FWD, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + return; + } + /* First setup this by shrinking flight */ + sctp_flight_size_decrease(tp1); + sctp_total_flight_decrease(stcb, tp1); + /* Now mark for resend */ + tp1->sent = SCTP_DATAGRAM_RESEND; + sctp_ucount_incr(asoc->sent_queue_retran_cnt); + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_WP, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } +} + +void +sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, + uint32_t rwnd, int nonce_sum_flag, int *abort_now) +{ + struct sctp_nets *net; + struct sctp_association *asoc; + struct sctp_tmit_chunk *tp1, *tp2; + uint32_t old_rwnd; + int win_probe_recovery = 0; + int win_probe_recovered = 0; + int j, done_once = 0; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) { + sctp_misc_ints(SCTP_SACK_LOG_EXPRESS, cumack, + rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd); + } + SCTP_TCB_LOCK_ASSERT(stcb); +#ifdef SCTP_ASOCLOG_OF_TSNS + stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cumack; + stcb->asoc.cumack_log_at++; + if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) { + stcb->asoc.cumack_log_at = 0; + } +#endif + asoc = &stcb->asoc; + old_rwnd = asoc->peers_rwnd; + if (compare_with_wrap(asoc->last_acked_seq, cumack, MAX_TSN)) { + /* old ack */ + return; + } else if (asoc->last_acked_seq == cumack) { + /* Window update sack */ + asoc->peers_rwnd = sctp_sbspace_sub(rwnd, + (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + if (asoc->peers_rwnd > old_rwnd) { + goto again; + } + return; + } + /* First setup for CC stuff */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->prev_cwnd = net->cwnd; + net->net_ack = 0; + net->net_ack2 = 0; + + /* + * CMT: Reset CUC and Fast recovery algo variables before + * SACK processing + */ + net->new_pseudo_cumack = 0; + net->will_exit_fast_recovery = 0; + } + if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { + uint32_t send_s; + + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + tp1 = TAILQ_LAST(&asoc->sent_queue, + sctpchunk_listhead); + send_s = tp1->rec.data.TSN_seq + 1; + } else { + send_s = asoc->sending_seq; + } + if ((cumack == send_s) || + compare_with_wrap(cumack, send_s, MAX_TSN)) { +#ifndef INVARIANTS + struct mbuf *oper; + +#endif +#ifdef INVARIANTS + panic("Impossible sack 1"); +#else + + *abort_now = 1; + /* XXX */ + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + sizeof(uint32_t); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; + sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + return; +#endif + } + } + asoc->this_sack_highest_gap = cumack; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + 
stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INDATA, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + if (compare_with_wrap(cumack, asoc->last_acked_seq, MAX_TSN)) { + /* process the new consecutive TSN first */ + tp1 = TAILQ_FIRST(&asoc->sent_queue); + while (tp1) { + tp2 = TAILQ_NEXT(tp1, sctp_next); + if (compare_with_wrap(cumack, tp1->rec.data.TSN_seq, + MAX_TSN) || + cumack == tp1->rec.data.TSN_seq) { + if (tp1->sent == SCTP_DATAGRAM_UNSENT) { + printf("Warning, an unsent is now acked?\n"); + } + /* + * ECN Nonce: Add the nonce to the sender's + * nonce sum + */ + asoc->nonce_sum_expect_base += tp1->rec.data.ect_nonce; + if (tp1->sent < SCTP_DATAGRAM_ACKED) { + /* + * If it is less than ACKED, it is + * now no-longer in flight. Higher + * values may occur during marking + */ + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + sctp_flight_size_decrease(tp1); + /* sa_ignore NO_NULL_CHK */ + sctp_total_flight_decrease(stcb, tp1); + } + tp1->whoTo->net_ack += tp1->send_size; + if (tp1->snd_count < 2) { + /* + * True non-retransmited + * chunk + */ + tp1->whoTo->net_ack2 += + tp1->send_size; + + /* update RTO too? */ + if (tp1->do_rtt) { + tp1->whoTo->RTO = + /* + * sa_ignore + * NO_NULL_CHK + */ + sctp_calculate_rto(stcb, + asoc, tp1->whoTo, + &tp1->sent_rcv_time, + sctp_align_safe_nocopy); + tp1->do_rtt = 0; + } + } + /* + * CMT: CUCv2 algorithm. From the + * cumack'd TSNs, for each TSN being + * acked for the first time, set the + * following variables for the + * corresp destination. + * new_pseudo_cumack will trigger a + * cwnd update. + * find_(rtx_)pseudo_cumack will + * trigger search for the next + * expected (rtx-)pseudo-cumack. 
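+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. The CUCv2 idea described here gives every path its
+ * own stand-in for the cumulative ack: the path remembers one TSN it
+ * expects to be acked next, and only when exactly that TSN is newly
+ * acked does the path report a pseudo-cumack move and allow a cwnd
+ * update. A toy per-path tracker (hypothetical struct, not the real
+ * sctp_nets):]
+ */
+struct path_track {
+	uint32_t pseudo_cumack;		/* TSN this path is watching */
+	int find_pseudo_cumack;		/* re-arm the search? */
+	int new_pseudo_cumack;		/* set => cwnd may be updated */
+};
+
+static void
+pseudo_cumack_acked(struct path_track *p, uint32_t acked_tsn)
+{
+	if (p->pseudo_cumack == acked_tsn) {
+		p->new_pseudo_cumack = 1;
+		p->find_pseudo_cumack = 1;	/* go watch the next TSN */
+	}
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]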
+ */ + tp1->whoTo->new_pseudo_cumack = 1; + tp1->whoTo->find_pseudo_cumack = 1; + tp1->whoTo->find_rtx_pseudo_cumack = 1; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + /* sa_ignore NO_NULL_CHK */ + sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); + } + } + if (tp1->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_decr(asoc->sent_queue_retran_cnt); + } + if (tp1->rec.data.chunk_was_revoked) { + /* deflate the cwnd */ + tp1->whoTo->cwnd -= tp1->book_size; + tp1->rec.data.chunk_was_revoked = 0; + } + tp1->sent = SCTP_DATAGRAM_ACKED; + TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); + if (tp1->data) { + /* sa_ignore NO_NULL_CHK */ + sctp_free_bufspace(stcb, asoc, tp1, 1); + sctp_m_freem(tp1->data); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(asoc->last_acked_seq, + cumack, + tp1->rec.data.TSN_seq, + 0, + 0, + SCTP_LOG_FREE_SENT); + } + tp1->data = NULL; + asoc->sent_queue_cnt--; + sctp_free_a_chunk(stcb, tp1); + tp1 = tp2; + } else { + break; + } + } + + } + /* sa_ignore NO_NULL_CHK */ + if (stcb->sctp_socket) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + SOCKBUF_LOCK(&stcb->sctp_socket->so_snd); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { + /* sa_ignore NO_NULL_CHK */ + sctp_wakeup_log(stcb, cumack, 1, SCTP_WAKESND_FROM_SACK); + } +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { + sctp_wakeup_log(stcb, cumack, 1, SCTP_NOWAKE_FROM_SACK); + } + } + + /* JRS - Use the congestion control given in the CC module */ + if (asoc->last_acked_seq != cumack) + asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, 1, 0, 0); + + asoc->last_acked_seq = cumack; + + if (TAILQ_EMPTY(&asoc->sent_queue)) { + /* nothing left in-flight */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->flight_size = 0; + net->partial_bytes_acked = 0; + } + asoc->total_flight = 0; + asoc->total_flight_count = 0; + } + /* ECN Nonce updates */ + if (asoc->ecn_nonce_allowed) { + if (asoc->nonce_sum_check) { + if (nonce_sum_flag != ((asoc->nonce_sum_expect_base) & SCTP_SACK_NONCE_SUM)) { + if (asoc->nonce_wait_for_ecne == 0) { + struct sctp_tmit_chunk *lchk; + + lchk = TAILQ_FIRST(&asoc->send_queue); + asoc->nonce_wait_for_ecne = 1; + if (lchk) { + asoc->nonce_wait_tsn = lchk->rec.data.TSN_seq; + } else { + asoc->nonce_wait_tsn = asoc->sending_seq; + } + } else { + if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_wait_tsn, MAX_TSN) || + (asoc->last_acked_seq == asoc->nonce_wait_tsn)) { + /* + * Misbehaving peer. We need + * to react to this guy + */ + asoc->ecn_allowed = 0; + asoc->ecn_nonce_allowed = 0; + } + } + } + } else { + /* See if Resynchronization Possible */ + if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_resync_tsn, MAX_TSN)) { + asoc->nonce_sum_check = 1; + /* + * Now we must calculate what the base is. 
+				 * We do this based on two things: we know
+				 * the totals for all the segments
+				 * gap-acked in the SACK (none). We also
+				 * know the SACK's nonce sum, it's in
+				 * nonce_sum_flag. So we can build a truth
+				 * table to back-calculate the new value of
+				 * asoc->nonce_sum_expect_base:
+				 *
+				 *   SACK-flag-Value   Seg-Sums   Base
+				 *         0               0        0
+				 *         1               0        1
+				 *         0               1        1
+				 *         1               1        0
+				 */
+				asoc->nonce_sum_expect_base = (0 ^ nonce_sum_flag) & SCTP_SACK_NONCE_SUM;
+			}
+		}
+	}
+	/* RWND update */
+	asoc->peers_rwnd = sctp_sbspace_sub(rwnd,
+	    (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))));
+	if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) {
+		/* SWS sender side engages */
+		asoc->peers_rwnd = 0;
+	}
+	if (asoc->peers_rwnd > old_rwnd) {
+		win_probe_recovery = 1;
+	}
+	/* Now assure a timer where data is queued at */
+again:
+	j = 0;
+	TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+		int to_ticks;
+
+		if (win_probe_recovery && (net->window_probe)) {
+			win_probe_recovered = 1;
+			/*
+			 * Find first chunk that was used with window probe
+			 * and clear the sent
+			 */
+			/* sa_ignore FREED_MEMORY */
+			TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+				if (tp1->window_probe) {
+					/* move back to data send queue */
+					sctp_window_probe_recovery(stcb, asoc, net, tp1);
+					break;
+				}
+			}
+		}
+		if (net->RTO == 0) {
+			to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto);
+		} else {
+			to_ticks = MSEC_TO_TICKS(net->RTO);
+		}
+		if (net->flight_size) {
+			j++;
+			(void)SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks,
+			    sctp_timeout_handler, &net->rxt_timer);
+			if (net->window_probe) {
+				net->window_probe = 0;
+			}
+		} else {
+			if (net->window_probe) {
+				/*
+				 * In window probes we must assure a timer
+				 * is still running there
+				 */
+				net->window_probe = 0;
+				if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+					SCTP_OS_TIMER_START(&net->rxt_timer.timer, to_ticks,
+					    sctp_timeout_handler, &net->rxt_timer);
+				}
+			} else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) {
+				sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep,
+				    stcb, net,
+				    SCTP_FROM_SCTP_INDATA + SCTP_LOC_22);
+			}
+			if (SCTP_BASE_SYSCTL(sctp_early_fr)) {
+				if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) {
+					SCTP_STAT_INCR(sctps_earlyfrstpidsck4);
+					sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net,
+					    SCTP_FROM_SCTP_INDATA + SCTP_LOC_23);
+				}
+			}
+		}
+	}
+	if ((j == 0) &&
+	    (!TAILQ_EMPTY(&asoc->sent_queue)) &&
+	    (asoc->sent_queue_retran_cnt == 0) &&
+	    (win_probe_recovered == 0) &&
+	    (done_once == 0)) {
+		/*
+		 * huh, this should not happen unless all packets are
+		 * PR-SCTP and marked to skip of course.
+		 */
+		if (sctp_fs_audit(asoc)) {
+			TAILQ_FOREACH(net, &asoc->nets, sctp_next) {
+				net->flight_size = 0;
+			}
+			asoc->total_flight = 0;
+			asoc->total_flight_count = 0;
+			asoc->sent_queue_retran_cnt = 0;
+			TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) {
+				if (tp1->sent < SCTP_DATAGRAM_RESEND) {
+					sctp_flight_size_increase(tp1);
+					sctp_total_flight_increase(stcb, tp1);
+				} else if (tp1->sent == SCTP_DATAGRAM_RESEND) {
+					sctp_ucount_incr(asoc->sent_queue_retran_cnt);
+				}
+			}
+		}
+		done_once = 1;
+		goto again;
+	}
+	/**********************************/
+	/* Now what about shutdown issues */
+	/**********************************/
+	if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) {
+		/* nothing left on sendqueue..
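+		 *
+		 * [Editor's note -- illustrative sketch, not part of the
+		 * original FreeBSD code. Once both queues are empty the
+		 * branch below reduces to a small decision table: with no
+		 * stream data pending, a locally requested close sends
+		 * SHUTDOWN, while a peer-initiated close sends
+		 * SHUTDOWN-ACK. A hypothetical distillation:]
+		 */
+enum close_action { DO_NOTHING, SEND_SHUTDOWN, SEND_SHUTDOWN_ACK };
+
+static enum close_action
+drained_queue_action(int shutdown_pending, int shutdown_received,
+    int stream_queue_cnt)
+{
+	if (stream_queue_cnt != 0)
+		return (DO_NOTHING);	/* still data to move */
+	if (shutdown_pending)
+		return (SEND_SHUTDOWN);		/* we close first */
+	if (shutdown_received)
+		return (SEND_SHUTDOWN_ACK);	/* peer closed first */
+	return (DO_NOTHING);
+}
+		/*
+		 * [End of editor's sketch.] Nothing left on the queues, so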
consider done */ + /* clean up */ + if ((asoc->stream_queue_cnt == 1) && + ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || + (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && + (asoc->locked_on_sending) + ) { + struct sctp_stream_queue_pending *sp; + + /* + * I may be in a state where we got all across.. but + * cannot write more due to a shutdown... we abort + * since the user did not indicate EOR in this case. + * The sp will be cleaned during free of the asoc. + */ + sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), + sctp_streamhead); + if ((sp) && (sp->length == 0)) { + /* Let cleanup code purge it */ + if (sp->msg_is_complete) { + asoc->stream_queue_cnt--; + } else { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + asoc->locked_on_sending = NULL; + asoc->stream_queue_cnt--; + } + } + } + if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && + (asoc->stream_queue_cnt == 0)) { + if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { + /* Need to abort here */ + struct mbuf *oper; + + abort_out_now: + *abort_now = 1; + /* XXX */ + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + sizeof(uint32_t); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_24); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_24; + sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED); + } else { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown(stcb, + stcb->asoc.primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && + (asoc->stream_queue_cnt == 0)) { + if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { + goto abort_out_now; + } + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_send_shutdown_ack(stcb, + stcb->asoc.primary_destination); + sctp_stop_timers_for_shutdown(stcb); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + } + /*********************************************/ + /* Here we perform PR-SCTP procedures */ + /* (section 4.2) */ + /*********************************************/ + /* C1. update advancedPeerAckPoint */ + if (compare_with_wrap(cumack, asoc->advanced_peer_ack_point, MAX_TSN)) { + asoc->advanced_peer_ack_point = cumack; + } + /* PR-Sctp issues need to be addressed too */ + if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) { + struct sctp_tmit_chunk *lchk; + uint32_t old_adv_peer_ack_point; + + old_adv_peer_ack_point = asoc->advanced_peer_ack_point; + lchk = sctp_try_advance_peer_ack_point(stcb, asoc); + /* C3. 
See if we need to send a Fwd-TSN */
+		if (compare_with_wrap(asoc->advanced_peer_ack_point, cumack,
+		    MAX_TSN)) {
+			/*
+			 * ISSUE with ECN, see FWD-TSN processing for notes
+			 * on issues that will occur when the ECN NONCE
+			 * stuff is put into SCTP for cross checking.
+			 */
+			if (compare_with_wrap(asoc->advanced_peer_ack_point, old_adv_peer_ack_point,
+			    MAX_TSN)) {
+				send_forward_tsn(stcb, asoc);
+				/*
+				 * ECN Nonce: Disable Nonce Sum check when
+				 * FWD TSN is sent and store resync tsn
+				 */
+				asoc->nonce_sum_check = 0;
+				asoc->nonce_resync_tsn = asoc->advanced_peer_ack_point;
+			} else if (lchk) {
+				/* try to FR fwd-tsn's that get lost too */
+				if (lchk->rec.data.fwd_tsn_cnt >= 3) {
+					send_forward_tsn(stcb, asoc);
+				}
+			}
+		}
+		if (lchk) {
+			/* Assure a timer is up */
+			sctp_timer_start(SCTP_TIMER_TYPE_SEND,
+			    stcb->sctp_ep, stcb, lchk->whoTo);
+		}
+	}
+	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) {
+		sctp_misc_ints(SCTP_SACK_RWND_UPDATE,
+		    rwnd,
+		    stcb->asoc.peers_rwnd,
+		    stcb->asoc.total_flight,
+		    stcb->asoc.total_output_queue_size);
+	}
+}
+
+void
+sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup,
+    struct sctp_tcb *stcb, struct sctp_nets *net_from,
+    uint16_t num_seg, uint16_t num_nr_seg, uint16_t num_dup,
+    int *abort_now, uint8_t flags,
+    uint32_t cum_ack, uint32_t rwnd)
+{
+	struct sctp_association *asoc;
+	struct sctp_tmit_chunk *tp1, *tp2;
+	uint32_t last_tsn, biggest_tsn_acked, biggest_tsn_newly_acked, this_sack_lowest_newack;
+	uint32_t sav_cum_ack;
+	uint16_t wake_him = 0;
+	uint32_t send_s = 0;
+	long j;
+	int accum_moved = 0;
+	int will_exit_fast_recovery = 0;
+	uint32_t a_rwnd, old_rwnd;
+	int win_probe_recovery = 0;
+	int win_probe_recovered = 0;
+	struct sctp_nets *net = NULL;
+	int nonce_sum_flag, ecn_seg_sums = 0;
+	int done_once;
+	uint8_t reneged_all = 0;
+	uint8_t cmt_dac_flag;
+
+	/*
+	 * we take any chance we can to service our queues since we cannot
+	 * get awoken when the socket is read from :<
+	 */
+	/*
+	 * Now perform the actual SACK handling:
+	 * 1) Verify that it is not an old sack; if so, discard.
+	 * 2) If there is nothing left in the send queue (cum-ack is equal
+	 *    to last acked) then you have a duplicate too; update any rwnd
+	 *    change and verify no timers are running, then return.
+	 * 3) Process any new consecutive data, i.e. cum-ack moved; process
+	 *    these first and note that it moved.
+	 * 4) Process any sack blocks.
+	 * 5) Drop any acked chunks from the queue.
+	 * 6) Check for any revoked blocks and mark them.
+	 * 7) Update the cwnd.
+	 * 8) Nothing left: sync up flightsizes and things, stop all timers
+	 *    and also check for shutdown_pending state. If so then go ahead
+	 *    and send off the shutdown. If in shutdown recv, send off the
+	 *    shutdown-ack and start that timer; return.
+	 * 9) Strike any non-acked things and do FR procedure if needed,
+	 *    being sure to set the FR flag.
+	 * 10) Do pr-sctp procedures.
+	 * 11) Apply any FR penalties.
+	 * 12) Assure we will SACK if in shutdown_recv state.
+ */ + SCTP_TCB_LOCK_ASSERT(stcb); + /* CMT DAC algo */ + this_sack_lowest_newack = 0; + j = 0; + SCTP_STAT_INCR(sctps_slowpath_sack); + last_tsn = cum_ack; + nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM; + cmt_dac_flag = flags & SCTP_SACK_CMT_DAC; +#ifdef SCTP_ASOCLOG_OF_TSNS + stcb->asoc.cumack_log[stcb->asoc.cumack_log_at] = cum_ack; + stcb->asoc.cumack_log_at++; + if (stcb->asoc.cumack_log_at > SCTP_TSN_LOG_SIZE) { + stcb->asoc.cumack_log_at = 0; + } +#endif + a_rwnd = rwnd; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_SACK_ARRIVALS_ENABLE) { + sctp_misc_ints(SCTP_SACK_LOG_NORMAL, cum_ack, + rwnd, stcb->asoc.last_acked_seq, stcb->asoc.peers_rwnd); + } + old_rwnd = stcb->asoc.peers_rwnd; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INDATA, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + asoc = &stcb->asoc; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(asoc->last_acked_seq, + cum_ack, + 0, + num_seg, + num_dup, + SCTP_LOG_NEW_SACK); + } + if ((num_dup) && (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_FR_LOGGING_ENABLE | SCTP_EARLYFR_LOGGING_ENABLE))) { + uint16_t i; + uint32_t *dupdata, dblock; + + for (i = 0; i < num_dup; i++) { + dupdata = (uint32_t *) sctp_m_getptr(m, offset_dup + i * sizeof(uint32_t), + sizeof(uint32_t), (uint8_t *) & dblock); + if (dupdata == NULL) { + break; + } + sctp_log_fr(*dupdata, 0, 0, SCTP_FR_DUPED); + } + } + if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { + /* reality check */ + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + tp1 = TAILQ_LAST(&asoc->sent_queue, + sctpchunk_listhead); + send_s = tp1->rec.data.TSN_seq + 1; + } else { + tp1 = NULL; + send_s = asoc->sending_seq; + } + if (cum_ack == send_s || + compare_with_wrap(cum_ack, send_s, MAX_TSN)) { + struct mbuf *oper; + + /* + * no way, we have not even sent this TSN out yet. + * Peer is hopelessly messed up with us. 
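+ *
+ * [Editor's note -- illustrative sketch, not part of the original
+ * FreeBSD code. The abort path below builds an RFC 4960 error cause:
+ * a 4-byte cause header followed, here, by a 32-bit internal location
+ * code. A hypothetical stand-in for the paramhdr-based construction,
+ * assuming the usual htons()/htonl() byte-order helpers:]
+ */
+struct proto_violation_cause {	/* hypothetical wire-layout struct */
+	uint16_t cause_code;	/* e.g. SCTP_CAUSE_PROTOCOL_VIOLATION */
+	uint16_t cause_length;	/* total length, this header included */
+	uint32_t where;		/* e.g. SCTP_FROM_SCTP_INDATA + loc */
+};
+
+static void
+fill_violation_cause(struct proto_violation_cause *c, uint16_t code,
+    uint32_t where)
+{
+	c->cause_code = htons(code);	/* all fields in network order */
+	c->cause_length = htons((uint16_t)sizeof(*c));
+	c->where = htonl(where);
+}
+/*
+ * [End of editor's sketch. Original comment resumes:]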
+ */ + printf("NEW cum_ack:%x send_s:%x is smaller or equal\n", + cum_ack, send_s); + if (tp1) { + printf("Got send_s from tsn:%x + 1 of tp1:%p\n", + tp1->rec.data.TSN_seq, tp1); + } + hopeless_peer: + *abort_now = 1; + /* XXX */ + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + sizeof(uint32_t); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_25); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_25; + sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + return; + } + } + /**********************/ + /* 1) check the range */ + /**********************/ + if (compare_with_wrap(asoc->last_acked_seq, last_tsn, MAX_TSN)) { + /* acking something behind */ + return; + } + sav_cum_ack = asoc->last_acked_seq; + + /* update the Rwnd of the peer */ + if (TAILQ_EMPTY(&asoc->sent_queue) && + TAILQ_EMPTY(&asoc->send_queue) && + (asoc->stream_queue_cnt == 0)) { + /* nothing left on send/sent and strmq */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, + asoc->peers_rwnd, 0, 0, a_rwnd); + } + asoc->peers_rwnd = a_rwnd; + if (asoc->sent_queue_retran_cnt) { + asoc->sent_queue_retran_cnt = 0; + } + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + /* stop any timers */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_26); + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck1); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_26); + } + } + net->partial_bytes_acked = 0; + net->flight_size = 0; + } + asoc->total_flight = 0; + asoc->total_flight_count = 0; + return; + } + /* + * We init netAckSz and netAckSz2 to 0. These are used to track 2 + * things. The total byte count acked is tracked in netAckSz AND + * netAck2 is used to track the total bytes acked that are un- + * amibguious and were never retransmitted. We track these on a per + * destination address basis. + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->prev_cwnd = net->cwnd; + net->net_ack = 0; + net->net_ack2 = 0; + + /* + * CMT: Reset CUC and Fast recovery algo variables before + * SACK processing + */ + net->new_pseudo_cumack = 0; + net->will_exit_fast_recovery = 0; + } + /* process the new consecutive TSN first */ + tp1 = TAILQ_FIRST(&asoc->sent_queue); + while (tp1) { + if (compare_with_wrap(last_tsn, tp1->rec.data.TSN_seq, + MAX_TSN) || + last_tsn == tp1->rec.data.TSN_seq) { + if (tp1->sent != SCTP_DATAGRAM_UNSENT) { + /* + * ECN Nonce: Add the nonce to the sender's + * nonce sum + */ + asoc->nonce_sum_expect_base += tp1->rec.data.ect_nonce; + accum_moved = 1; + if (tp1->sent < SCTP_DATAGRAM_ACKED) { + /* + * If it is less than ACKED, it is + * now no-longer in flight. 
Higher + * values may occur during marking + */ + if ((tp1->whoTo->dest_state & + SCTP_ADDR_UNCONFIRMED) && + (tp1->snd_count < 2)) { + /* + * If there was no retran + * and the address is + * un-confirmed and we sent + * there and are now + * sacked.. its confirmed, + * mark it so. + */ + tp1->whoTo->dest_state &= + ~SCTP_ADDR_UNCONFIRMED; + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_CA, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + sctp_flight_size_decrease(tp1); + sctp_total_flight_decrease(stcb, tp1); + } + tp1->whoTo->net_ack += tp1->send_size; + + /* CMT SFR and DAC algos */ + this_sack_lowest_newack = tp1->rec.data.TSN_seq; + tp1->whoTo->saw_newack = 1; + + if (tp1->snd_count < 2) { + /* + * True non-retransmited + * chunk + */ + tp1->whoTo->net_ack2 += + tp1->send_size; + + /* update RTO too? */ + if (tp1->do_rtt) { + tp1->whoTo->RTO = + sctp_calculate_rto(stcb, + asoc, tp1->whoTo, + &tp1->sent_rcv_time, + sctp_align_safe_nocopy); + tp1->do_rtt = 0; + } + } + /* + * CMT: CUCv2 algorithm. From the + * cumack'd TSNs, for each TSN being + * acked for the first time, set the + * following variables for the + * corresp destination. + * new_pseudo_cumack will trigger a + * cwnd update. + * find_(rtx_)pseudo_cumack will + * trigger search for the next + * expected (rtx-)pseudo-cumack. + */ + tp1->whoTo->new_pseudo_cumack = 1; + tp1->whoTo->find_pseudo_cumack = 1; + tp1->whoTo->find_rtx_pseudo_cumack = 1; + + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(asoc->last_acked_seq, + cum_ack, + tp1->rec.data.TSN_seq, + 0, + 0, + SCTP_LOG_TSN_ACKED); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, tp1->whoTo, tp1->rec.data.TSN_seq, SCTP_CWND_LOG_FROM_SACK); + } + } + if (tp1->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_decr(asoc->sent_queue_retran_cnt); +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xB3, + (asoc->sent_queue_retran_cnt & 0x000000ff)); +#endif + } + if (tp1->rec.data.chunk_was_revoked) { + /* deflate the cwnd */ + tp1->whoTo->cwnd -= tp1->book_size; + tp1->rec.data.chunk_was_revoked = 0; + } + tp1->sent = SCTP_DATAGRAM_ACKED; + } + } else { + break; + } + tp1 = TAILQ_NEXT(tp1, sctp_next); + } + biggest_tsn_newly_acked = biggest_tsn_acked = last_tsn; + /* always set this up to cum-ack */ + asoc->this_sack_highest_gap = last_tsn; + + if ((num_seg > 0) || (num_nr_seg > 0)) { + + /* + * CMT: SFR algo (and HTNA) - this_sack_highest_newack has + * to be greater than the cumack. Also reset saw_newack to 0 + * for all dests. + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->saw_newack = 0; + net->this_sack_highest_newack = last_tsn; + } + + /* + * thisSackHighestGap will increase while handling NEW + * segments this_sack_highest_newack will increase while + * handling NEWLY ACKED chunks. this_sack_lowest_newack is + * used for CMT DAC algo. saw_newack will also change. + */ + if (sctp_handle_segments(m, &offset_seg, stcb, asoc, last_tsn, &biggest_tsn_acked, + &biggest_tsn_newly_acked, &this_sack_lowest_newack, + num_seg, num_nr_seg, &ecn_seg_sums)) { + wake_him++; + } + if (SCTP_BASE_SYSCTL(sctp_strict_sacks)) { + /* + * validate the biggest_tsn_acked in the gap acks if + * strict adherence is wanted. 
+ */ + if ((biggest_tsn_acked == send_s) || + (compare_with_wrap(biggest_tsn_acked, send_s, MAX_TSN))) { + /* + * peer is either confused or we are under + * attack. We must abort. + */ + printf("Hopeless peer! biggest_tsn_acked:%x largest seq:%x\n", + biggest_tsn_acked, + send_s); + + goto hopeless_peer; + } + } + } + /*******************************************/ + /* cancel ALL T3-send timer if accum moved */ + /*******************************************/ + if (asoc->sctp_cmt_on_off == 1) { + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if (net->new_pseudo_cumack) + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_27); + + } + } else { + if (accum_moved) { + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_28); + } + } + } + /********************************************/ + /* drop the acked chunks from the sentqueue */ + /********************************************/ + asoc->last_acked_seq = cum_ack; + + tp1 = TAILQ_FIRST(&asoc->sent_queue); + if (tp1 == NULL) + goto done_with_it; + do { + if (compare_with_wrap(tp1->rec.data.TSN_seq, cum_ack, + MAX_TSN)) { + break; + } + if (tp1->sent == SCTP_DATAGRAM_UNSENT) { + /* no more sent on list */ + printf("Warning, tp1->sent == %d and its now acked?\n", + tp1->sent); + } + tp2 = TAILQ_NEXT(tp1, sctp_next); + TAILQ_REMOVE(&asoc->sent_queue, tp1, sctp_next); + if (tp1->pr_sctp_on) { + if (asoc->pr_sctp_cnt != 0) + asoc->pr_sctp_cnt--; + } + if (TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->total_flight > 0)) { +#ifdef INVARIANTS + panic("Warning flight size is postive and should be 0"); +#else + SCTP_PRINTF("Warning flight size incorrect should be 0 is %d\n", + asoc->total_flight); +#endif + asoc->total_flight = 0; + } + if (tp1->data) { + /* sa_ignore NO_NULL_CHK */ + sctp_free_bufspace(stcb, asoc, tp1, 1); + sctp_m_freem(tp1->data); + if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(tp1->flags)) { + asoc->sent_queue_cnt_removeable--; + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_LOGGING_ENABLE) { + sctp_log_sack(asoc->last_acked_seq, + cum_ack, + tp1->rec.data.TSN_seq, + 0, + 0, + SCTP_LOG_FREE_SENT); + } + tp1->data = NULL; + asoc->sent_queue_cnt--; + sctp_free_a_chunk(stcb, tp1); + wake_him++; + tp1 = tp2; + } while (tp1 != NULL); + +done_with_it: + /* sa_ignore NO_NULL_CHK */ + if ((wake_him) && (stcb->sctp_socket)) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + SOCKBUF_LOCK(&stcb->sctp_socket->so_snd); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { + sctp_wakeup_log(stcb, cum_ack, wake_him, SCTP_WAKESND_FROM_SACK); + } +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + sctp_sowwakeup_locked(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_WAKE_LOGGING_ENABLE) { + sctp_wakeup_log(stcb, cum_ack, wake_him, SCTP_NOWAKE_FROM_SACK); + } + } + + if (asoc->fast_retran_loss_recovery && accum_moved) { + if 
(compare_with_wrap(asoc->last_acked_seq, + asoc->fast_recovery_tsn, MAX_TSN) || + asoc->last_acked_seq == asoc->fast_recovery_tsn) { + /* Setup so we will exit RFC2582 fast recovery */ + will_exit_fast_recovery = 1; + } + } + /* + * Check for revoked fragments: + * + * if Previous sack - Had no frags then we can't have any revoked if + * Previous sack - Had frag's then - If we now have frags aka + * num_seg > 0 call sctp_check_for_revoked() to tell if peer revoked + * some of them. else - The peer revoked all ACKED fragments, since + * we had some before and now we have NONE. + */ + + if (num_seg) { + sctp_check_for_revoked(stcb, asoc, cum_ack, biggest_tsn_acked); + asoc->saw_sack_with_frags = 1; + } else if (asoc->saw_sack_with_frags) { + int cnt_revoked = 0; + + tp1 = TAILQ_FIRST(&asoc->sent_queue); + if (tp1 != NULL) { + /* Peer revoked all dg's marked or acked */ + TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { + if (tp1->sent == SCTP_DATAGRAM_ACKED) { + tp1->sent = SCTP_DATAGRAM_SENT; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_UP_REVOKE, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) tp1->whoTo, + tp1->rec.data.TSN_seq); + } + sctp_flight_size_increase(tp1); + sctp_total_flight_increase(stcb, tp1); + tp1->rec.data.chunk_was_revoked = 1; + /* + * To ensure that this increase in + * flightsize, which is artificial, + * does not throttle the sender, we + * also increase the cwnd + * artificially. + */ + tp1->whoTo->cwnd += tp1->book_size; + cnt_revoked++; + } + } + if (cnt_revoked) { + reneged_all = 1; + } + } + asoc->saw_sack_with_frags = 0; + } + if (num_nr_seg > 0) + asoc->saw_sack_with_nr_frags = 1; + else + asoc->saw_sack_with_nr_frags = 0; + + /* JRS - Use the congestion control given in the CC module */ + asoc->cc_functions.sctp_cwnd_update_after_sack(stcb, asoc, accum_moved, reneged_all, will_exit_fast_recovery); + + if (TAILQ_EMPTY(&asoc->sent_queue)) { + /* nothing left in-flight */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + /* stop all timers */ + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck4); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_29); + } + } + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, SCTP_FROM_SCTP_INDATA + SCTP_LOC_30); + net->flight_size = 0; + net->partial_bytes_acked = 0; + } + asoc->total_flight = 0; + asoc->total_flight_count = 0; + } + /**********************************/ + /* Now what about shutdown issues */ + /**********************************/ + if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue)) { + /* nothing left on sendqueue.. consider done */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, + asoc->peers_rwnd, 0, 0, a_rwnd); + } + asoc->peers_rwnd = a_rwnd; + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + /* clean up */ + if ((asoc->stream_queue_cnt == 1) && + ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || + (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED)) && + (asoc->locked_on_sending) + ) { + struct sctp_stream_queue_pending *sp; + + /* + * I may be in a state where we got all across.. but + * cannot write more due to a shutdown... we abort + * since the user did not indicate EOR in this case. 
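compare_with_wrap(), used throughout this function (here to decide whether last_acked_seq has passed fast_recovery_tsn), is mod-2^32 serial-number comparison. A standalone stand-in with the same effect for in-window values; the kernel's own helper takes an explicit MAX_TSN argument and is written differently:

    #include <stdint.h>

    /* True when TSN a is strictly newer than b under 32-bit wraparound;
     * e.g. tsn_after(2, 0xfffffffeU) is true although 2 < 0xfffffffe. */
    static int
    tsn_after(uint32_t a, uint32_t b)
    {
            return ((int32_t)(a - b) > 0);
    }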
+ */ + sp = TAILQ_LAST(&((asoc->locked_on_sending)->outqueue), + sctp_streamhead); + if ((sp) && (sp->length == 0)) { + asoc->locked_on_sending = NULL; + if (sp->msg_is_complete) { + asoc->stream_queue_cnt--; + } else { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + asoc->stream_queue_cnt--; + } + } + } + if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) && + (asoc->stream_queue_cnt == 0)) { + if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { + /* Need to abort here */ + struct mbuf *oper; + + abort_out_now: + *abort_now = 1; + /* XXX */ + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + sizeof(uint32_t); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_31); + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_31; + sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_RESPONSE_TO_USER_REQ, oper, SCTP_SO_NOT_LOCKED); + return; + } else { + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown(stcb, + stcb->asoc.primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + return; + } else if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) && + (asoc->stream_queue_cnt == 0)) { + if (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT) { + goto abort_out_now; + } + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_send_shutdown_ack(stcb, + stcb->asoc.primary_destination); + sctp_stop_timers_for_shutdown(stcb); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, + stcb->sctp_ep, stcb, asoc->primary_destination); + return; + } + } + /* + * Now here we are going to recycle net_ack for a different use... + * HEADS UP. + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->net_ack = 0; + } + + /* + * CMT DAC algorithm: If SACK DAC flag was 0, then no extra marking + * to be done. Setting this_sack_lowest_newack to the cum_ack will + * automatically ensure that. + */ + if ((asoc->sctp_cmt_on_off == 1) && + SCTP_BASE_SYSCTL(sctp_cmt_use_dac) && + (cmt_dac_flag == 0)) { + this_sack_lowest_newack = cum_ack; + } + if ((num_seg > 0) || (num_nr_seg > 0)) { + sctp_strike_gap_ack_chunks(stcb, asoc, biggest_tsn_acked, + biggest_tsn_newly_acked, this_sack_lowest_newack, accum_moved); + } + /* JRS - Use the congestion control given in the CC module */ + asoc->cc_functions.sctp_cwnd_update_after_fr(stcb, asoc); + + /****************************************************************** + * Here we do the stuff with ECN Nonce checking. + * We basically check to see if the nonce sum flag was incorrect + * or if resynchronization needs to be done. Also if we catch a + * misbehaving receiver we give him the kick. 
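The operational-error mbuf assembled above is a standard SCTP error cause: a 4-byte TLV header (cause code and length) followed here by one 32-bit location code. A userland-flavored sketch of that layout; struct and function names are illustrative, and htons/htonl come from <arpa/inet.h> rather than the kernel's byte-order machinery:

    #include <stdint.h>
    #include <arpa/inet.h>

    struct err_cause {
            uint16_t cause_code;    /* e.g. a user-initiated-abort cause */
            uint16_t cause_length;  /* header plus info, in bytes */
            uint32_t location;      /* SCTP_FROM_* + SCTP_LOC_* breadcrumb */
    };

    static void
    fill_cause(struct err_cause *ec, uint16_t code, uint32_t loc)
    {
            ec->cause_code = htons(code);
            ec->cause_length = htons((uint16_t)sizeof(*ec));  /* 8 bytes */
            ec->location = htonl(loc);
    }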
+ ******************************************************/ + + if (asoc->ecn_nonce_allowed) { + if (asoc->nonce_sum_check) { + if (nonce_sum_flag != ((asoc->nonce_sum_expect_base + ecn_seg_sums) & SCTP_SACK_NONCE_SUM)) { + if (asoc->nonce_wait_for_ecne == 0) { + struct sctp_tmit_chunk *lchk; + + lchk = TAILQ_FIRST(&asoc->send_queue); + asoc->nonce_wait_for_ecne = 1; + if (lchk) { + asoc->nonce_wait_tsn = lchk->rec.data.TSN_seq; + } else { + asoc->nonce_wait_tsn = asoc->sending_seq; + } + } else { + if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_wait_tsn, MAX_TSN) || + (asoc->last_acked_seq == asoc->nonce_wait_tsn)) { + /* + * Misbehaving peer. We need + * to react to this guy + */ + asoc->ecn_allowed = 0; + asoc->ecn_nonce_allowed = 0; + } + } + } + } else { + /* See if Resynchronization Possible */ + if (compare_with_wrap(asoc->last_acked_seq, asoc->nonce_resync_tsn, MAX_TSN)) { + asoc->nonce_sum_check = 1; + /* + * now we must calculate what the base is. + * We do this based on two things: we know + * the totals for all the segments + * gap-acked in the SACK; it's stored in + * ecn_seg_sums. We also know the SACK's + * nonce sum; it's in nonce_sum_flag. So we + * can build a truth table to back-calculate + * the new value of + * asoc->nonce_sum_expect_base: + * + * SACK-flag-Value Seg-Sums Base + * 0 0 0 + * 1 0 1 + * 0 1 1 + * 1 1 0 + */ + asoc->nonce_sum_expect_base = (ecn_seg_sums ^ nonce_sum_flag) & SCTP_SACK_NONCE_SUM; + } + } + } + /* Now are we exiting loss recovery? */ + if (will_exit_fast_recovery) { + /* Ok, we must exit fast recovery */ + asoc->fast_retran_loss_recovery = 0; + } + if ((asoc->sat_t3_loss_recovery) && + ((compare_with_wrap(asoc->last_acked_seq, asoc->sat_t3_recovery_tsn, + MAX_TSN) || + (asoc->last_acked_seq == asoc->sat_t3_recovery_tsn)))) { + /* end satellite t3 loss recovery */ + asoc->sat_t3_loss_recovery = 0; + } + /* + * CMT Fast recovery + */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if (net->will_exit_fast_recovery) { + /* Ok, we must exit fast recovery */ + net->fast_retran_loss_recovery = 0; + } + } + + /* Adjust and set the new rwnd value */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd_set(SCTP_SET_PEER_RWND_VIA_SACK, + asoc->peers_rwnd, asoc->total_flight, (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)), a_rwnd); + } + asoc->peers_rwnd = sctp_sbspace_sub(a_rwnd, + (uint32_t) (asoc->total_flight + (asoc->total_flight_count * SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)))); + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + if (asoc->peers_rwnd > old_rwnd) { + win_probe_recovery = 1; + } + /* + * Now we must setup so we have a timer up for anyone with + * outstanding data. + */ + done_once = 0; +again: + j = 0; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if (win_probe_recovery && (net->window_probe)) { + win_probe_recovered = 1; + /*- + * Find first chunk that was used with + * window probe and clear the event. Put + * it back into the send queue as if it has + * not been sent.
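The rwnd arithmetic a few lines up compresses to: take the peer's advertised window, subtract everything still counted in flight plus a per-chunk overhead allowance, floor at zero, and treat a result under the silly-window threshold as zero. As a standalone function (names illustrative; in the kernel sctp_sbspace_sub() already provides the underflow-safe subtraction):

    #include <stdint.h>

    static uint32_t
    effective_peer_rwnd(uint32_t a_rwnd, uint32_t flight, uint32_t flight_cnt,
        uint32_t per_chunk_oh, uint32_t sws_threshold)
    {
            uint32_t used, rwnd;

            used = flight + flight_cnt * per_chunk_oh;
            rwnd = (a_rwnd > used) ? (a_rwnd - used) : 0;
            return ((rwnd < sws_threshold) ? 0 : rwnd);
    }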
+ */ + TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { + if (tp1->window_probe) { + sctp_window_probe_recovery(stcb, asoc, net, tp1); + break; + } + } + } + if (net->flight_size) { + j++; + if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net); + } + if (net->window_probe) { + net->window_probe = 0; + } + } else { + if (net->window_probe) { + /* + * In window probes we must assure a timer + * is still running there + */ + if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, net); + + } + } else if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_22); + } + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpidsck4); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net, + SCTP_FROM_SCTP_INDATA + SCTP_LOC_23); + } + } + } + } + if ((j == 0) && + (!TAILQ_EMPTY(&asoc->sent_queue)) && + (asoc->sent_queue_retran_cnt == 0) && + (win_probe_recovered == 0) && + (done_once == 0)) { + /* + * huh, this should not happen unless all packets are + * PR-SCTP and marked to skip of course. + */ + if (sctp_fs_audit(asoc)) { + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + net->flight_size = 0; + } + asoc->total_flight = 0; + asoc->total_flight_count = 0; + asoc->sent_queue_retran_cnt = 0; + TAILQ_FOREACH(tp1, &asoc->sent_queue, sctp_next) { + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + sctp_flight_size_increase(tp1); + sctp_total_flight_increase(stcb, tp1); + } else if (tp1->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(asoc->sent_queue_retran_cnt); + } + } + } + done_once = 1; + goto again; + } + /*********************************************/ + /* Here we perform PR-SCTP procedures */ + /* (section 4.2) */ + /*********************************************/ + /* C1. update advancedPeerAckPoint */ + if (compare_with_wrap(cum_ack, asoc->advanced_peer_ack_point, MAX_TSN)) { + asoc->advanced_peer_ack_point = cum_ack; + } + /* C2. try to further move advancedPeerAckPoint ahead */ + if ((asoc->peer_supports_prsctp) && (asoc->pr_sctp_cnt > 0)) { + struct sctp_tmit_chunk *lchk; + uint32_t old_adv_peer_ack_point; + + old_adv_peer_ack_point = asoc->advanced_peer_ack_point; + lchk = sctp_try_advance_peer_ack_point(stcb, asoc); + /* C3. See if we need to send a Fwd-TSN */ + if (compare_with_wrap(asoc->advanced_peer_ack_point, cum_ack, + MAX_TSN)) { + /* + * ISSUE with ECN, see FWD-TSN processing for notes + * on issues that will occur when the ECN NONCE + * stuff is put into SCTP for cross checking. 
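The sctp_fs_audit() recovery path above rebuilds the flight accounting from the sent queue after zeroing it. A condensed model of that recount, with a hypothetical list node in place of sctp_tmit_chunk and a placeholder value for the SCTP_DATAGRAM_RESEND threshold:

    #include <stdint.h>
    #include <stddef.h>

    struct tchunk {
            int sent_state;         /* below RESEND: unacked, in flight */
            uint32_t book_size;
            struct tchunk *next;
    };
    #define ST_RESEND 4             /* placeholder for SCTP_DATAGRAM_RESEND */

    static void
    rebuild_flight(struct tchunk *sent_queue, uint32_t *flight, uint32_t *retran)
    {
            struct tchunk *c;

            *flight = 0;
            *retran = 0;
            for (c = sent_queue; c != NULL; c = c->next) {
                    if (c->sent_state < ST_RESEND)
                            *flight += c->book_size;  /* still in flight */
                    else if (c->sent_state == ST_RESEND)
                            (*retran)++;              /* queued for retransmit */
            }
    }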
+ */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xee, cum_ack, asoc->advanced_peer_ack_point, + old_adv_peer_ack_point); + } + if (compare_with_wrap(asoc->advanced_peer_ack_point, old_adv_peer_ack_point, + MAX_TSN)) { + + send_forward_tsn(stcb, asoc); + /* + * ECN Nonce: Disable Nonce Sum check when + * FWD TSN is sent and store resync tsn + */ + asoc->nonce_sum_check = 0; + asoc->nonce_resync_tsn = asoc->advanced_peer_ack_point; + } else if (lchk) { + /* try to FR fwd-tsn's that get lost too */ + if (lchk->rec.data.fwd_tsn_cnt >= 3) { + send_forward_tsn(stcb, asoc); + } + } + } + if (lchk) { + /* Assure a timer is up */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, lchk->whoTo); + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SACK_RWND_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_SACK_RWND_UPDATE, + a_rwnd, + stcb->asoc.peers_rwnd, + stcb->asoc.total_flight, + stcb->asoc.total_output_queue_size); + } +} + +void +sctp_update_acked(struct sctp_tcb *stcb, struct sctp_shutdown_chunk *cp, + struct sctp_nets *netp, int *abort_flag) +{ + /* Copy cum-ack */ + uint32_t cum_ack, a_rwnd; + + cum_ack = ntohl(cp->cumulative_tsn_ack); + /* Arrange so a_rwnd does NOT change */ + a_rwnd = stcb->asoc.peers_rwnd + stcb->asoc.total_flight; + + /* Now call the express sack handling */ + sctp_express_handle_sack(stcb, cum_ack, a_rwnd, 0, abort_flag); +} + +static void +sctp_kick_prsctp_reorder_queue(struct sctp_tcb *stcb, + struct sctp_stream_in *strmin) +{ + struct sctp_queued_to_read *ctl, *nctl; + struct sctp_association *asoc; + uint16_t tt; + + asoc = &stcb->asoc; + tt = strmin->last_sequence_delivered; + /* + * First deliver anything prior to and including the stream no that + * came in + */ + ctl = TAILQ_FIRST(&strmin->inqueue); + while (ctl) { + nctl = TAILQ_NEXT(ctl, next); + if (compare_with_wrap(tt, ctl->sinfo_ssn, MAX_SEQ) || + (tt == ctl->sinfo_ssn)) { + /* this is deliverable now */ + TAILQ_REMOVE(&strmin->inqueue, ctl, next); + /* subtract pending on streams */ + asoc->size_on_all_streams -= ctl->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + /* deliver it to at least the delivery-q */ + if (stcb->sctp_socket) { + sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + ctl, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); + } + } else { + /* no more delivery now. */ + break; + } + ctl = nctl; + } + /* + * now we must deliver things in queue the normal way if any are + * now ready. 
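Two details worth flagging here: sctp_update_acked() passes a_rwnd = peers_rwnd + total_flight so that the express handler's later subtraction of the flight leaves the peer's window unchanged, and the reorder-queue kick that follows delivers only strictly consecutive 16-bit stream sequence numbers. The SSN test reduces to unsigned wraparound arithmetic:

    #include <stdint.h>

    /* Deliverable in order iff it carries exactly the next expected SSN;
     * 16-bit unsigned wrap handles the rollover without a special case. */
    static int
    ssn_is_next(uint16_t last_delivered, uint16_t ssn)
    {
            return ((uint16_t)(last_delivered + 1) == ssn);
    }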
+ */ + tt = strmin->last_sequence_delivered + 1; + ctl = TAILQ_FIRST(&strmin->inqueue); + while (ctl) { + nctl = TAILQ_NEXT(ctl, next); + if (tt == ctl->sinfo_ssn) { + /* this is deliverable now */ + TAILQ_REMOVE(&strmin->inqueue, ctl, next); + /* subtract pending on streams */ + asoc->size_on_all_streams -= ctl->length; + sctp_ucount_decr(asoc->cnt_on_all_streams); + /* deliver it to at least the delivery-q */ + strmin->last_sequence_delivered = ctl->sinfo_ssn; + if (stcb->sctp_socket) { + sctp_mark_non_revokable(asoc, ctl->sinfo_tsn); + sctp_add_to_readq(stcb->sctp_ep, stcb, + ctl, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_HELD, SCTP_SO_NOT_LOCKED); + + } + tt = strmin->last_sequence_delivered + 1; + } else { + break; + } + ctl = nctl; + } +} + +static void +sctp_flush_reassm_for_str_seq(struct sctp_tcb *stcb, + struct sctp_association *asoc, + uint16_t stream, uint16_t seq) +{ + struct sctp_tmit_chunk *chk, *at; + + if (!TAILQ_EMPTY(&asoc->reasmqueue)) { + /* For each one on here see if we need to toss it */ + /* + * For now large messages held on the reasmqueue that are + * complete will be tossed too. We could in theory do more + * work to spin through and stop after dumping one msg aka + * seeing the start of a new msg at the head, and call the + * delivery function... to see if it can be delivered... But + * for now we just dump everything on the queue. + */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + while (chk) { + at = TAILQ_NEXT(chk, sctp_next); + /* + * Do not toss it if on a different stream or marked + * for unordered delivery in which case the stream + * sequence number has no meaning. + */ + if ((chk->rec.data.stream_number != stream) || + ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == SCTP_DATA_UNORDERED)) { + chk = at; + continue; + } + if (chk->rec.data.stream_seq == seq) { + /* It needs to be tossed */ + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + if (compare_with_wrap(chk->rec.data.TSN_seq, + asoc->tsn_last_delivered, MAX_TSN)) { + asoc->tsn_last_delivered = + chk->rec.data.TSN_seq; + asoc->str_of_pdapi = + chk->rec.data.stream_number; + asoc->ssn_of_pdapi = + chk->rec.data.stream_seq; + asoc->fragment_flags = + chk->rec.data.rcv_flags; + } + asoc->size_on_reasm_queue -= chk->send_size; + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + + /* Clear up any stream problem */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != + SCTP_DATA_UNORDERED && + (compare_with_wrap(chk->rec.data.stream_seq, + asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered, + MAX_SEQ))) { + /* + * We must dump forward this streams + * sequence number if the chunk is + * not unordered that is being + * skipped. There is a chance that + * if the peer does not include the + * last fragment in its FWD-TSN we + * WILL have a problem here since + * you would have a partial chunk in + * queue that may not be + * deliverable. Also if a Partial + * delivery API as started the user + * may get a partial chunk. The next + * read returning a new chunk... + * really ugly but I see no way + * around it! Maybe a notify?? 
+ */ + asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = + chk->rec.data.stream_seq; + } + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk); + } else if (compare_with_wrap(chk->rec.data.stream_seq, seq, MAX_SEQ)) { + /* + * If the stream_seq is > than the purging + * one, we are done + */ + break; + } + chk = at; + } + } +} + + +void +sctp_handle_forward_tsn(struct sctp_tcb *stcb, + struct sctp_forward_tsn_chunk *fwd, + int *abort_flag, struct mbuf *m, int offset) +{ + /* + * ISSUES that MUST be fixed for ECN! When we are the sender of the + * forward TSN, when the SACK comes back that acknowledges the + * FWD-TSN we must reset the NONCE sum to match correctly. This will + * get quite tricky since we may have sent more data interveneing + * and must carefully account for what the SACK says on the nonce + * and any gaps that are reported. This work will NOT be done here, + * but I note it here since it is really related to PR-SCTP and + * FWD-TSN's + */ + + /* The pr-sctp fwd tsn */ + /* + * here we will perform all the data receiver side steps for + * processing FwdTSN, as required in by pr-sctp draft: + * + * Assume we get FwdTSN(x): + * + * 1) update local cumTSN to x 2) try to further advance cumTSN to x + + * others we have 3) examine and update re-ordering queue on + * pr-in-streams 4) clean up re-assembly queue 5) Send a sack to + * report where we are. + */ + struct sctp_association *asoc; + uint32_t new_cum_tsn, gap; + unsigned int i, fwd_sz, cumack_set_flag, m_size; + uint32_t str_seq; + struct sctp_stream_in *strm; + struct sctp_tmit_chunk *chk, *at; + struct sctp_queued_to_read *ctl, *sv; + + cumack_set_flag = 0; + asoc = &stcb->asoc; + if ((fwd_sz = ntohs(fwd->ch.chunk_length)) < sizeof(struct sctp_forward_tsn_chunk)) { + SCTPDBG(SCTP_DEBUG_INDATA1, + "Bad size too small/big fwd-tsn\n"); + return; + } + m_size = (stcb->asoc.mapping_array_size << 3); + /*************************************************************/ + /* 1. Here we update local cumTSN and shift the bitmap array */ + /*************************************************************/ + new_cum_tsn = ntohl(fwd->new_cumulative_tsn); + + if (compare_with_wrap(asoc->cumulative_tsn, new_cum_tsn, MAX_TSN) || + asoc->cumulative_tsn == new_cum_tsn) { + /* Already got there ... */ + return; + } + /* + * now we know the new TSN is more advanced, let's find the actual + * gap + */ + SCTP_CALC_TSN_TO_GAP(gap, new_cum_tsn, asoc->mapping_array_base_tsn); + asoc->cumulative_tsn = new_cum_tsn; + if (gap >= m_size) { + if ((long)gap > sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv)) { + struct mbuf *oper; + + /* + * out of range (of single byte chunks in the rwnd I + * give out). This must be an attacker. 
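SCTP_CALC_TSN_TO_GAP, used above, turns a TSN into a bit offset relative to the mapping array's base TSN. With 32-bit unsigned arithmetic the wrap case needs no branch, so an equivalent one-liner (the kernel macro spells the two cases out explicitly):

    #include <stdint.h>

    static uint32_t
    tsn_to_gap(uint32_t tsn, uint32_t base_tsn)
    {
            return (tsn - base_tsn);  /* mod 2^32; correct across wraparound */
    }

A gap at or beyond the map's bit capacity (mapping_array_size << 3, as computed above) means the FWD-TSN jumped past the window the map can describe, which is why the code either resets the maps wholesale or aborts.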
+ */ + *abort_flag = 1; + oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 3 * sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) + + (sizeof(uint32_t) * 3); + ph = mtod(oper, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + ph->param_length = htons(SCTP_BUF_LEN(oper)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_INDATA + SCTP_LOC_33); + ippp++; + *ippp = asoc->highest_tsn_inside_map; + ippp++; + *ippp = new_cum_tsn; + } + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_INDATA + SCTP_LOC_33; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_PEER_FAULTY, oper, SCTP_SO_NOT_LOCKED); + return; + } + SCTP_STAT_INCR(sctps_fwdtsn_map_over); + + memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); + asoc->mapping_array_base_tsn = new_cum_tsn + 1; + asoc->highest_tsn_inside_map = new_cum_tsn; + + memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); + asoc->highest_tsn_inside_nr_map = new_cum_tsn; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(0, 3, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); + } + asoc->last_echo_tsn = asoc->highest_tsn_inside_map; + } else { + SCTP_TCB_LOCK_ASSERT(stcb); + for (i = 0; i <= gap; i++) { + if (!SCTP_IS_TSN_PRESENT(asoc->mapping_array, i) && + !SCTP_IS_TSN_PRESENT(asoc->nr_mapping_array, i)) { + SCTP_SET_TSN_PRESENT(asoc->nr_mapping_array, i); + if (compare_with_wrap(asoc->mapping_array_base_tsn + i, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + asoc->highest_tsn_inside_nr_map = asoc->mapping_array_base_tsn + i; + } + } + } + } + /*************************************************************/ + /* 2. Clear up re-assembly queue */ + /*************************************************************/ + /* + * First service it if pd-api is up, just in case we can progress it + * forward + */ + if (asoc->fragmented_delivery_inprogress) { + sctp_service_reassembly(stcb, asoc); + } + if (!TAILQ_EMPTY(&asoc->reasmqueue)) { + /* For each one on here see if we need to toss it */ + /* + * For now large messages held on the reasmqueue that are + * complete will be tossed too. We could in theory do more + * work to spin through and stop after dumping one msg aka + * seeing the start of a new msg at the head, and call the + * delivery function... to see if it can be delivered... But + * for now we just dump everything on the queue. 
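The SCTP_IS_TSN_PRESENT / SCTP_SET_TSN_PRESENT macros in the loop above are conventional bitmap probes over that gap offset. Their exact definitions live in an SCTP header outside this hunk; a typical shape would be:

    #include <stdint.h>

    /* Illustrative only; the kernel's definitions may differ in detail. */
    #define TSN_PRESENT(map, gap)  (((map)[(gap) >> 3] >> ((gap) & 0x07)) & 0x01)
    #define TSN_SET(map, gap)      ((map)[(gap) >> 3] |= (uint8_t)(0x01 << ((gap) & 0x07)))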
+ */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + while (chk) { + at = TAILQ_NEXT(chk, sctp_next); + if ((compare_with_wrap(new_cum_tsn, + chk->rec.data.TSN_seq, MAX_TSN)) || + (new_cum_tsn == chk->rec.data.TSN_seq)) { + /* It needs to be tossed */ + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + if (compare_with_wrap(chk->rec.data.TSN_seq, + asoc->tsn_last_delivered, MAX_TSN)) { + asoc->tsn_last_delivered = + chk->rec.data.TSN_seq; + asoc->str_of_pdapi = + chk->rec.data.stream_number; + asoc->ssn_of_pdapi = + chk->rec.data.stream_seq; + asoc->fragment_flags = + chk->rec.data.rcv_flags; + } + asoc->size_on_reasm_queue -= chk->send_size; + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + + /* Clear up any stream problem */ + if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) != + SCTP_DATA_UNORDERED && + (compare_with_wrap(chk->rec.data.stream_seq, + asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered, + MAX_SEQ))) { + /* + * We must dump forward this streams + * sequence number if the chunk is + * not unordered that is being + * skipped. There is a chance that + * if the peer does not include the + * last fragment in its FWD-TSN we + * WILL have a problem here since + * you would have a partial chunk in + * queue that may not be + * deliverable. Also if a Partial + * delivery API as started the user + * may get a partial chunk. The next + * read returning a new chunk... + * really ugly but I see no way + * around it! Maybe a notify?? + */ + asoc->strmin[chk->rec.data.stream_number].last_sequence_delivered = + chk->rec.data.stream_seq; + } + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk); + } else { + /* + * Ok we have gone beyond the end of the + * fwd-tsn's mark. + */ + break; + } + chk = at; + } + } + /*******************************************************/ + /* 3. Update the PR-stream re-ordering queues and fix */ + /* delivery issues as needed. */ + /*******************************************************/ + fwd_sz -= sizeof(*fwd); + if (m && fwd_sz) { + /* New method. */ + unsigned int num_str; + struct sctp_strseq *stseq, strseqbuf; + + offset += sizeof(*fwd); + + SCTP_INP_READ_LOCK(stcb->sctp_ep); + num_str = fwd_sz / sizeof(struct sctp_strseq); + for (i = 0; i < num_str; i++) { + uint16_t st; + + stseq = (struct sctp_strseq *)sctp_m_getptr(m, offset, + sizeof(struct sctp_strseq), + (uint8_t *) & strseqbuf); + offset += sizeof(struct sctp_strseq); + if (stseq == NULL) { + break; + } + /* Convert */ + st = ntohs(stseq->stream); + stseq->stream = st; + st = ntohs(stseq->sequence); + stseq->sequence = st; + + /* now process */ + + /* + * Ok we now look for the stream/seq on the read + * queue where its not all delivered. If we find it + * we transmute the read entry into a PDI_ABORTED. + */ + if (stseq->stream >= asoc->streamincnt) { + /* screwed up streams, stop! */ + break; + } + if ((asoc->str_of_pdapi == stseq->stream) && + (asoc->ssn_of_pdapi == stseq->sequence)) { + /* + * If this is the one we were partially + * delivering now then we no longer are. + * Note this will change with the reassembly + * re-write. 
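The stream/sequence walk that follows uses the sctp_m_getptr() idiom: request a contiguous view of the next fixed-size record, with a caller-supplied bounce buffer (strseqbuf) covering the case where the record straddles mbufs. A flat-buffer analogue of the same access pattern, names hypothetical:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    struct strseq { uint16_t stream; uint16_t sequence; };  /* wire record */

    static const struct strseq *
    next_strseq(const uint8_t *buf, size_t len, size_t *off, struct strseq *tmp)
    {
            if (*off + sizeof(*tmp) > len)
                    return (NULL);            /* ran off the end of the chunk */
            memcpy(tmp, buf + *off, sizeof(*tmp));  /* bounce-buffer copy */
            *off += sizeof(*tmp);
            return (tmp);
    }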
+ */ + asoc->fragmented_delivery_inprogress = 0; + } + sctp_flush_reassm_for_str_seq(stcb, asoc, stseq->stream, stseq->sequence); + TAILQ_FOREACH(ctl, &stcb->sctp_ep->read_queue, next) { + if ((ctl->sinfo_stream == stseq->stream) && + (ctl->sinfo_ssn == stseq->sequence)) { + str_seq = (stseq->stream << 16) | stseq->sequence; + ctl->end_added = 1; + ctl->pdapi_aborted = 1; + sv = stcb->asoc.control_pdapi; + stcb->asoc.control_pdapi = ctl; + sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, + stcb, + SCTP_PARTIAL_DELIVERY_ABORTED, + (void *)&str_seq, + SCTP_SO_NOT_LOCKED); + stcb->asoc.control_pdapi = sv; + break; + } else if ((ctl->sinfo_stream == stseq->stream) && + (compare_with_wrap(ctl->sinfo_ssn, stseq->sequence, MAX_SEQ))) { + /* We are past our victim SSN */ + break; + } + } + strm = &asoc->strmin[stseq->stream]; + if (compare_with_wrap(stseq->sequence, + strm->last_sequence_delivered, MAX_SEQ)) { + /* Update the sequence number */ + strm->last_sequence_delivered = + stseq->sequence; + } + /* now kick the stream the new way */ + /* sa_ignore NO_NULL_CHK */ + sctp_kick_prsctp_reorder_queue(stcb, strm); + } + SCTP_INP_READ_UNLOCK(stcb->sctp_ep); + } + /* + * Now slide thing forward. + */ + sctp_slide_mapping_arrays(stcb); + + if (!TAILQ_EMPTY(&asoc->reasmqueue)) { + /* now lets kick out and check for more fragmented delivery */ + /* sa_ignore NO_NULL_CHK */ + sctp_deliver_reasm_check(stcb, &stcb->asoc); + } +} diff --git a/freebsd/sys/netinet/sctp_indata.h b/freebsd/sys/netinet/sctp_indata.h new file mode 100644 index 00000000..a231ecaf --- /dev/null +++ b/freebsd/sys/netinet/sctp_indata.h @@ -0,0 +1,129 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_indata.h,v 1.9 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_indata_h__ +#define __sctp_indata_h__ + +#if defined(_KERNEL) || defined(__Userspace__) + +struct sctp_queued_to_read * +sctp_build_readq_entry(struct sctp_tcb *stcb, + struct sctp_nets *net, + uint32_t tsn, uint32_t ppid, + uint32_t context, uint16_t stream_no, + uint16_t stream_seq, uint8_t flags, + struct mbuf *dm); + + +#define sctp_build_readq_entry_mac(_ctl, in_it, a, net, tsn, ppid, context, stream_no, stream_seq, flags, dm) do { \ + if (_ctl) { \ + atomic_add_int(&((net)->ref_count), 1); \ + (_ctl)->sinfo_stream = stream_no; \ + (_ctl)->sinfo_ssn = stream_seq; \ + (_ctl)->sinfo_flags = (flags << 8); \ + (_ctl)->sinfo_ppid = ppid; \ + (_ctl)->sinfo_context = a; \ + (_ctl)->sinfo_timetolive = 0; \ + (_ctl)->sinfo_tsn = tsn; \ + (_ctl)->sinfo_cumtsn = tsn; \ + (_ctl)->sinfo_assoc_id = sctp_get_associd((in_it)); \ + (_ctl)->length = 0; \ + (_ctl)->held_length = 0; \ + (_ctl)->whoFrom = net; \ + (_ctl)->data = dm; \ + (_ctl)->tail_mbuf = NULL; \ + (_ctl)->aux_data = NULL; \ + (_ctl)->stcb = (in_it); \ + (_ctl)->port_from = (in_it)->rport; \ + (_ctl)->spec_flags = 0; \ + (_ctl)->do_not_ref_stcb = 0; \ + (_ctl)->end_added = 0; \ + (_ctl)->pdapi_aborted = 0; \ + (_ctl)->some_taken = 0; \ + } \ +} while (0) + + + +struct mbuf * +sctp_build_ctl_nchunk(struct sctp_inpcb *inp, + struct sctp_sndrcvinfo *sinfo); + +char * +sctp_build_ctl_cchunk(struct sctp_inpcb *inp, + int *control_len, + struct sctp_sndrcvinfo *sinfo); + +void sctp_set_rwnd(struct sctp_tcb *, struct sctp_association *); + +uint32_t +sctp_calc_rwnd(struct sctp_tcb *stcb, struct sctp_association *asoc); + +void +sctp_express_handle_sack(struct sctp_tcb *stcb, uint32_t cumack, + uint32_t rwnd, int nonce_sum_flag, int *abort_now); + +void +sctp_handle_sack(struct mbuf *m, int offset_seg, int offset_dup, + struct sctp_tcb *stcb, struct sctp_nets *net_from, + uint16_t num_seg, uint16_t num_nr_seg, uint16_t num_dup, + int *abort_now, uint8_t flags, + uint32_t cum_ack, uint32_t rwnd); + +/* draft-ietf-tsvwg-usctp */ +void +sctp_handle_forward_tsn(struct sctp_tcb *, + struct sctp_forward_tsn_chunk *, int *, struct mbuf *, int); + +struct sctp_tmit_chunk * + sctp_try_advance_peer_ack_point(struct sctp_tcb *, struct sctp_association *); + +void sctp_service_queues(struct sctp_tcb *, struct sctp_association *); + +void +sctp_update_acked(struct sctp_tcb *, struct sctp_shutdown_chunk *, + struct sctp_nets *, int *); + +int +sctp_process_data(struct mbuf **, int, int *, int, struct sctphdr *, + struct sctp_inpcb *, struct sctp_tcb *, + struct sctp_nets *, uint32_t *); + +void sctp_slide_mapping_arrays(struct sctp_tcb *stcb); + +void sctp_sack_check(struct sctp_tcb *, int, int *); + +#endif +#endif diff --git a/freebsd/sys/netinet/sctp_input.c b/freebsd/sys/netinet/sctp_input.c new file mode 100644 index 00000000..080813b4 --- /dev/null +++ b/freebsd/sys/netinet/sctp_input.c @@ -0,0 +1,5965 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_input.c,v 1.27 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +static void +sctp_stop_all_cookie_timers(struct sctp_tcb *stcb) +{ + struct sctp_nets *net; + + /* + * This now not only stops all cookie timers it also stops any INIT + * timers as well. This will make sure that the timers are stopped + * in all collision cases. + */ + SCTP_TCB_LOCK_ASSERT(stcb); + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (net->rxt_timer.type == SCTP_TIMER_TYPE_COOKIE) { + sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, + stcb->sctp_ep, + stcb, + net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1); + } else if (net->rxt_timer.type == SCTP_TIMER_TYPE_INIT) { + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, + stcb->sctp_ep, + stcb, + net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2); + } + } +} + +/* INIT handler */ +static void +sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sctphdr *sh, + struct sctp_init_chunk *cp, struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id, uint16_t port) +{ + struct sctp_init *init; + struct mbuf *op_err; + uint32_t init_limit; + + SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n", + stcb); + if (stcb == NULL) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + goto outnow; + } + } + op_err = NULL; + init = &cp->init; + /* First are we accepting? */ + if ((inp->sctp_socket->so_qlimit == 0) && (stcb == NULL)) { + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_init: Abort, so_qlimit:%d\n", + inp->sctp_socket->so_qlimit); + /* + * FIX ME ?? What about TCP model and we have a + * match/restart case? Actually no fix is needed. the lookup + * will always find the existing assoc so stcb would not be + * NULL. It may be questionable to do this since we COULD + * just send back the INIT-ACK and hope that the app did + * accept()'s by the time the COOKIE was sent. But there is + * a price to pay for COOKIE generation and I don't want to + * pay it on the chance that the app will actually do some + * accepts(). 
The App just loses and should NOT be in this + * state :-) + */ + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { + /* Invalid length */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + /* validate parameters */ + if (init->initiate_tag == 0) { + /* protocol error... send abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { + /* invalid parameter... send abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + if (init->num_inbound_streams == 0) { + /* protocol error... send abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + if (init->num_outbound_streams == 0) { + /* protocol error... send abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(inp, stcb, m, iphlen, sh, op_err, + vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + init_limit = offset + ntohs(cp->ch.chunk_length); + if (sctp_validate_init_auth_params(m, offset + sizeof(*cp), + init_limit)) { + /* auth parameter(s) error... send abort */ + sctp_abort_association(inp, stcb, m, iphlen, sh, NULL, vrf_id, port); + if (stcb) + *abort_no_unlock = 1; + goto outnow; + } + /* send an INIT-ACK w/cookie */ + SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n"); + sctp_send_initiate_ack(inp, stcb, m, iphlen, offset, sh, cp, vrf_id, port, + ((stcb == NULL) ? SCTP_HOLDS_LOCK : SCTP_NOT_LOCKED)); +outnow: + if (stcb == NULL) { + SCTP_INP_RUNLOCK(inp); + } +} + +/* + * process peer "INIT/INIT-ACK" chunk returns value < 0 on error + */ + +int +sctp_is_there_unsent_data(struct sctp_tcb *stcb) +{ + int unsent_data = 0; + struct sctp_stream_queue_pending *sp; + struct sctp_stream_out *strq; + struct sctp_association *asoc; + + /* + * This function returns the number of streams that have true unsent + * data on them. Note that as it looks through it will clean up any + * places that have old data that has been sent but left at the top + * of the stream queue. + */ + asoc = &stcb->asoc; + SCTP_TCB_SEND_LOCK(stcb); + if (!TAILQ_EMPTY(&asoc->out_wheel)) { + /* Check to see if some data queued */ + TAILQ_FOREACH(strq, &asoc->out_wheel, next_spoke) { + is_there_another: + /* sa_ignore FREED_MEMORY */ + sp = TAILQ_FIRST(&strq->outqueue); + if (sp == NULL) { + continue; + } + if ((sp->msg_is_complete) && + (sp->length == 0) && + (sp->sender_all_done)) { + /* + * We are doing deferred cleanup. Last time + * through when we took all the data the + * sender_all_done was not set.
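The validation ladder above rejects an INIT on any of four grounds, each answered with the same invalid-mandatory-parameter abort. Collapsed into one predicate (a sketch; the threshold is the kernel's SCTP_MIN_RWND constant, passed in here rather than assumed):

    #include <stdint.h>

    static int
    init_params_ok(uint32_t initiate_tag, uint32_t a_rwnd,
        uint16_t num_inbound, uint16_t num_outbound, uint32_t min_rwnd)
    {
            if (initiate_tag == 0)          /* RFC 4960: tag must be non-zero */
                    return (0);
            if (a_rwnd < min_rwnd)          /* advertised window unusably small */
                    return (0);
            if (num_inbound == 0 || num_outbound == 0)  /* zero streams */
                    return (0);
            return (1);
    }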
+ */ + if (sp->put_last_out == 0) { + SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); + SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n", + sp->sender_all_done, + sp->length, + sp->msg_is_complete, + sp->put_last_out); + } + atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); + TAILQ_REMOVE(&strq->outqueue, sp, next); + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + } + sctp_free_a_strmoq(stcb, sp); + goto is_there_another; + } else { + unsent_data++; + continue; + } + } + } + SCTP_TCB_SEND_UNLOCK(stcb); + return (unsent_data); +} + +static int +sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + struct sctp_init *init; + struct sctp_association *asoc; + struct sctp_nets *lnet; + unsigned int i; + + init = &cp->init; + asoc = &stcb->asoc; + /* save off parameters */ + asoc->peer_vtag = ntohl(init->initiate_tag); + asoc->peers_rwnd = ntohl(init->a_rwnd); + if (!TAILQ_EMPTY(&asoc->nets)) { + /* update any ssthresh's that may have a default */ + TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { + lnet->ssthresh = asoc->peers_rwnd; + + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { + sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION); + } + } + } + SCTP_TCB_SEND_LOCK(stcb); + if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) { + unsigned int newcnt; + struct sctp_stream_out *outs; + struct sctp_stream_queue_pending *sp; + struct sctp_tmit_chunk *chk, *chk_next; + + /* abandon the upper streams */ + newcnt = ntohs(init->num_inbound_streams); + if (!TAILQ_EMPTY(&asoc->send_queue)) { + chk = TAILQ_FIRST(&asoc->send_queue); + while (chk) { + chk_next = TAILQ_NEXT(chk, sctp_next); + if (chk->rec.data.stream_number >= newcnt) { + TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); + asoc->send_queue_cnt--; + if (chk->data != NULL) { + sctp_free_bufspace(stcb, asoc, chk, 1); + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_UNSENT, chk, SCTP_SO_NOT_LOCKED); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + } + sctp_free_a_chunk(stcb, chk); + /* sa_ignore FREED_MEMORY */ + } + chk = chk_next; + } + } + if (asoc->strmout) { + for (i = newcnt; i < asoc->pre_open_streams; i++) { + outs = &asoc->strmout[i]; + sp = TAILQ_FIRST(&outs->outqueue); + while (sp) { + TAILQ_REMOVE(&outs->outqueue, sp, next); + asoc->stream_queue_cnt--; + sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, + stcb, SCTP_NOTIFY_DATAGRAM_UNSENT, + sp, SCTP_SO_NOT_LOCKED); + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + } + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + /* Free the chunk */ + sctp_free_a_strmoq(stcb, sp); + /* sa_ignore FREED_MEMORY */ + sp = TAILQ_FIRST(&outs->outqueue); + } + } + } + /* cut back the count */ + asoc->pre_open_streams = newcnt; + } + SCTP_TCB_SEND_UNLOCK(stcb); + asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams; + /* init tsn's */ + asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1; + /* EY - nr_sack: initialize highest tsn in nr_mapping_array */ + asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); + } + /* This is the next one we expect */ + asoc->str_reset_seq_in = asoc->asconf_seq_in + 1; + + 
asoc->mapping_array_base_tsn = ntohl(init->initial_tsn); + asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in; + asoc->last_echo_tsn = asoc->asconf_seq_in; + asoc->advanced_peer_ack_point = asoc->last_acked_seq; + /* open the requested streams */ + + if (asoc->strmin != NULL) { + /* Free the old ones */ + struct sctp_queued_to_read *ctl; + + for (i = 0; i < asoc->streamincnt; i++) { + ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue); + while (ctl) { + TAILQ_REMOVE(&asoc->strmin[i].inqueue, ctl, next); + sctp_free_remote_addr(ctl->whoFrom); + ctl->whoFrom = NULL; + sctp_m_freem(ctl->data); + ctl->data = NULL; + sctp_free_a_readq(stcb, ctl); + ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue); + } + } + SCTP_FREE(asoc->strmin, SCTP_M_STRMI); + } + asoc->streamincnt = ntohs(init->num_outbound_streams); + if (asoc->streamincnt > MAX_SCTP_STREAMS) { + asoc->streamincnt = MAX_SCTP_STREAMS; + } + SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * + sizeof(struct sctp_stream_in), SCTP_M_STRMI); + if (asoc->strmin == NULL) { + /* we didn't get memory for the streams! */ + SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n"); + return (-1); + } + for (i = 0; i < asoc->streamincnt; i++) { + asoc->strmin[i].stream_no = i; + asoc->strmin[i].last_sequence_delivered = 0xffff; + /* + * U-stream ranges will be set when the cookie is unpacked. + * Or for the INIT sender they are un set (if pr-sctp not + * supported) when the INIT-ACK arrives. + */ + TAILQ_INIT(&asoc->strmin[i].inqueue); + asoc->strmin[i].delivery_started = 0; + } + /* + * load_address_from_init will put the addresses into the + * association when the COOKIE is processed or the INIT-ACK is + * processed. Both types of COOKIE's existing and new call this + * routine. It will remove addresses that are no longer in the + * association (for the restarting case where addresses are + * removed). Up front when the INIT arrives we will discard it if it + * is a restart and new addresses have been added. 
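Note the 0xffff initialization of last_sequence_delivered above: with 16-bit unsigned wrap, the next expected SSN computes to 0, so the first message on each stream is deliverable without a special case. A two-line check:

    #include <stdint.h>
    #include <assert.h>

    int
    main(void)
    {
            uint16_t last_delivered = 0xffff;

            assert((uint16_t)(last_delivered + 1) == 0);  /* first SSN is 0 */
            return (0);
    }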
+ */ + /* sa_ignore MEMLEAK */ + return (0); +} + +/* + * INIT-ACK message processing/consumption returns value < 0 on error + */ +static int +sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, + struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id) +{ + struct sctp_association *asoc; + struct mbuf *op_err; + int retval, abort_flag; + uint32_t initack_limit; + int nat_friendly = 0; + + /* First verify that we have no illegal param's */ + abort_flag = 0; + op_err = NULL; + + op_err = sctp_arethere_unrecognized_parameters(m, + (offset + sizeof(struct sctp_init_chunk)), + &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly); + if (abort_flag) { + /* Send an abort and notify peer */ + sctp_abort_an_association(stcb->sctp_ep, stcb, SCTP_CAUSE_PROTOCOL_VIOLATION, op_err, SCTP_SO_NOT_LOCKED); + *abort_no_unlock = 1; + return (-1); + } + asoc = &stcb->asoc; + asoc->peer_supports_nat = (uint8_t) nat_friendly; + /* process the peer's parameters in the INIT-ACK */ + retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb, net); + if (retval < 0) { + return (retval); + } + initack_limit = offset + ntohs(cp->ch.chunk_length); + /* load all addresses */ + if ((retval = sctp_load_addresses_from_init(stcb, m, iphlen, + (offset + sizeof(struct sctp_init_chunk)), initack_limit, sh, + NULL))) { + /* Huh, we should abort */ + SCTPDBG(SCTP_DEBUG_INPUT1, + "Load addresses from INIT causes an abort %d\n", + retval); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + NULL, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + /* if the peer doesn't support asconf, flush the asconf queue */ + if (asoc->peer_supports_asconf == 0) { + struct sctp_asconf_addr *aparam; + + while (!TAILQ_EMPTY(&asoc->asconf_queue)) { + /* sa_ignore FREED_MEMORY */ + aparam = TAILQ_FIRST(&asoc->asconf_queue); + TAILQ_REMOVE(&asoc->asconf_queue, aparam, next); + SCTP_FREE(aparam, SCTP_M_ASC_ADDR); + } + } + stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, + stcb->asoc.local_hmacs); + if (op_err) { + sctp_queue_op_err(stcb, op_err); + /* queuing will steal away the mbuf chain to the out queue */ + op_err = NULL; + } + /* extract the cookie and queue it to "echo" it back... */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + net->error_count = 0; + + /* + * Cancel the INIT timer, We do this first before queueing the + * cookie. We always cancel at the primary to assue that we are + * canceling the timer started by the INIT which always goes to the + * primary. + */ + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, + asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); + + /* calculate the RTO */ + net->RTO = sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, sctp_align_safe_nocopy); + + retval = sctp_send_cookie_echo(m, offset, stcb, net); + if (retval < 0) { + /* + * No cookie, we probably should send a op error. But in any + * case if there is no cookie in the INIT-ACK, we can + * abandon the peer, its broke. 
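sctp_calculate_rto(), called above with the association's time_entered as the start timestamp, applies the RFC 4960 section 6.3.1 smoothing (the same Van Jacobson scheme TCP uses). The function itself lives in sctputil.c, outside this hunk; a minimal integer-millisecond sketch of the update rule, with illustrative names and the RFC's default 1s/60s clamps taken as assumptions:

    #include <stdint.h>

    struct rto_state {
            int have_srtt;          /* first measurement taken yet? */
            uint32_t srtt;          /* smoothed RTT, ms */
            uint32_t rttvar;        /* RTT variance, ms */
    };

    static uint32_t
    rto_update(struct rto_state *s, uint32_t rtt)
    {
            uint32_t delta, rto;

            if (!s->have_srtt) {
                    s->srtt = rtt;
                    s->rttvar = rtt / 2;
                    s->have_srtt = 1;
            } else {
                    delta = (s->srtt > rtt) ? (s->srtt - rtt) : (rtt - s->srtt);
                    s->rttvar = (3 * s->rttvar + delta) / 4;  /* beta = 1/4 */
                    s->srtt = (7 * s->srtt + rtt) / 8;        /* alpha = 1/8 */
            }
            rto = s->srtt + 4 * s->rttvar;
            if (rto < 1000)
                    rto = 1000;     /* RTO.Min, RFC default 1 s */
            if (rto > 60000)
                    rto = 60000;    /* RTO.Max, RFC default 60 s */
            return (rto);
    }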
+ */ + if (retval == -3) { + /* We abort with an error of missing mandatory param */ + op_err = + sctp_generate_invmanparam(SCTP_CAUSE_MISSING_PARAM); + if (op_err) { + /* + * Expand beyond to include the mandatory + * param cookie + */ + struct sctp_inv_mandatory_param *mp; + + SCTP_BUF_LEN(op_err) = + sizeof(struct sctp_inv_mandatory_param); + mp = mtod(op_err, + struct sctp_inv_mandatory_param *); + /* Subtract the reserved param */ + mp->length = + htons(sizeof(struct sctp_inv_mandatory_param) - 2); + mp->num_param = htonl(1); + mp->param = htons(SCTP_STATE_COOKIE); + mp->resv = 0; + } + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, + sh, op_err, 0, net->port); + *abort_no_unlock = 1; + } + return (retval); + } + return (0); +} + +static void +sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sockaddr_storage store; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + struct sctp_nets *r_net, *f_net; + struct timeval tv; + int req_prim = 0; + + if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) { + /* Invalid length */ + return; + } + sin = (struct sockaddr_in *)&store; + sin6 = (struct sockaddr_in6 *)&store; + + memset(&store, 0, sizeof(store)); + if (cp->heartbeat.hb_info.addr_family == AF_INET && + cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) { + sin->sin_family = cp->heartbeat.hb_info.addr_family; + sin->sin_len = cp->heartbeat.hb_info.addr_len; + sin->sin_port = stcb->rport; + memcpy(&sin->sin_addr, cp->heartbeat.hb_info.address, + sizeof(sin->sin_addr)); + } else if (cp->heartbeat.hb_info.addr_family == AF_INET6 && + cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) { + sin6->sin6_family = cp->heartbeat.hb_info.addr_family; + sin6->sin6_len = cp->heartbeat.hb_info.addr_len; + sin6->sin6_port = stcb->rport; + memcpy(&sin6->sin6_addr, cp->heartbeat.hb_info.address, + sizeof(sin6->sin6_addr)); + } else { + return; + } + r_net = sctp_findnet(stcb, (struct sockaddr *)sin); + if (r_net == NULL) { + SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n"); + return; + } + if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) && + (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) && + (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) { + /* + * If it's an HB and its random value is correct, we can + * confirm the destination. + */ + r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; + if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) { + stcb->asoc.primary_destination = r_net; + r_net->dest_state &= ~SCTP_ADDR_WAS_PRIMARY; + r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY; + f_net = TAILQ_FIRST(&stcb->asoc.nets); + if (f_net != r_net) { + /* + * first one on the list is NOT the primary; + * sctp_cmpaddr() is much more efficient if + * the primary is the first on the list, so + * make it so.
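Address confirmation above hinges on the peer echoing back the two random values embedded in the heartbeat we sent; a match on an UNCONFIRMED address is proof of reachability via that path. The check, reduced to a predicate with hypothetical names:

    #include <stdint.h>

    struct hb_sent { uint32_t random1, random2; };  /* stashed at send time */

    static int
    hb_ack_confirms(const struct hb_sent *sent, uint32_t echo1, uint32_t echo2,
        int addr_unconfirmed)
    {
            return (addr_unconfirmed &&
                sent->random1 == echo1 && sent->random2 == echo2);
    }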
+ */ + TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next); + TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next); + } + req_prim = 1; + } + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, + stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); + } + r_net->error_count = 0; + r_net->hb_responded = 1; + tv.tv_sec = cp->heartbeat.hb_info.time_value_1; + tv.tv_usec = cp->heartbeat.hb_info.time_value_2; + if (r_net->dest_state & SCTP_ADDR_NOT_REACHABLE) { + r_net->dest_state &= ~SCTP_ADDR_NOT_REACHABLE; + r_net->dest_state |= SCTP_ADDR_REACHABLE; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, + SCTP_HEARTBEAT_SUCCESS, (void *)r_net, SCTP_SO_NOT_LOCKED); + /* now was it the primary? if so restore */ + if (r_net->dest_state & SCTP_ADDR_WAS_PRIMARY) { + (void)sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, r_net); + } + } + /* + * JRS 5/14/07 - If CMT PF is on and the destination is in PF state, + * set the destination to active state and set the cwnd to one or + * two MTU's based on whether PF1 or PF2 is being used. If a T3 + * timer is running, for the destination, stop the timer because a + * PF-heartbeat was received. + */ + if ((stcb->asoc.sctp_cmt_on_off == 1) && + (stcb->asoc.sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, net, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_5); + } + net->dest_state &= ~SCTP_ADDR_PF; + net->cwnd = net->mtu * stcb->asoc.sctp_cmt_pf; + SCTPDBG(SCTP_DEBUG_INPUT1, "Destination %p moved from PF to reachable with cwnd %d.\n", + net, net->cwnd); + } + /* Now lets do a RTO with this */ + r_net->RTO = sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, sctp_align_safe_nocopy); + /* Mobility adaptation */ + if (req_prim) { + if ((sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE) || + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) && + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_PRIM_DELETED)) { + + sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_TIMER + SCTP_LOC_7); + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + sctp_assoc_immediate_retrans(stcb, + stcb->asoc.primary_destination); + } + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE)) { + sctp_move_chunks_from_net(stcb, + stcb->asoc.deleted_primary); + } + sctp_delete_prim_timer(stcb->sctp_ep, stcb, + stcb->asoc.deleted_primary); + } + } +} + +static int +sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) +{ + /* + * return 0 means we want you to proceed with the abort non-zero + * means no abort processing + */ + struct sctpasochead *head; + + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { + /* generate a new vtag and send init */ + LIST_REMOVE(stcb, sctp_asocs); + stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; + /* + * put it in the bucket in the vtag hash of assoc's for the + * system + */ + LIST_INSERT_HEAD(head, stcb, sctp_asocs); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + return (1); + } + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + /* + * treat like a case where the cookie expired i.e.: - dump + * current cookie. - generate a new vtag. - resend init. 
+ */ + /* generate a new vtag and send init */ + LIST_REMOVE(stcb, sctp_asocs); + stcb->asoc.state &= ~SCTP_STATE_COOKIE_ECHOED; + stcb->asoc.state |= SCTP_STATE_COOKIE_WAIT; + sctp_stop_all_cookie_timers(stcb); + sctp_toss_old_cookies(stcb, &stcb->asoc); + stcb->asoc.my_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; + /* + * put it in the bucket in the vtag hash of assoc's for the + * system + */ + LIST_INSERT_HEAD(head, stcb, sctp_asocs); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + return (1); + } + return (0); +} + +static int +sctp_handle_nat_missing_state(struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + /* + * return 0 means we want you to proceed with the abort non-zero + * means no abort processing + */ + if (stcb->asoc.peer_supports_auth == 0) { + SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n"); + return (0); + } + sctp_asconf_send_nat_state_update(stcb, net); + return (1); +} + + +static void +sctp_handle_abort(struct sctp_abort_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + uint16_t len; + + SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n"); + if (stcb == NULL) + return; + + len = ntohs(cp->ch.chunk_length); + if (len > sizeof(struct sctp_chunkhdr)) { + /* + * Need to check the cause codes for our two magic nat + * aborts which don't kill the assoc necessarily. + */ + struct sctp_abort_chunk *cpnext; + struct sctp_missing_nat_state *natc; + uint16_t cause; + + cpnext = cp; + cpnext++; + natc = (struct sctp_missing_nat_state *)cpnext; + cause = ntohs(natc->cause); + if (cause == SCTP_CAUSE_NAT_COLLIDING_STATE) { + SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", + cp->ch.chunk_flags); + if (sctp_handle_nat_colliding_state(stcb)) { + return; + } + } else if (cause == SCTP_CAUSE_NAT_MISSING_STATE) { + SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", + cp->ch.chunk_flags); + if (sctp_handle_nat_missing_state(stcb, net)) { + return; + } + } + } + /* stop any receive timers */ + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); + /* notify user of the abort and clean up... 
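The NAT-cause peek in sctp_handle_abort() relies on error causes starting immediately after the ABORT chunk header, so incrementing a chunk-typed pointer by one lands on the first cause (the cpnext = cp; cpnext++ idiom above). The same walk over a flat buffer, illustrative types only:

    #include <stdint.h>
    #include <stddef.h>

    struct chunk_hdr { uint8_t type; uint8_t flags; uint16_t length; };
    struct cause_hdr { uint16_t code; uint16_t length; };

    static const struct cause_hdr *
    first_cause(const struct chunk_hdr *ch, uint16_t chunk_len)
    {
            if (chunk_len <= sizeof(struct chunk_hdr))
                    return (NULL);  /* header only: no causes appended */
            return ((const struct cause_hdr *)(ch + 1));
    }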
*/ + sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED); + /* free the tcb */ +#if defined(SCTP_PANIC_ON_ABORT) + printf("stcb:%p state:%d rport:%d net:%p\n", + stcb, stcb->asoc.state, stcb->rport, net); + if (!(stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { + panic("Received an ABORT"); + } else { + printf("No panic its in state %x closed\n", stcb->asoc.state); + } +#endif + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } +#ifdef SCTP_ASOCLOG_OF_TSNS + sctp_print_out_track_log(stcb); +#endif +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n"); +} + +static void +sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag) +{ + struct sctp_association *asoc; + int some_on_streamwheel; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_shutdown: handling SHUTDOWN\n"); + if (stcb == NULL) + return; + asoc = &stcb->asoc; + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + return; + } + if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { + /* Shutdown NOT the expected size */ + return; + } else { + sctp_update_acked(stcb, cp, net, abort_flag); + if (*abort_flag) { + return; + } + } + if (asoc->control_pdapi) { + /* + * With a normal shutdown we assume the end of last record. 
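/*
 * The refcnt/unlock/relock sequence in sctp_handle_abort() above
 * recurs all through this file whenever the socket lock must be
 * taken while the TCB lock is held: pin the TCB with a reference,
 * drop its lock, take the socket lock (the outer lock in the
 * canonical order), retake the TCB lock, drop the reference, then
 * re-check state.  A user-space schematic of the same idiom using
 * pthreads; names invented here, not the SCTP_* macros:
 */
#include <pthread.h>
#include <stdatomic.h>

struct tcb_sketch {
	pthread_mutex_t lock;	/* inner lock, held on entry */
	atomic_int refcnt;
	int closed;
};

static int
lock_outer_then_inner(pthread_mutex_t *outer, struct tcb_sketch *t)
{
	atomic_fetch_add(&t->refcnt, 1);	/* t cannot vanish while unlocked */
	pthread_mutex_unlock(&t->lock);		/* release the inner lock first */
	pthread_mutex_lock(outer);		/* outer lock, canonical order */
	pthread_mutex_lock(&t->lock);		/* reacquire the inner lock */
	atomic_fetch_sub(&t->refcnt, 1);
	/* the object may have been torn down while unlocked */
	return (t->closed ? -1 : 0);
}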
+ */ + SCTP_INP_READ_LOCK(stcb->sctp_ep); + asoc->control_pdapi->end_added = 1; + asoc->control_pdapi->pdapi_aborted = 1; + asoc->control_pdapi = NULL; + SCTP_INP_READ_UNLOCK(stcb->sctp_ep); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + /* goto SHUTDOWN_RECEIVED state to block new requests */ + if (stcb->sctp_socket) { + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_RECEIVED); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + /* + * notify upper layer that peer has initiated a + * shutdown + */ + sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + + /* reset time */ + (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); + } + } + if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) { + /* + * stop the shutdown timer, since we WILL move to + * SHUTDOWN-ACK-SENT. + */ + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); + } + /* Now is there unsent data on a stream somewhere? */ + some_on_streamwheel = sctp_is_there_unsent_data(stcb); + + if (!TAILQ_EMPTY(&asoc->send_queue) || + !TAILQ_EMPTY(&asoc->sent_queue) || + some_on_streamwheel) { + /* By returning we will push more data out */ + return; + } else { + /* no outstanding data to send, so move on... */ + /* send SHUTDOWN-ACK */ + sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); + /* move to SHUTDOWN-ACK-SENT state */ + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_ACK_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_stop_timers_for_shutdown(stcb); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, + stcb, net); + } +} + +static void +sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp, + struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + struct sctp_association *asoc; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); +#endif + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n"); + if (stcb == NULL) + return; + + asoc = &stcb->asoc; + /* process according to association state */ + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + /* unexpected SHUTDOWN-ACK... do OOTB handling... */ + sctp_send_shutdown_complete(stcb, net, 1); + SCTP_TCB_UNLOCK(stcb); + return; + } + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* unexpected SHUTDOWN-ACK... so ignore... */ + SCTP_TCB_UNLOCK(stcb); + return; + } + if (asoc->control_pdapi) { + /* + * With a normal shutdown we assume the end of last record. 
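/*
 * For orientation, the passive-close progression that
 * sctp_handle_shutdown() above implements (RFC 4960, Section 9.2),
 * sketched as a mini state table:
 *
 *   ESTABLISHED --SHUTDOWN rcvd--> SHUTDOWN-RECEIVED
 *       (keep draining send_queue / sent_queue / stream wheel)
 *   SHUTDOWN-RECEIVED --queues empty--> send SHUTDOWN-ACK,
 *       enter SHUTDOWN-ACK-SENT, start the SHUTDOWNACK timer
 *   SHUTDOWN-ACK-SENT --SHUTDOWN-COMPLETE rcvd--> CLOSED
 */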
+ */ + SCTP_INP_READ_LOCK(stcb->sctp_ep); + asoc->control_pdapi->end_added = 1; + asoc->control_pdapi->pdapi_aborted = 1; + asoc->control_pdapi = NULL; + SCTP_INP_READ_UNLOCK(stcb->sctp_ep); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + /* are the queues empty? */ + if (!TAILQ_EMPTY(&asoc->send_queue) || + !TAILQ_EMPTY(&asoc->sent_queue) || + !TAILQ_EMPTY(&asoc->out_wheel)) { + sctp_report_all_outbound(stcb, 0, SCTP_SO_NOT_LOCKED); + } + /* stop the timer */ + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); + /* send SHUTDOWN-COMPLETE */ + sctp_send_shutdown_complete(stcb, net, 0); + /* notify upper layer protocol */ + if (stcb->sctp_socket) { + sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* Set the connected flag to disconnected */ + stcb->sctp_ep->sctp_socket->so_snd.sb_cc = 0; + } + } + SCTP_STAT_INCR_COUNTER32(sctps_shutdown); + /* free the TCB but first save off the ep */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif +} + +/* + * Skip past the param header and then we will find the chunk that caused the + * problem. There are two possibilities, ASCONF or FWD-TSN; anything other + * than that and our peer must be broken. + */ +static void +sctp_process_unrecog_chunk(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr, + struct sctp_nets *net) +{ + struct sctp_chunkhdr *chk; + + chk = (struct sctp_chunkhdr *)((caddr_t)phdr + sizeof(*phdr)); + switch (chk->chunk_type) { + case SCTP_ASCONF_ACK: + case SCTP_ASCONF: + sctp_asconf_cleanup(stcb, net); + break; + case SCTP_FORWARD_CUM_TSN: + stcb->asoc.peer_supports_prsctp = 0; + break; + default: + SCTPDBG(SCTP_DEBUG_INPUT2, + "Peer does not support chunk type %d(%x)??\n", + chk->chunk_type, (uint32_t) chk->chunk_type); + break; + } +} + +/* + * Skip past the param header and then we will find the param that caused the + * problem. There are a number of params in an ASCONF or the PR-SCTP param; + * these will turn off specific features.
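/*
 * The error-cause handlers here all step through SCTP TLVs the same
 * way: read the 16-bit type and length, act on the value, then let
 * the enclosing loop in sctp_handle_error() below advance by the
 * length rounded up to a 4-byte boundary (which is what
 * SCTP_SIZE32() does).  A minimal sketch of that walk over a flat
 * buffer; hypothetical helper, the real code walks mbufs:
 */
#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
walk_tlvs(const uint8_t *buf, size_t len)
{
	while (len >= 4) {
		uint16_t type, plen;
		size_t adv;

		memcpy(&type, buf, 2);
		memcpy(&plen, buf + 2, 2);
		type = ntohs(type);
		plen = ntohs(plen);
		if (plen < 4 || plen > len)
			break;			/* malformed parameter */
		(void)type;			/* ... dispatch on type here ... */
		adv = ((size_t)plen + 3) & ~(size_t)3;	/* pad to 32 bits */
		if (adv > len)
			break;
		buf += adv;
		len -= adv;
	}
}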
+ */ +static void +sctp_process_unrecog_param(struct sctp_tcb *stcb, struct sctp_paramhdr *phdr) +{ + struct sctp_paramhdr *pbad; + + pbad = phdr + 1; + switch (ntohs(pbad->param_type)) { + /* pr-sctp draft */ + case SCTP_PRSCTP_SUPPORTED: + stcb->asoc.peer_supports_prsctp = 0; + break; + case SCTP_SUPPORTED_CHUNK_EXT: + break; + /* draft-ietf-tsvwg-addip-sctp */ + case SCTP_HAS_NAT_SUPPORT: + stcb->asoc.peer_supports_nat = 0; + break; + case SCTP_ECN_NONCE_SUPPORTED: + stcb->asoc.peer_supports_ecn_nonce = 0; + stcb->asoc.ecn_nonce_allowed = 0; + stcb->asoc.ecn_allowed = 0; + break; + case SCTP_ADD_IP_ADDRESS: + case SCTP_DEL_IP_ADDRESS: + case SCTP_SET_PRIM_ADDR: + stcb->asoc.peer_supports_asconf = 0; + break; + case SCTP_SUCCESS_REPORT: + case SCTP_ERROR_CAUSE_IND: + SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n"); + SCTPDBG(SCTP_DEBUG_INPUT2, + "Turning off ASCONF to this strange peer\n"); + stcb->asoc.peer_supports_asconf = 0; + break; + default: + SCTPDBG(SCTP_DEBUG_INPUT2, + "Peer does not support param type %d(%x)??\n", + pbad->param_type, (uint32_t) pbad->param_type); + break; + } +} + +static int +sctp_handle_error(struct sctp_chunkhdr *ch, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + int chklen; + struct sctp_paramhdr *phdr; + uint16_t error_type; + uint16_t error_len; + struct sctp_association *asoc; + int adjust; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + /* parse through all of the errors and process */ + asoc = &stcb->asoc; + phdr = (struct sctp_paramhdr *)((caddr_t)ch + + sizeof(struct sctp_chunkhdr)); + chklen = ntohs(ch->chunk_length) - sizeof(struct sctp_chunkhdr); + while ((size_t)chklen >= sizeof(struct sctp_paramhdr)) { + /* Process an Error Cause */ + error_type = ntohs(phdr->param_type); + error_len = ntohs(phdr->param_length); + if ((error_len > chklen) || (error_len == 0)) { + /* invalid param length for this param */ + SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in error param- chunk left:%d errorlen:%d\n", + chklen, error_len); + return (0); + } + switch (error_type) { + case SCTP_CAUSE_INVALID_STREAM: + case SCTP_CAUSE_MISSING_PARAM: + case SCTP_CAUSE_INVALID_PARAM: + case SCTP_CAUSE_NO_USER_DATA: + SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %d back? We have a bug :/ (or do they?)\n", + error_type); + break; + case SCTP_CAUSE_NAT_COLLIDING_STATE: + SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state abort flags:%x\n", + ch->chunk_flags); + if (sctp_handle_nat_colliding_state(stcb)) { + return (0); + } + break; + case SCTP_CAUSE_NAT_MISSING_STATE: + SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state abort flags:%x\n", + ch->chunk_flags); + if (sctp_handle_nat_missing_state(stcb, net)) { + return (0); + } + break; + case SCTP_CAUSE_STALE_COOKIE: + /* + * We only act if we have echoed a cookie and are + * waiting. 
+ */ + if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { + int *p; + + p = (int *)((caddr_t)phdr + sizeof(*phdr)); + /* Save the time doubled */ + asoc->cookie_preserve_req = ntohl(*p) << 1; + asoc->stale_cookie_count++; + if (asoc->stale_cookie_count > + asoc->max_init_times) { + sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED); + /* now free the asoc */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + return (-1); + } + /* blast back to INIT state */ + sctp_toss_old_cookies(stcb, &stcb->asoc); + asoc->state &= ~SCTP_STATE_COOKIE_ECHOED; + asoc->state |= SCTP_STATE_COOKIE_WAIT; + sctp_stop_all_cookie_timers(stcb); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + } + break; + case SCTP_CAUSE_UNRESOLVABLE_ADDR: + /* + * Nothing we can do here, we don't do hostname + * addresses so if the peer does not like my IPv6 + * (or IPv4 for that matter) it does not matter. If + * they don't support that type of address, they can + * NOT possibly get that packet type... i.e. with no + * IPv6 you can't receive an IPv6 packet, so we can + * safely ignore this one. If we ever added support + * for HOSTNAME Addresses, then we would need to do + * something here. + */ + break; + case SCTP_CAUSE_UNRECOG_CHUNK: + sctp_process_unrecog_chunk(stcb, phdr, net); + break; + case SCTP_CAUSE_UNRECOG_PARAM: + sctp_process_unrecog_param(stcb, phdr); + break; + case SCTP_CAUSE_COOKIE_IN_SHUTDOWN: + /* + * We ignore this since the timer will drive out a + * new cookie anyway and their timer will drive us + * to send a SHUTDOWN_COMPLETE. We can't send one + * here since we don't have their tag. + */ + break; + case SCTP_CAUSE_DELETING_LAST_ADDR: + case SCTP_CAUSE_RESOURCE_SHORTAGE: + case SCTP_CAUSE_DELETING_SRC_ADDR: + /* + * We should NOT get these here, but in an + * ASCONF-ACK. + */ + SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in a Operational Error?<%d>?\n", + error_type); + break; + case SCTP_CAUSE_OUT_OF_RESC: + /* + * And what, pray tell, do we do with the fact that + * the peer is out of resources? Not really sure we + * could do anything but abort. I suspect this + * should have come WITH an abort instead of in an + * OP-ERROR.
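/*
 * Note on the stale-cookie math above: the peer reports how many
 * microseconds the cookie overshot its lifetime, and the handler
 * stores twice that (ntohl(*p) << 1) in cookie_preserve_req, so the
 * next INIT requests a cookie-lifetime extension of double the
 * observed shortfall: a simple geometric backoff, bounded by
 * max_init_times before the association is given up and freed.
 */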
+ */ + break; + default: + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown error type = 0x%xh\n", + error_type); + break; + } + adjust = SCTP_SIZE32(error_len); + chklen -= adjust; + phdr = (struct sctp_paramhdr *)((caddr_t)phdr + adjust); + } + return (0); +} + +static int +sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, + struct sctp_nets *net, int *abort_no_unlock, uint32_t vrf_id) +{ + struct sctp_init_ack *init_ack; + struct mbuf *op_err; + + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_init_ack: handling INIT-ACK\n"); + + if (stcb == NULL) { + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_init_ack: TCB is null\n"); + return (-1); + } + if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { + /* Invalid length */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + op_err, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + init_ack = &cp->init; + /* validate parameters */ + if (init_ack->initiate_tag == 0) { + /* protocol error... send an abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + op_err, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { + /* protocol error... send an abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + op_err, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + if (init_ack->num_inbound_streams == 0) { + /* protocol error... send an abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + op_err, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + if (init_ack->num_outbound_streams == 0) { + /* protocol error... send an abort */ + op_err = sctp_generate_invmanparam(SCTP_CAUSE_INVALID_PARAM); + sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, sh, + op_err, 0, net->port); + *abort_no_unlock = 1; + return (-1); + } + /* process according to association state... */ + switch (stcb->asoc.state & SCTP_STATE_MASK) { + case SCTP_STATE_COOKIE_WAIT: + /* this is the expected state for this chunk */ + /* process the INIT-ACK parameters */ + if (stcb->asoc.primary_destination->dest_state & + SCTP_ADDR_UNCONFIRMED) { + /* + * The primary is where we sent the INIT, we can + * always consider it confirmed when the INIT-ACK is + * returned. Do this before we load addresses + * though. 
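/*
 * The four INIT-ACK parameter checks above compress to one
 * predicate: a non-zero initiate tag, an advertised window of at
 * least SCTP_MIN_RWND, and non-zero stream counts in both
 * directions (zero is zero in either byte order, so only a_rwnd
 * strictly needs the ntohl).  Sketch, as a hypothetical helper over
 * the wire-format struct from sctp_header.h:
 */
static int
init_ack_is_sane(const struct sctp_init *init)
{
	return (init->initiate_tag != 0 &&
	    ntohl(init->a_rwnd) >= SCTP_MIN_RWND &&
	    init->num_inbound_streams != 0 &&
	    init->num_outbound_streams != 0);
}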
+ */ + stcb->asoc.primary_destination->dest_state &= + ~SCTP_ADDR_UNCONFIRMED; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, + stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED); + } + if (sctp_process_init_ack(m, iphlen, offset, sh, cp, stcb, + net, abort_no_unlock, vrf_id) < 0) { + /* error in parsing parameters */ + return (-1); + } + /* update our state */ + SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n"); + SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_ECHOED); + + /* reset the RTO calc */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); + /* + * collapse the init timer back in case of a exponential + * backoff + */ + sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, + stcb, net); + /* + * the send at the end of the inbound data processing will + * cause the cookie to be sent + */ + break; + case SCTP_STATE_SHUTDOWN_SENT: + /* incorrect state... discard */ + break; + case SCTP_STATE_COOKIE_ECHOED: + /* incorrect state... discard */ + break; + case SCTP_STATE_OPEN: + /* incorrect state... discard */ + break; + case SCTP_STATE_EMPTY: + case SCTP_STATE_INUSE: + default: + /* incorrect state... discard */ + return (-1); + break; + } + SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n"); + return (0); +} + +static struct sctp_tcb * +sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, + struct sctp_inpcb *inp, struct sctp_nets **netp, + struct sockaddr *init_src, int *notification, + int auth_skipped, uint32_t auth_offset, uint32_t auth_len, + uint32_t vrf_id, uint16_t port); + + +/* + * handle a state cookie for an existing association m: input packet mbuf + * chain-- assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a + * "split" mbuf and the cookie signature does not exist offset: offset into + * mbuf to the cookie-echo chunk + */ +static struct sctp_tcb * +sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, + struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, + struct sockaddr *init_src, int *notification, sctp_assoc_t * sac_assoc_id, + uint32_t vrf_id, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint16_t port) +{ + struct sctp_association *asoc; + struct sctp_init_chunk *init_cp, init_buf; + struct sctp_init_ack_chunk *initack_cp, initack_buf; + struct sctp_nets *net; + struct mbuf *op_err; + struct sctp_paramhdr *ph; + int chk_length; + int init_offset, initack_offset, i; + int retval; + int spec_flag = 0; + uint32_t how_indx; + + net = *netp; + /* I know that the TCB is non-NULL from the caller */ + asoc = &stcb->asoc; + for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) { + if (asoc->cookie_how[how_indx] == 0) + break; + } + if (how_indx < sizeof(asoc->cookie_how)) { + asoc->cookie_how[how_indx] = 1; + } + if (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { + /* SHUTDOWN came in after sending INIT-ACK */ + sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err == NULL) { + /* FOOBAR */ + return (NULL); + } + /* Set the len */ + SCTP_BUF_LEN(op_err) = 
sizeof(struct sctp_paramhdr); + ph = mtod(op_err, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_COOKIE_IN_SHUTDOWN); + ph->param_length = htons(sizeof(struct sctp_paramhdr)); + sctp_send_operr_to(m, iphlen, op_err, cookie->peers_vtag, + vrf_id, net->port); + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 2; + return (NULL); + } + /* + * find and validate the INIT chunk in the cookie (peer's info) the + * INIT should start after the cookie-echo header struct (chunk + * header, state cookie header struct) + */ + init_offset = offset += sizeof(struct sctp_cookie_echo_chunk); + + init_cp = (struct sctp_init_chunk *) + sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), + (uint8_t *) & init_buf); + if (init_cp == NULL) { + /* could not pull an INIT chunk in cookie */ + return (NULL); + } + chk_length = ntohs(init_cp->ch.chunk_length); + if (init_cp->ch.chunk_type != SCTP_INITIATION) { + return (NULL); + } + /* + * find and validate the INIT-ACK chunk in the cookie (my info) the + * INIT-ACK follows the INIT chunk + */ + initack_offset = init_offset + SCTP_SIZE32(chk_length); + initack_cp = (struct sctp_init_ack_chunk *) + sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), + (uint8_t *) & initack_buf); + if (initack_cp == NULL) { + /* could not pull INIT-ACK chunk in cookie */ + return (NULL); + } + chk_length = ntohs(initack_cp->ch.chunk_length); + if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { + return (NULL); + } + if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && + (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) { + /* + * case D in Section 5.2.4 Table 2: MMAA process accordingly + * to get into the OPEN state + */ + if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { + /*- + * Oops, this means that we somehow generated two identical + * vtags. I.e. we did: + * Us Peer + * <---INIT(tag=a)------ + * ----INIT-ACK(tag=t)--> + * ----INIT(tag=t)------> *1 + * <---INIT-ACK(tag=a)--- + * <----CE(tag=t)------------- *2 + * + * At point *1 we should be generating a different + * tag t', which means we would throw away the CE and send + * ours instead. Basically this is case C (throw away side). + */ + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 17; + return (NULL); + + } + switch (SCTP_GET_STATE(asoc)) { + case SCTP_STATE_COOKIE_WAIT: + case SCTP_STATE_COOKIE_ECHOED: + /* + * INIT was sent but got a COOKIE_ECHO with the + * correct tags... just accept it...but we must + * process the init so that we can make sure we have + * the right seq no's. + */ + /* First we must process the INIT !!
*/ + retval = sctp_process_init(init_cp, stcb, net); + if (retval < 0) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 3; + return (NULL); + } + /* we have already processed the INIT so no problem */ + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, + net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); + /* update current state */ + if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) + SCTP_STAT_INCR_COUNTER32(sctps_activeestab); + else + SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); + + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + SCTP_STAT_INCR_GAUGE32(sctps_currestab); + sctp_stop_all_cookie_timers(stcb); + if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && + (inp->sctp_socket->so_qlimit == 0) + ) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + /* + * Here is where collision would go if we + * did a connect() and instead got a + * init/init-ack/cookie done before the + * init-ack came back.. + */ + stcb->sctp_ep->sctp_flags |= + SCTP_PCB_FLAGS_CONNECTED; +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return (NULL); + } +#endif + soisconnected(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + /* notify upper layer */ + *notification = SCTP_NOTIFY_ASSOC_UP; + /* + * since we did not send a HB make sure we don't + * double things + */ + net->hb_responded = 1; + net->RTO = sctp_calculate_rto(stcb, asoc, net, + &cookie->time_entered, sctp_align_unsafe_makecopy); + + if (stcb->asoc.sctp_autoclose_ticks && + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, + inp, stcb, NULL); + } + break; + default: + /* + * we're in the OPEN state (or beyond), so peer must + * have simply lost the COOKIE-ACK + */ + break; + } /* end switch */ + sctp_stop_all_cookie_timers(stcb); + /* + * We ignore the return code here.. not sure if we should + * somehow abort.. but we do have an existing asoc. This + * really should not fail. 
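/*
 * Aside: asoc->cookie_how[] works as a small breadcrumb buffer.
 * Every exit path of this collision handler stamps a distinct value
 * (1..17 across this function), so a post-mortem can reconstruct
 * which RFC 4960 Section 5.2.4 case the last few COOKIE-ECHOs took.
 */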
+ */ + if (sctp_load_addresses_from_init(stcb, m, iphlen, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, sh, init_src)) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 4; + return (NULL); + } + /* respond with a COOKIE-ACK */ + sctp_toss_old_cookies(stcb, asoc); + sctp_send_cookie_ack(stcb); + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 5; + return (stcb); + } + if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && + ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag && + cookie->tie_tag_my_vtag == 0 && + cookie->tie_tag_peer_vtag == 0) { + /* + * case C in Section 5.2.4 Table 2: XMOO silently discard + */ + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 6; + return (NULL); + } + /* + * If NAT support is on and the conditions below hold while the stcb + * is established, send back an ABORT (colliding state). + */ + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) && + (asoc->peer_supports_nat) && + ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && + ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || + (asoc->peer_vtag == 0)))) { + /* + * Special case - peers support NAT. We may have given out + * the same tag on two INITs since one was not + * established.. i.e. we get INIT from host-1 behind the nat + * and we respond tag-a, we get an INIT from host-2 behind + * the nat and we get tag-a again. Then we bring up host-1 + * (or 2's) assoc, then comes the cookie from host-2 (or 1). + * Now we have colliding state. We must send an abort here + * with colliding state indication. + */ + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err == NULL) { + /* FOOBAR */ + return (NULL); + } + /* pre-reserve some space */ +#ifdef INET6 + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); +#else + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); +#endif + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + /* Set the len */ + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_paramhdr); + ph = mtod(op_err, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_NAT_COLLIDING_STATE); + ph->param_length = htons(sizeof(struct sctp_paramhdr)); + sctp_send_abort(m, iphlen, sh, 0, op_err, vrf_id, port); + return (NULL); + } + if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && + ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || + (asoc->peer_vtag == 0))) { + /* + * case B in Section 5.2.4 Table 2: MXAA or MOAA; my info + * should be ok, re-accept peer info + */ + if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { + /* + * Extension of case C. If we hit this, then the + * random number generator returned the same vtag + * when we first sent our INIT-ACK and when we later + * sent our INIT. The side with the seq numbers that + * are different will be the one that normally + * would have hit case C. This in effect "extends" + * our vtags in this collision case to be 64 bits. + * The same collision could occur aka you get both + * vtag and seq number the same twice in a row.. but + * that is much less likely. If it did happen then we + * would proceed through and bring up the assoc.. we + * may end up with the wrong stream setup however.. + * which would be bad.. but there is no way to + * tell..
until we send on a stream that does not + * exist :-) + */ + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 7; + + return (NULL); + } + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 8; + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); + sctp_stop_all_cookie_timers(stcb); + /* + * since we did not send a HB make sure we don't double + * things + */ + net->hb_responded = 1; + if (stcb->asoc.sctp_autoclose_ticks && + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, + NULL); + } + asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); + asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); + + /* Note last_cwr_tsn? where is this used? */ + asoc->last_cwr_tsn = asoc->init_seq_number - 1; + if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) { + /* + * Ok the peer probably discarded our data (if we + * echoed a cookie+data). So anything on the + * sent_queue should be marked for retransmit, we + * may not get something to kick us so it COULD + * still take a timeout to move these.. but it can't + * hurt to mark them. + */ + struct sctp_tmit_chunk *chk; + + TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { + if (chk->sent < SCTP_DATAGRAM_RESEND) { + chk->sent = SCTP_DATAGRAM_RESEND; + sctp_flight_size_decrease(chk); + sctp_total_flight_decrease(stcb, chk); + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + spec_flag++; + } + } + + } + /* process the INIT info (peer's info) */ + retval = sctp_process_init(init_cp, stcb, net); + if (retval < 0) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 9; + return (NULL); + } + if (sctp_load_addresses_from_init(stcb, m, iphlen, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, sh, init_src)) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 10; + return (NULL); + } + if ((asoc->state & SCTP_STATE_COOKIE_WAIT) || + (asoc->state & SCTP_STATE_COOKIE_ECHOED)) { + *notification = SCTP_NOTIFY_ASSOC_UP; + + if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && + (inp->sctp_socket->so_qlimit == 0)) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + stcb->sctp_ep->sctp_flags |= + SCTP_PCB_FLAGS_CONNECTED; +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return (NULL); + } +#endif + soisconnected(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) + SCTP_STAT_INCR_COUNTER32(sctps_activeestab); + else + SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); + SCTP_STAT_INCR_GAUGE32(sctps_currestab); + } else if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + SCTP_STAT_INCR_COUNTER32(sctps_restartestab); + } else { + SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + sctp_stop_all_cookie_timers(stcb); + 
sctp_toss_old_cookies(stcb, asoc); + sctp_send_cookie_ack(stcb); + if (spec_flag) { + /* + * only if we have retrans set do we do this. What + * this call does is get only the COOKIE-ACK out and + * then when we return the normal call to + * sctp_chunk_output will get the retrans out behind + * this. + */ + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); + } + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 11; + + return (stcb); + } + if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && + ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) && + cookie->tie_tag_my_vtag == asoc->my_vtag_nonce && + cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce && + cookie->tie_tag_peer_vtag != 0) { + struct sctpasochead *head; + + if (asoc->peer_supports_nat) { + /* + * This is a gross gross hack. just call the + * cookie_new code since we are allowing a duplicate + * association. I hope this works... + */ + return (sctp_process_cookie_new(m, iphlen, offset, sh, cookie, cookie_len, + inp, netp, init_src, notification, + auth_skipped, auth_offset, auth_len, + vrf_id, port)); + } + /* + * case A in Section 5.2.4 Table 2: XXMM (peer restarted) + */ + /* temp code */ + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 12; + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); + + *sac_assoc_id = sctp_get_associd(stcb); + /* notify upper layer */ + *notification = SCTP_NOTIFY_ASSOC_RESTART; + atomic_add_int(&stcb->asoc.refcnt, 1); + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_OPEN) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT)) { + SCTP_STAT_INCR_GAUGE32(sctps_currestab); + } + if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + SCTP_STAT_INCR_GAUGE32(sctps_restartestab); + } else if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { + SCTP_STAT_INCR_GAUGE32(sctps_collisionestab); + } + if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + + } else if (!(asoc->state & SCTP_STATE_SHUTDOWN_SENT)) { + /* move to OPEN state, if not in SHUTDOWN_SENT */ + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + } + asoc->pre_open_streams = + ntohs(initack_cp->init.num_outbound_streams); + asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); + asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; + asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; + + asoc->last_cwr_tsn = asoc->init_seq_number - 1; + asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; + + asoc->str_reset_seq_in = asoc->init_seq_number; + + asoc->advanced_peer_ack_point = asoc->last_acked_seq; + if (asoc->mapping_array) { + memset(asoc->mapping_array, 0, + asoc->mapping_array_size); + } + if (asoc->nr_mapping_array) { + memset(asoc->nr_mapping_array, 0, + asoc->mapping_array_size); + } + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(stcb->sctp_ep); + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + /* send up all the data */ + SCTP_TCB_SEND_LOCK(stcb); + + sctp_report_all_outbound(stcb, 1, SCTP_SO_NOT_LOCKED); + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].stream_no = i; + stcb->asoc.strmout[i].next_sequence_sent = 
0; + stcb->asoc.strmout[i].last_msg_incomplete = 0; + } + /* process the INIT-ACK info (my info) */ + asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); + asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); + + /* pull from vtag hash */ + LIST_REMOVE(stcb, sctp_asocs); + /* re-insert to new vtag position */ + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, + SCTP_BASE_INFO(hashasocmark))]; + /* + * put it in the bucket in the vtag hash of assoc's for the + * system + */ + LIST_INSERT_HEAD(head, stcb, sctp_asocs); + + /* process the INIT info (peer's info) */ + SCTP_TCB_SEND_UNLOCK(stcb); + SCTP_INP_WUNLOCK(stcb->sctp_ep); + SCTP_INP_INFO_WUNLOCK(); + + retval = sctp_process_init(init_cp, stcb, net); + if (retval < 0) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 13; + + return (NULL); + } + /* + * since we did not send a HB make sure we don't double + * things + */ + net->hb_responded = 1; + + if (sctp_load_addresses_from_init(stcb, m, iphlen, + init_offset + sizeof(struct sctp_init_chunk), + initack_offset, sh, init_src)) { + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 14; + + return (NULL); + } + /* respond with a COOKIE-ACK */ + sctp_stop_all_cookie_timers(stcb); + sctp_toss_old_cookies(stcb, asoc); + sctp_send_cookie_ack(stcb); + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 15; + + return (stcb); + } + if (how_indx < sizeof(asoc->cookie_how)) + asoc->cookie_how[how_indx] = 16; + /* all other cases... */ + return (NULL); +} + + +/* + * handle a state cookie for a new association m: input packet mbuf chain-- + * assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a "split" mbuf + * and the cookie signature does not exist offset: offset into mbuf to the + * cookie-echo chunk length: length of the cookie chunk to: where the init + * was from returns a new TCB + */ +struct sctp_tcb * +sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, + struct sctp_inpcb *inp, struct sctp_nets **netp, + struct sockaddr *init_src, int *notification, + int auth_skipped, uint32_t auth_offset, uint32_t auth_len, + uint32_t vrf_id, uint16_t port) +{ + struct sctp_tcb *stcb; + struct sctp_init_chunk *init_cp, init_buf; + struct sctp_init_ack_chunk *initack_cp, initack_buf; + struct sockaddr_storage sa_store; + struct sockaddr *initack_src = (struct sockaddr *)&sa_store; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + struct sctp_association *asoc; + int chk_length; + int init_offset, initack_offset, initack_limit; + int retval; + int error = 0; + uint32_t old_tag; + uint8_t auth_chunk_buf[SCTP_PARAM_BUFFER_SIZE]; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(inp); +#endif + + /* + * find and validate the INIT chunk in the cookie (peer's info) the + * INIT should start after the cookie-echo header struct (chunk + * header, state cookie header struct) + */ + init_offset = offset + sizeof(struct sctp_cookie_echo_chunk); + init_cp = (struct sctp_init_chunk *) + sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), + (uint8_t *) & init_buf); + if (init_cp == NULL) { + /* could not pull a INIT chunk in cookie */ + SCTPDBG(SCTP_DEBUG_INPUT1, + "process_cookie_new: could not pull INIT chunk hdr\n"); + return (NULL); + } + chk_length = ntohs(init_cp->ch.chunk_length); + if (init_cp->ch.chunk_type != SCTP_INITIATION) { + SCTPDBG(SCTP_DEBUG_INPUT1, 
"HUH? process_cookie_new: could not find INIT chunk!\n"); + return (NULL); + } + initack_offset = init_offset + SCTP_SIZE32(chk_length); + /* + * find and validate the INIT-ACK chunk in the cookie (my info) the + * INIT-ACK follows the INIT chunk + */ + initack_cp = (struct sctp_init_ack_chunk *) + sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), + (uint8_t *) & initack_buf); + if (initack_cp == NULL) { + /* could not pull INIT-ACK chunk in cookie */ + SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n"); + return (NULL); + } + chk_length = ntohs(initack_cp->ch.chunk_length); + if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { + return (NULL); + } + /* + * NOTE: We can't use the INIT_ACK's chk_length to determine the + * "initack_limit" value. This is because the chk_length field + * includes the length of the cookie, but the cookie is omitted when + * the INIT and INIT_ACK are tacked onto the cookie... + */ + initack_limit = offset + cookie_len; + + /* + * now that we know the INIT/INIT-ACK are in place, create a new TCB + * and popluate + */ + + /* + * Here we do a trick, we set in NULL for the proc/thread argument. + * We do this since in effect we only use the p argument when the + * socket is unbound and we must do an implicit bind. Since we are + * getting a cookie, we cannot be unbound. + */ + stcb = sctp_aloc_assoc(inp, init_src, &error, + ntohl(initack_cp->init.initiate_tag), vrf_id, + (struct thread *)NULL + ); + if (stcb == NULL) { + struct mbuf *op_err; + + /* memory problem? */ + SCTPDBG(SCTP_DEBUG_INPUT1, + "process_cookie_new: no room for another TCB!\n"); + op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC); + + sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, + sh, op_err, vrf_id, port); + return (NULL); + } + /* get the correct sctp_nets */ + if (netp) + *netp = sctp_findnet(stcb, init_src); + + asoc = &stcb->asoc; + /* get scope variables out of cookie */ + asoc->ipv4_local_scope = cookie->ipv4_scope; + asoc->site_scope = cookie->site_scope; + asoc->local_scope = cookie->local_scope; + asoc->loopback_scope = cookie->loopback_scope; + + if ((asoc->ipv4_addr_legal != cookie->ipv4_addr_legal) || + (asoc->ipv6_addr_legal != cookie->ipv6_addr_legal)) { + struct mbuf *op_err; + + /* + * Houston we have a problem. The EP changed while the + * cookie was in flight. Only recourse is to abort the + * association. 
+ */ + atomic_add_int(&stcb->asoc.refcnt, 1); + op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC); + sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, + sh, op_err, vrf_id, port); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, + SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + } + /* process the INIT-ACK info (my info) */ + old_tag = asoc->my_vtag; + asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); + asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); + asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); + asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); + asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; + asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; + asoc->last_cwr_tsn = asoc->init_seq_number - 1; + asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; + asoc->str_reset_seq_in = asoc->init_seq_number; + + asoc->advanced_peer_ack_point = asoc->last_acked_seq; + + /* process the INIT info (peer's info) */ + if (netp) + retval = sctp_process_init(init_cp, stcb, *netp); + else + retval = 0; + if (retval < 0) { + atomic_add_int(&stcb->asoc.refcnt, 1); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + } + /* load all addresses */ + if (sctp_load_addresses_from_init(stcb, m, iphlen, + init_offset + sizeof(struct sctp_init_chunk), initack_offset, sh, + init_src)) { + atomic_add_int(&stcb->asoc.refcnt, 1); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + } + /* + * verify any preceding AUTH chunk that was skipped + */ + /* pull the local authentication parameters from the cookie/init-ack */ + sctp_auth_get_cookie_params(stcb, m, + initack_offset + sizeof(struct sctp_init_ack_chunk), + initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk))); + if (auth_skipped) { + struct sctp_auth_chunk *auth; + + auth = (struct sctp_auth_chunk *) + sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf); + if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { + /* auth HMAC failed, dump the assoc and packet */ + SCTPDBG(SCTP_DEBUG_AUTH1, + "COOKIE-ECHO: AUTH failed\n"); + atomic_add_int(&stcb->asoc.refcnt, 1); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + } 
else { + /* remaining chunks checked... good to go */ + stcb->asoc.authenticated = 1; + } + } + /* update current state */ + SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + } + sctp_stop_all_cookie_timers(stcb); + SCTP_STAT_INCR_COUNTER32(sctps_passiveestab); + SCTP_STAT_INCR_GAUGE32(sctps_currestab); + + /* + * if we're doing ASCONFs, check to see if we have any new local + * addresses that need to get added to the peer (eg. addresses + * changed while cookie echo in flight). This needs to be done + * after we go to the OPEN state to do the correct asconf + * processing. else, make sure we have the correct addresses in our + * lists + */ + + /* warning, we re-use sin, sin6, sa_store here! */ + /* pull in local_address (our "from" address) */ + if (cookie->laddr_type == SCTP_IPV4_ADDRESS) { + /* source addr is IPv4 */ + sin = (struct sockaddr_in *)initack_src; + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_addr.s_addr = cookie->laddress[0]; + } else if (cookie->laddr_type == SCTP_IPV6_ADDRESS) { + /* source addr is IPv6 */ + sin6 = (struct sockaddr_in6 *)initack_src; + memset(sin6, 0, sizeof(*sin6)); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_scope_id = cookie->scope_id; + memcpy(&sin6->sin6_addr, cookie->laddress, + sizeof(sin6->sin6_addr)); + } else { + atomic_add_int(&stcb->asoc.refcnt, 1); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + } + + /* set up to notify upper layer */ + *notification = SCTP_NOTIFY_ASSOC_UP; + if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && + (inp->sctp_socket->so_qlimit == 0)) { + /* + * This is an endpoint that called connect() how it got a + * cookie that is NEW is a bit of a mystery. It must be that + * the INIT was sent, but before it got there.. a complete + * INIT/INIT-ACK/COOKIE arrived. But of course then it + * should have went to the other code.. not here.. oh well.. + * a bit of protection is worth having.. + */ + stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return (NULL); + } +#endif + soisconnected(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && + (inp->sctp_socket->so_qlimit)) { + /* + * We don't want to do anything with this one. Since it is + * the listening guy. The timer will get started for + * accepted connections in the caller. 
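/*
 * Worth noting just below: the first RTT sample of a passively
 * opened association comes for free.  The state cookie records when
 * the INIT-ACK was minted (cookie->time_entered), so the time of
 * the COOKIE-ECHO arrival minus that stamp spans one round trip and
 * seeds net->RTO before any HEARTBEAT has been exchanged.
 */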
+ */ + ; + } + /* since we did not send a HB make sure we don't double things */ + if ((netp) && (*netp)) + (*netp)->hb_responded = 1; + + if (stcb->asoc.sctp_autoclose_ticks && + sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); + } + /* calculate the RTT */ + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); + if ((netp) && (*netp)) { + (*netp)->RTO = sctp_calculate_rto(stcb, asoc, *netp, + &cookie->time_entered, sctp_align_unsafe_makecopy); + } + /* respond with a COOKIE-ACK */ + sctp_send_cookie_ack(stcb); + + /* + * check the address lists for any ASCONFs that need to be sent + * AFTER the cookie-ack is sent + */ + sctp_check_address_list(stcb, m, + initack_offset + sizeof(struct sctp_init_ack_chunk), + initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)), + initack_src, cookie->local_scope, cookie->site_scope, + cookie->ipv4_scope, cookie->loopback_scope); + + + return (stcb); +} + +/* + * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e + * we NEED to make sure we are not already using the vtag. If so we + * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG No middle box bit! + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, + SCTP_BASE_INFO(hashasocmark))]; + LIST_FOREACH(stcb, head, sctp_asocs) { + if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep)) { + -- SEND ABORT - TRY AGAIN -- + } + } +*/ + +/* + * handles a COOKIE-ECHO message stcb: modified to either a new or left as + * existing (non-NULL) TCB + */ +static struct mbuf * +sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp, + struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp, + int auth_skipped, uint32_t auth_offset, uint32_t auth_len, + struct sctp_tcb **locked_tcb, uint32_t vrf_id, uint16_t port) +{ + struct sctp_state_cookie *cookie; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + struct sctp_tcb *l_stcb = *stcb; + struct sctp_inpcb *l_inp; + struct sockaddr *to; + sctp_assoc_t sac_restart_id; + struct sctp_pcb *ep; + struct mbuf *m_sig; + uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE]; + uint8_t *sig; + uint8_t cookie_ok = 0; + unsigned int size_of_pkt, sig_offset, cookie_offset; + unsigned int cookie_len; + struct timeval now; + struct timeval time_expires; + struct sockaddr_storage dest_store; + struct sockaddr *localep_sa = (struct sockaddr *)&dest_store; + struct ip *iph; + int notification = 0; + struct sctp_nets *netl; + int had_a_existing_tcb = 0; + + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_cookie: handling COOKIE-ECHO\n"); + + if (inp_p == NULL) { + return (NULL); + } + /* First get the destination address setup too. 
*/ + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + { + /* it's IPv4 */ + struct sockaddr_in *lsin; + + lsin = (struct sockaddr_in *)(localep_sa); + memset(lsin, 0, sizeof(*lsin)); + lsin->sin_family = AF_INET; + lsin->sin_len = sizeof(*lsin); + lsin->sin_port = sh->dest_port; + lsin->sin_addr.s_addr = iph->ip_dst.s_addr; + size_of_pkt = SCTP_GET_IPV4_LENGTH(iph); + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + /* it's IPv6 */ + struct ip6_hdr *ip6; + struct sockaddr_in6 *lsin6; + + lsin6 = (struct sockaddr_in6 *)(localep_sa); + memset(lsin6, 0, sizeof(*lsin6)); + lsin6->sin6_family = AF_INET6; + lsin6->sin6_len = sizeof(struct sockaddr_in6); + ip6 = mtod(m, struct ip6_hdr *); + lsin6->sin6_port = sh->dest_port; + lsin6->sin6_addr = ip6->ip6_dst; + size_of_pkt = SCTP_GET_IPV6_LENGTH(ip6) + iphlen; + break; + } +#endif + default: + return (NULL); + } + + cookie = &cp->cookie; + cookie_offset = offset + sizeof(struct sctp_chunkhdr); + cookie_len = ntohs(cp->ch.chunk_length); + + if ((cookie->peerport != sh->src_port) || + (cookie->myport != sh->dest_port) || + (cookie->my_vtag != sh->v_tag)) { + /* + * invalid ports or bad tag. Note that we always leave the + * v_tag in the header in network order and when we stored + * it in the my_vtag slot we also left it in network order. + * This maintains the match even though it may be in the + * opposite byte order of the machine :-> + */ + return (NULL); + } + if (cookie_len > size_of_pkt || + cookie_len < sizeof(struct sctp_cookie_echo_chunk) + + sizeof(struct sctp_init_chunk) + + sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) { + /* cookie too long or too small */ + return (NULL); + } + /* + * split off the signature into its own mbuf (since it should not be + * calculated in the sctp_hmac_m() call). + */ + sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE; + if (sig_offset > size_of_pkt) { + /* packet not correct size! */ + /* XXX this may already be accounted for earlier... */ + return (NULL); + } + m_sig = m_split(m, sig_offset, M_DONTWAIT); + if (m_sig == NULL) { + /* out of memory or ?? */ + return (NULL); + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = m_sig; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_SPLIT); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + + /* + * compute the signature/digest for the cookie + */ + ep = &(*inp_p)->sctp_ep; + l_inp = *inp_p; + if (l_stcb) { + SCTP_TCB_UNLOCK(l_stcb); + } + SCTP_INP_RLOCK(l_inp); + if (l_stcb) { + SCTP_TCB_LOCK(l_stcb); + } + /* which cookie is it? */ + if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) && + (ep->current_secret_number != ep->last_secret_number)) { + /* it's the old cookie */ + (void)sctp_hmac_m(SCTP_HMAC, + (uint8_t *) ep->secret_key[(int)ep->last_secret_number], + SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); + } else { + /* it's the current cookie */ + (void)sctp_hmac_m(SCTP_HMAC, + (uint8_t *) ep->secret_key[(int)ep->current_secret_number], + SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); + } + /* get the signature */ + SCTP_INP_RUNLOCK(l_inp); + sig = (uint8_t *) sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *) & tmp_sig); + if (sig == NULL) { + /* couldn't find signature */ + sctp_m_freem(m_sig); + return (NULL); + } + /* compare the received digest with the computed digest */ + if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { + /* try the old cookie?
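/*
 * The digest check above allows for secret rotation: a cookie
 * minted before the most recent secret change is verified with the
 * previous key, everything else with the current one, so cookies
 * stay valid across a key roll.  Selection sketch; hypothetical
 * struct, the real code runs sctp_hmac_m() over the mbuf chain with
 * the chosen key:
 */
#include <stdint.h>
#include <time.h>

struct secret_store {
	time_t time_of_secret_change;
	int current_secret_number;
	int last_secret_number;
	uint8_t secret_key[2][32];
};

static const uint8_t *
cookie_secret_for(const struct secret_store *ep, time_t cookie_time)
{
	if (cookie_time < ep->time_of_secret_change &&
	    ep->current_secret_number != ep->last_secret_number)
		return (ep->secret_key[ep->last_secret_number]);
	return (ep->secret_key[ep->current_secret_number]);
}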
*/ + if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && + (ep->current_secret_number != ep->last_secret_number)) { + /* compute digest with old */ + (void)sctp_hmac_m(SCTP_HMAC, + (uint8_t *) ep->secret_key[(int)ep->last_secret_number], + SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); + /* compare */ + if (memcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) + cookie_ok = 1; + } + } else { + cookie_ok = 1; + } + + /* + * Now before we continue we must reconstruct our mbuf so that + * normal processing of any other chunks will work. + */ + { + struct mbuf *m_at; + + m_at = m; + while (SCTP_BUF_NEXT(m_at) != NULL) { + m_at = SCTP_BUF_NEXT(m_at); + } + SCTP_BUF_NEXT(m_at) = m_sig; + } + + if (cookie_ok == 0) { + SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n"); + SCTPDBG(SCTP_DEBUG_INPUT2, + "offset = %u, cookie_offset = %u, sig_offset = %u\n", + (uint32_t) offset, cookie_offset, sig_offset); + return (NULL); + } + /* + * check the cookie timestamps to be sure it's not stale + */ + (void)SCTP_GETTIME_TIMEVAL(&now); + /* Expire time is in Ticks, so we convert to seconds */ + time_expires.tv_sec = cookie->time_entered.tv_sec + TICKS_TO_SEC(cookie->cookie_life); + time_expires.tv_usec = cookie->time_entered.tv_usec; + /* + * TODO sctp_constants.h needs alternative time macros when _KERNEL + * is undefined. + */ + if (timevalcmp(&now, &time_expires, >)) { + /* cookie is stale! */ + struct mbuf *op_err; + struct sctp_stale_cookie_msg *scm; + uint32_t tim; + + op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_stale_cookie_msg), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err == NULL) { + /* FOOBAR */ + return (NULL); + } + /* Set the len */ + SCTP_BUF_LEN(op_err) = sizeof(struct sctp_stale_cookie_msg); + scm = mtod(op_err, struct sctp_stale_cookie_msg *); + scm->ph.param_type = htons(SCTP_CAUSE_STALE_COOKIE); + scm->ph.param_length = htons((sizeof(struct sctp_paramhdr) + + (sizeof(uint32_t)))); + /* seconds to usec */ + tim = (now.tv_sec - time_expires.tv_sec) * 1000000; + /* add in usec */ + if (tim == 0) + tim = now.tv_usec - cookie->time_entered.tv_usec; + scm->time_usec = htonl(tim); + sctp_send_operr_to(m, iphlen, op_err, cookie->peers_vtag, + vrf_id, port); + return (NULL); + } + /* + * Now we must see with the lookup address if we have an existing + * asoc. This will only happen if we were in the COOKIE-WAIT state + * and a INIT collided with us and somewhere the peer sent the + * cookie on another address besides the single address our assoc + * had for him. In this case we will have one of the tie-tags set at + * least AND the address field in the cookie can be used to look it + * up. 
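/*
 * The staleness test above in plain form: the cookie carries its
 * mint time plus a lifetime in ticks; convert the lifetime to
 * seconds, add it to the mint time, and compare against now.  A
 * sketch over plain timevals, with the lifetime already converted
 * to seconds (the kernel uses TICKS_TO_SEC() and timevalcmp()):
 */
#include <sys/time.h>

static int
cookie_is_stale(const struct timeval *now,
    const struct timeval *entered, time_t life_sec)
{
	struct timeval expires;

	expires.tv_sec = entered->tv_sec + life_sec;
	expires.tv_usec = entered->tv_usec;
	return (timercmp(now, &expires, >));
}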
+ */ + to = NULL; + if (cookie->addr_type == SCTP_IPV6_ADDRESS) { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_port = sh->src_port; + sin6.sin6_scope_id = cookie->scope_id; + memcpy(&sin6.sin6_addr.s6_addr, cookie->address, + sizeof(sin6.sin6_addr.s6_addr)); + to = (struct sockaddr *)&sin6; + } else if (cookie->addr_type == SCTP_IPV4_ADDRESS) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_port = sh->src_port; + sin.sin_addr.s_addr = cookie->address[0]; + to = (struct sockaddr *)&sin; + } else { + /* This should not happen */ + return (NULL); + } + if ((*stcb == NULL) && to) { + /* Yep, lets check */ + *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, localep_sa, NULL); + if (*stcb == NULL) { + /* + * We should have only got back the same inp. If we + * got back a different ep we have a problem. The + * original findep got back l_inp and now + */ + if (l_inp != *inp_p) { + SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n"); + } + } else { + if (*locked_tcb == NULL) { + /* + * In this case we found the assoc only + * after we locked the create lock. This + * means we are in a colliding case and we + * must make sure that we unlock the tcb if + * its one of the cases where we throw away + * the incoming packets. + */ + *locked_tcb = *stcb; + + /* + * We must also increment the inp ref count + * since the ref_count flags was set when we + * did not find the TCB, now we found it + * which reduces the refcount.. we must + * raise it back out to balance it all :-) + */ + SCTP_INP_INCR_REF((*stcb)->sctp_ep); + if ((*stcb)->sctp_ep != l_inp) { + SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n", + (*stcb)->sctp_ep, l_inp); + } + } + } + } + if (to == NULL) { + return (NULL); + } + cookie_len -= SCTP_SIGNATURE_SIZE; + if (*stcb == NULL) { + /* this is the "normal" case... get a new TCB */ + *stcb = sctp_process_cookie_new(m, iphlen, offset, sh, cookie, + cookie_len, *inp_p, netp, to, ¬ification, + auth_skipped, auth_offset, auth_len, vrf_id, port); + } else { + /* this is abnormal... cookie-echo on existing TCB */ + had_a_existing_tcb = 1; + *stcb = sctp_process_cookie_existing(m, iphlen, offset, sh, + cookie, cookie_len, *inp_p, *stcb, netp, to, + ¬ification, &sac_restart_id, vrf_id, auth_skipped, auth_offset, auth_len, port); + } + + if (*stcb == NULL) { + /* still no TCB... must be bad cookie-echo */ + return (NULL); + } + /* + * Ok, we built an association so confirm the address we sent the + * INIT-ACK to. + */ + netl = sctp_findnet(*stcb, to); + /* + * This code should in theory NOT run but + */ + if (netl == NULL) { + /* TSNH! Huh, why do I need to add this address here? 
*/ + int ret; + + ret = sctp_add_remote_addr(*stcb, to, SCTP_DONOT_SETSCOPE, + SCTP_IN_COOKIE_PROC); + netl = sctp_findnet(*stcb, to); + } + if (netl) { + if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) { + netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED; + (void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL, + netl); + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, + (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); + } + } + if (*stcb) { + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, *inp_p, + *stcb, NULL); + } + if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { + if (!had_a_existing_tcb || + (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { + /* + * If we have a NEW cookie or the connect never + * reached the connected state during collision we + * must do the TCP accept thing. + */ + struct socket *so, *oso; + struct sctp_inpcb *inp; + + if (notification == SCTP_NOTIFY_ASSOC_RESTART) { + /* + * For a restart we will keep the same + * socket, no need to do anything. I THINK!! + */ + sctp_ulp_notify(notification, *stcb, 0, (void *)&sac_restart_id, SCTP_SO_NOT_LOCKED); + return (m); + } + oso = (*inp_p)->sctp_socket; + atomic_add_int(&(*stcb)->asoc.refcnt, 1); + SCTP_TCB_UNLOCK((*stcb)); + so = sonewconn(oso, 0 + ); + SCTP_TCB_LOCK((*stcb)); + atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); + + if (so == NULL) { + struct mbuf *op_err; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *pcb_so; + +#endif + /* Too many sockets */ + SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n"); + op_err = sctp_generate_invmanparam(SCTP_CAUSE_OUT_OF_RESC); + sctp_abort_association(*inp_p, NULL, m, iphlen, + sh, op_err, vrf_id, port); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + pcb_so = SCTP_INP_SO(*inp_p); + atomic_add_int(&(*stcb)->asoc.refcnt, 1); + SCTP_TCB_UNLOCK((*stcb)); + SCTP_SOCKET_LOCK(pcb_so, 1); + SCTP_TCB_LOCK((*stcb)); + atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(pcb_so, 1); +#endif + return (NULL); + } + inp = (struct sctp_inpcb *)so->so_pcb; + SCTP_INP_INCR_REF(inp); + /* + * We add the unbound flag here so that if we get an + * soabort() before we get the move_pcb done, we + * will properly cleanup. 
+ */ + inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | + SCTP_PCB_FLAGS_CONNECTED | + SCTP_PCB_FLAGS_IN_TCPPOOL | + SCTP_PCB_FLAGS_UNBOUND | + (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) | + SCTP_PCB_FLAGS_DONT_WAKE); + inp->sctp_features = (*inp_p)->sctp_features; + inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features; + inp->sctp_socket = so; + inp->sctp_frag_point = (*inp_p)->sctp_frag_point; + inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off; + inp->partial_delivery_point = (*inp_p)->partial_delivery_point; + inp->sctp_context = (*inp_p)->sctp_context; + inp->inp_starting_point_for_iterator = NULL; + /* + * copy in the authentication parameters from the + * original endpoint + */ + if (inp->sctp_ep.local_hmacs) + sctp_free_hmaclist(inp->sctp_ep.local_hmacs); + inp->sctp_ep.local_hmacs = + sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs); + if (inp->sctp_ep.local_auth_chunks) + sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); + inp->sctp_ep.local_auth_chunks = + sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks); + + /* + * Now we must move it from one hash table to + * another and get the tcb in the right place. + */ + + /* + * This is where the one-2-one socket is put into + * the accept state waiting for the accept! + */ + if (*stcb) { + (*stcb)->asoc.state |= SCTP_STATE_IN_ACCEPT_QUEUE; + } + sctp_move_pcb_and_assoc(*inp_p, inp, *stcb); + + atomic_add_int(&(*stcb)->asoc.refcnt, 1); + SCTP_TCB_UNLOCK((*stcb)); + + sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb, + 0); + SCTP_TCB_LOCK((*stcb)); + atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); + + + /* + * now we must check to see if we were aborted while + * the move was going on and the lock/unlock + * happened. + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* + * yep it was, we leave the assoc attached + * to the socket since the sctp_inpcb_free() + * call will send an abort for us. 
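+	 * The reference counting around this window, in sketch form (same + * macros as this block): + * + *   SCTP_INP_INCR_REF(inp);          pin inp across the move + *   sctp_move_pcb_and_assoc(...);    locks drop and retake here + *   if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) + *       SCTP_INP_DECR_REF(inp);      unpin; the abort is sent later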
+ */ + SCTP_INP_DECR_REF(inp); + return (NULL); + } + SCTP_INP_DECR_REF(inp); + /* Switch over to the new guy */ + *inp_p = inp; + sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + + /* + * Pull it from the incomplete queue and wake the + * guy + */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + atomic_add_int(&(*stcb)->asoc.refcnt, 1); + SCTP_TCB_UNLOCK((*stcb)); + SCTP_SOCKET_LOCK(so, 1); +#endif + soisconnected(so); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_TCB_LOCK((*stcb)); + atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); + SCTP_SOCKET_UNLOCK(so, 1); +#endif + return (m); + } + } + if ((notification) && ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { + sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + } + return (m); +} + +static void +sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /* cp must not be used, others call this without a c-ack :-) */ + struct sctp_association *asoc; + + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_cookie_ack: handling COOKIE-ACK\n"); + if (stcb == NULL) + return; + + asoc = &stcb->asoc; + + sctp_stop_all_cookie_timers(stcb); + /* process according to association state */ + if (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) { + /* state change only needed when I am in right state */ + SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); + SCTP_SET_STATE(asoc, SCTP_STATE_OPEN); + if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, asoc->primary_destination); + + } + /* update RTO */ + SCTP_STAT_INCR_COUNTER32(sctps_activeestab); + SCTP_STAT_INCR_GAUGE32(sctps_currestab); + if (asoc->overall_error_count == 0) { + net->RTO = sctp_calculate_rto(stcb, asoc, net, + &asoc->time_entered, sctp_align_safe_nocopy); + } + (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); + sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + soisconnected(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, + stcb, net); + /* + * since we did not send a HB make sure we don't double + * things + */ + net->hb_responded = 1; + + if (stcb->asoc.sctp_autoclose_ticks && + sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) { + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, + stcb->sctp_ep, stcb, NULL); + } + /* + * send ASCONF if parameters are pending and ASCONFs are + * allowed (eg. 
addresses changed when init/cookie echo were + * in flight) + */ + if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && + (stcb->asoc.peer_supports_asconf) && + (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) { +#ifdef SCTP_TIMER_BASED_ASCONF + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, + stcb->sctp_ep, stcb, + stcb->asoc.primary_destination); +#else + sctp_send_asconf(stcb, stcb->asoc.primary_destination, + SCTP_ADDR_NOT_LOCKED); +#endif + } + } + /* Toss the cookie if I can */ + sctp_toss_old_cookies(stcb, asoc); + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + /* Restart the timer if we have pending data */ + struct sctp_tmit_chunk *chk; + + chk = TAILQ_FIRST(&asoc->sent_queue); + if (chk) { + sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, chk->whoTo); + } + } +} + +static void +sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp, + struct sctp_tcb *stcb) +{ + struct sctp_nets *net; + struct sctp_tmit_chunk *lchk; + uint32_t tsn; + + if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_ecne_chunk)) { + return; + } + SCTP_STAT_INCR(sctps_recvecne); + tsn = ntohl(cp->tsn); + /* ECN Nonce stuff: need a resync and disable the nonce sum check */ + /* Also we make sure we disable the nonce_wait */ + lchk = TAILQ_FIRST(&stcb->asoc.send_queue); + if (lchk == NULL) { + stcb->asoc.nonce_resync_tsn = stcb->asoc.sending_seq; + } else { + stcb->asoc.nonce_resync_tsn = lchk->rec.data.TSN_seq; + } + stcb->asoc.nonce_wait_for_ecne = 0; + stcb->asoc.nonce_sum_check = 0; + + /* Find where it was sent, if possible */ + net = NULL; + lchk = TAILQ_FIRST(&stcb->asoc.sent_queue); + while (lchk) { + if (lchk->rec.data.TSN_seq == tsn) { + net = lchk->whoTo; + break; + } + if (compare_with_wrap(lchk->rec.data.TSN_seq, tsn, MAX_SEQ)) + break; + lchk = TAILQ_NEXT(lchk, sctp_next); + } + if (net == NULL) + /* default is we use the primary */ + net = stcb->asoc.primary_destination; + + if (compare_with_wrap(tsn, stcb->asoc.last_cwr_tsn, MAX_TSN)) { + /* + * JRS - Use the congestion control given in the pluggable + * CC module + */ + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net); + /* + * we reduce once every RTT. So we will only lower cwnd at + * the next sending seq i.e. the resync_tsn. + */ + stcb->asoc.last_cwr_tsn = stcb->asoc.nonce_resync_tsn; + } + /* + * We always send a CWR this way: if our previous one was lost, our + * peer will get an update, or if it is not time again to reduce we + * still get the cwr to the peer. + */ + sctp_send_cwr(stcb, net, tsn); +} + +static void +sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb) +{ + /* + * Here we get a CWR from the peer. We must look in the outqueue and + * make sure that we have a covered ECNE in the control chunk part. + * If so remove it. + */ + struct sctp_tmit_chunk *chk; + struct sctp_ecne_chunk *ecne; + + TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) { + continue; + } + /* + * Look for and remove if it is the right TSN. Since there + * is only ONE ECNE on the control queue at any one time we + * don't need to worry about more than one!
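+	 * For illustration, compare_with_wrap() is serial-number order: + * compare_with_wrap(a, b, MAX_TSN) is true when a is logically + * newer than b even across the 32-bit wrap, e.g. (hypothetical + * TSNs) + * + *   compare_with_wrap(0x00000001, 0xfffffffe, MAX_TSN) -> true + *   compare_with_wrap(5, 9, MAX_TSN)                   -> false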
+ */ + ecne = mtod(chk->data, struct sctp_ecne_chunk *); + if (compare_with_wrap(ntohl(cp->tsn), ntohl(ecne->tsn), + MAX_TSN) || (cp->tsn == ecne->tsn)) { + /* this covers this ECNE, we can remove it */ + stcb->asoc.ecn_echo_cnt_onq--; + TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, + sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + stcb->asoc.ctrl_queue_cnt--; + sctp_free_a_chunk(stcb, chk); + break; + } + } +} + +static void +sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sctp_association *asoc; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n"); + if (stcb == NULL) + return; + + asoc = &stcb->asoc; + /* process according to association state */ + if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT) { + /* unexpected SHUTDOWN-COMPLETE... so ignore... */ + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n"); + SCTP_TCB_UNLOCK(stcb); + return; + } + /* notify upper layer protocol */ + if (stcb->sctp_socket) { + sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + /* are the queues empty? they should be */ + if (!TAILQ_EMPTY(&asoc->send_queue) || + !TAILQ_EMPTY(&asoc->sent_queue) || + !TAILQ_EMPTY(&asoc->out_wheel)) { + sctp_report_all_outbound(stcb, 0, SCTP_SO_NOT_LOCKED); + } + } + /* stop the timer */ + sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); + SCTP_STAT_INCR_COUNTER32(sctps_shutdown); + /* free the TCB */ + SCTPDBG(SCTP_DEBUG_INPUT2, + "sctp_handle_shutdown_complete: calls free-asoc\n"); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + return; +} + +static int +process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, + struct sctp_nets *net, uint8_t flg) +{ + switch (desc->chunk_type) { + case SCTP_DATA: + /* find the tsn to resend (possibly */ + { + uint32_t tsn; + struct sctp_tmit_chunk *tp1; + + tsn = ntohl(desc->tsn_ifany); + tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); + while (tp1) { + if (tp1->rec.data.TSN_seq == tsn) { + /* found it */ + break; + } + if (compare_with_wrap(tp1->rec.data.TSN_seq, tsn, + MAX_TSN)) { + /* not found */ + tp1 = NULL; + break; + } + tp1 = TAILQ_NEXT(tp1, sctp_next); + } + if (tp1 == NULL) { + /* + * Do it the other way , aka without paying + * attention to queue seq order. 
+ */ + SCTP_STAT_INCR(sctps_pdrpdnfnd); + tp1 = TAILQ_FIRST(&stcb->asoc.sent_queue); + while (tp1) { + if (tp1->rec.data.TSN_seq == tsn) { + /* found it */ + break; + } + tp1 = TAILQ_NEXT(tp1, sctp_next); + } + } + if (tp1 == NULL) { + SCTP_STAT_INCR(sctps_pdrptsnnf); + } + if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) { + uint8_t *ddp; + + if (((flg & SCTP_BADCRC) == 0) && + ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { + return (0); + } + if ((stcb->asoc.peers_rwnd == 0) && + ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { + SCTP_STAT_INCR(sctps_pdrpdiwnp); + return (0); + } + if (stcb->asoc.peers_rwnd == 0 && + (flg & SCTP_FROM_MIDDLE_BOX)) { + SCTP_STAT_INCR(sctps_pdrpdizrw); + return (0); + } + ddp = (uint8_t *) (mtod(tp1->data, caddr_t)+ + sizeof(struct sctp_data_chunk)); + { + unsigned int iii; + + for (iii = 0; iii < sizeof(desc->data_bytes); + iii++) { + if (ddp[iii] != desc->data_bytes[iii]) { + SCTP_STAT_INCR(sctps_pdrpbadd); + return (-1); + } + } + } + /* + * We zero out the nonce so resync not + * needed + */ + tp1->rec.data.ect_nonce = 0; + + if (tp1->do_rtt) { + /* + * this guy had a RTO calculation + * pending on it, cancel it + */ + tp1->do_rtt = 0; + } + SCTP_STAT_INCR(sctps_pdrpmark); + if (tp1->sent != SCTP_DATAGRAM_RESEND) + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + /* + * mark it as if we were doing a FR, since + * we will be getting gap ack reports behind + * the info from the router. + */ + tp1->rec.data.doing_fast_retransmit = 1; + /* + * mark the tsn with what sequences can + * cause a new FR. + */ + if (TAILQ_EMPTY(&stcb->asoc.send_queue)) { + tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq; + } else { + tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq; + } + + /* restart the timer */ + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, tp1->whoTo); + + /* fix counts and things */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP, + tp1->whoTo->flight_size, + tp1->book_size, + (uintptr_t) stcb, + tp1->rec.data.TSN_seq); + } + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + sctp_flight_size_decrease(tp1); + sctp_total_flight_decrease(stcb, tp1); + } + tp1->sent = SCTP_DATAGRAM_RESEND; + } { + /* audit code */ + unsigned int audit; + + audit = 0; + TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { + if (tp1->sent == SCTP_DATAGRAM_RESEND) + audit++; + } + TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue, + sctp_next) { + if (tp1->sent == SCTP_DATAGRAM_RESEND) + audit++; + } + if (audit != stcb->asoc.sent_queue_retran_cnt) { + SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n", + audit, stcb->asoc.sent_queue_retran_cnt); +#ifndef SCTP_AUDITING_ENABLED + stcb->asoc.sent_queue_retran_cnt = audit; +#endif + } + } + } + break; + case SCTP_ASCONF: + { + struct sctp_tmit_chunk *asconf; + + TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue, + sctp_next) { + if (asconf->rec.chunk_id.id == SCTP_ASCONF) { + break; + } + } + if (asconf) { + if (asconf->sent != SCTP_DATAGRAM_RESEND) + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + asconf->sent = SCTP_DATAGRAM_RESEND; + asconf->snd_count--; + } + } + break; + case SCTP_INITIATION: + /* resend the INIT */ + stcb->asoc.dropped_special_cnt++; + if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) { + /* + * If we can get it in, in a few attempts we do + * this, otherwise we 
let the timer fire. + */ + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, + stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); + sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); + } + break; + case SCTP_SELECTIVE_ACK: + case SCTP_NR_SELECTIVE_ACK: + /* resend the sack */ + sctp_send_sack(stcb); + break; + case SCTP_HEARTBEAT_REQUEST: + /* resend a demand HB */ + if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) { + /* + * Only retransmit if we KNOW we wont destroy the + * tcb + */ + (void)sctp_send_hb(stcb, 1, net); + } + break; + case SCTP_SHUTDOWN: + sctp_send_shutdown(stcb, net); + break; + case SCTP_SHUTDOWN_ACK: + sctp_send_shutdown_ack(stcb, net); + break; + case SCTP_COOKIE_ECHO: + { + struct sctp_tmit_chunk *cookie; + + cookie = NULL; + TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, + sctp_next) { + if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) { + break; + } + } + if (cookie) { + if (cookie->sent != SCTP_DATAGRAM_RESEND) + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + cookie->sent = SCTP_DATAGRAM_RESEND; + sctp_stop_all_cookie_timers(stcb); + } + } + break; + case SCTP_COOKIE_ACK: + sctp_send_cookie_ack(stcb); + break; + case SCTP_ASCONF_ACK: + /* resend last asconf ack */ + sctp_send_asconf_ack(stcb); + break; + case SCTP_FORWARD_CUM_TSN: + send_forward_tsn(stcb, &stcb->asoc); + break; + /* can't do anything with these */ + case SCTP_PACKET_DROPPED: + case SCTP_INITIATION_ACK: /* this should not happen */ + case SCTP_HEARTBEAT_ACK: + case SCTP_ABORT_ASSOCIATION: + case SCTP_OPERATION_ERROR: + case SCTP_SHUTDOWN_COMPLETE: + case SCTP_ECN_ECHO: + case SCTP_ECN_CWR: + default: + break; + } + return (0); +} + +void +sctp_reset_in_stream(struct sctp_tcb *stcb, int number_entries, uint16_t * list) +{ + int i; + uint16_t temp; + + /* + * We set things to 0xffff since this is the last delivered sequence + * and we will be sending in 0 after the reset. 
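+	 * For illustration: stream sequence numbers are 16 bits, so from + * 0xffff the next value wraps to 0, + * + *   strmin[n].last_sequence_delivered = 0xffff; + *   next expected SSN = (uint16_t)(0xffff + 1) == 0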
+ */ + + if (number_entries) { + for (i = 0; i < number_entries; i++) { + temp = ntohs(list[i]); + if (temp >= stcb->asoc.streamincnt) { + continue; + } + stcb->asoc.strmin[temp].last_sequence_delivered = 0xffff; + } + } else { + list = NULL; + for (i = 0; i < stcb->asoc.streamincnt; i++) { + stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; + } + } + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); +} + +static void +sctp_reset_out_streams(struct sctp_tcb *stcb, int number_entries, uint16_t * list) +{ + int i; + + if (number_entries == 0) { + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + stcb->asoc.strmout[i].next_sequence_sent = 0; + } + } else if (number_entries) { + for (i = 0; i < number_entries; i++) { + uint16_t temp; + + temp = ntohs(list[i]); + if (temp >= stcb->asoc.streamoutcnt) { + /* no such stream */ + continue; + } + stcb->asoc.strmout[temp].next_sequence_sent = 0; + } + } + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); +} + + +struct sctp_stream_reset_out_request * +sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk) +{ + struct sctp_association *asoc; + struct sctp_stream_reset_out_req *req; + struct sctp_stream_reset_out_request *r; + struct sctp_tmit_chunk *chk; + int len, clen; + + asoc = &stcb->asoc; + if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { + asoc->stream_reset_outstanding = 0; + return (NULL); + } + if (stcb->asoc.str_reset == NULL) { + asoc->stream_reset_outstanding = 0; + return (NULL); + } + chk = stcb->asoc.str_reset; + if (chk->data == NULL) { + return (NULL); + } + if (bchk) { + /* he wants a copy of the chk pointer */ + *bchk = chk; + } + clen = chk->send_size; + req = mtod(chk->data, struct sctp_stream_reset_out_req *); + r = &req->sr_req; + if (ntohl(r->request_seq) == seq) { + /* found it */ + return (r); + } + len = SCTP_SIZE32(ntohs(r->ph.param_length)); + if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) { + /* move to the next one, there can only be a max of two */ + r = (struct sctp_stream_reset_out_request *)((caddr_t)r + len); + if (ntohl(r->request_seq) == seq) { + return (r); + } + } + /* that seq is not here */ + return (NULL); +} + +static void +sctp_clean_up_stream_reset(struct sctp_tcb *stcb) +{ + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk = stcb->asoc.str_reset; + + if (stcb->asoc.str_reset == NULL) { + return; + } + asoc = &stcb->asoc; + + sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); + TAILQ_REMOVE(&asoc->control_send_queue, + chk, + sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + asoc->ctrl_queue_cnt--; + sctp_free_a_chunk(stcb, chk); + /* sa_ignore NO_NULL_CHK */ + stcb->asoc.str_reset = NULL; +} + + +static int +sctp_handle_stream_reset_response(struct sctp_tcb *stcb, + uint32_t seq, uint32_t action, + struct sctp_stream_reset_response *respin) +{ + uint16_t type; + int lparm_len; + struct sctp_association *asoc = &stcb->asoc; + struct sctp_tmit_chunk *chk; + struct sctp_stream_reset_out_request *srparam; + int number_entries; + + if (asoc->stream_reset_outstanding == 0) { + /* duplicate */ + return (0); + } + if (seq == stcb->asoc.str_reset_seq_out) { + srparam = sctp_find_stream_reset(stcb, seq, &chk); + if (srparam) { + stcb->asoc.str_reset_seq_out++; + type = ntohs(srparam->ph.param_type); + lparm_len = ntohs(srparam->ph.param_length); + if (type == 
SCTP_STR_RESET_OUT_REQUEST) { + number_entries = (lparm_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t); + asoc->stream_reset_out_is_outstanding = 0; + if (asoc->stream_reset_outstanding) + asoc->stream_reset_outstanding--; + if (action == SCTP_STREAM_RESET_PERFORMED) { + /* do it */ + sctp_reset_out_streams(stcb, number_entries, srparam->list_of_streams); + } else { + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + } + } else if (type == SCTP_STR_RESET_IN_REQUEST) { + /* Answered my request */ + number_entries = (lparm_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t); + if (asoc->stream_reset_outstanding) + asoc->stream_reset_outstanding--; + if (action != SCTP_STREAM_RESET_PERFORMED) { + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, number_entries, srparam->list_of_streams, SCTP_SO_NOT_LOCKED); + } + } else if (type == SCTP_STR_RESET_ADD_STREAMS) { + /* Ok we now may have more streams */ + if (asoc->stream_reset_outstanding) + asoc->stream_reset_outstanding--; + if (action == SCTP_STREAM_RESET_PERFORMED) { + /* Put the new streams into effect */ + stcb->asoc.streamoutcnt = stcb->asoc.strm_realoutsize; + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_ADD_OK, stcb, + (uint32_t) stcb->asoc.streamoutcnt, NULL, SCTP_SO_NOT_LOCKED); + } else { + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_ADD_FAIL, stcb, + (uint32_t) stcb->asoc.streamoutcnt, NULL, SCTP_SO_NOT_LOCKED); + } + } else if (type == SCTP_STR_RESET_TSN_REQUEST) { + /** + * a) Adopt the new in tsn. + * b) reset the map + * c) Adopt the new out-tsn + */ + struct sctp_stream_reset_response_tsn *resp; + struct sctp_forward_tsn_chunk fwdtsn; + int abort_flag = 0; + + if (respin == NULL) { + /* huh ? 
*/ + return (0); + } + if (action == SCTP_STREAM_RESET_PERFORMED) { + resp = (struct sctp_stream_reset_response_tsn *)respin; + asoc->stream_reset_outstanding--; + fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); + fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; + fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1); + sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); + if (abort_flag) { + return (1); + } + stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); + } + stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; + stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn); + memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); + + stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map; + memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); + + stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn); + stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn; + + sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL); + sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL); + + } + } + /* get rid of the request and get the request flags */ + if (asoc->stream_reset_outstanding == 0) { + sctp_clean_up_stream_reset(stcb); + } + } + } + return (0); +} + +static void +sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, + struct sctp_tmit_chunk *chk, + struct sctp_stream_reset_in_request *req, int trunc) +{ + uint32_t seq; + int len, i; + int number_entries; + uint16_t temp; + + /* + * peer wants me to send a str-reset to him for my outgoing seq's if + * seq_in is right. 
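+	 * For illustration (hypothetical values), with str_reset_seq_in + * == 7: + * + *   seq == 7 -> process the request + *   seq == 6 -> re-send cached last_reset_action[0] + *   seq == 5 -> re-send cached last_reset_action[1] + *   other    -> answer SCTP_STREAM_RESET_BAD_SEQNO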
+ */ + struct sctp_association *asoc = &stcb->asoc; + + seq = ntohl(req->request_seq); + if (asoc->str_reset_seq_in == seq) { + if (trunc) { + /* Can't do it, since they exceeded our buffer size */ + asoc->last_reset_action[1] = asoc->last_reset_action[0]; + asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED; + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); + } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) { + len = ntohs(req->ph.param_length); + number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t)); + for (i = 0; i < number_entries; i++) { + temp = ntohs(req->list_of_streams[i]); + req->list_of_streams[i] = temp; + } + /* move the reset action back one */ + asoc->last_reset_action[1] = asoc->last_reset_action[0]; + asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED; + sctp_add_stream_reset_out(chk, number_entries, req->list_of_streams, + asoc->str_reset_seq_out, + seq, (asoc->sending_seq - 1)); + asoc->stream_reset_out_is_outstanding = 1; + asoc->str_reset = chk; + sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); + stcb->asoc.stream_reset_outstanding++; + } else { + /* Can't do it, since we have sent one out */ + asoc->last_reset_action[1] = asoc->last_reset_action[0]; + asoc->last_reset_action[0] = SCTP_STREAM_RESET_TRY_LATER; + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); + } + asoc->str_reset_seq_in++; + } else if (asoc->str_reset_seq_in - 1 == seq) { + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); + } else if (asoc->str_reset_seq_in - 2 == seq) { + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); + } else { + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO); + } +} + +static int +sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb, + struct sctp_tmit_chunk *chk, + struct sctp_stream_reset_tsn_request *req) +{ + /* reset all in and out and update the tsn */ + /* + * A) reset my str-seq's on in and out. B) Select a receive next, + * and set cum-ack to it. Also process this selected number as a + * fwd-tsn as well. C) set in the response my next sending seq. 
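+	 * In sketch form (same fields as below, asoc prefixes dropped): + * the receive side is resynced by faking a FORWARD-TSN that jumps + * the cumulative TSN, after which the mapping array is rebased: + * + *   fwdtsn.new_cumulative_tsn = htonl(highest_tsn_inside_map + 1); + *   sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); + *   mapping_array_base_tsn = highest_tsn_inside_map + 1;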
+ */ + struct sctp_forward_tsn_chunk fwdtsn; + struct sctp_association *asoc = &stcb->asoc; + int abort_flag = 0; + uint32_t seq; + + seq = ntohl(req->request_seq); + if (asoc->str_reset_seq_in == seq) { + fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); + fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; + fwdtsn.ch.chunk_flags = 0; + fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1); + sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); + if (abort_flag) { + return (1); + } + stcb->asoc.highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { + sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); + } + stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; + stcb->asoc.mapping_array_base_tsn = stcb->asoc.highest_tsn_inside_map + 1; + memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); + stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map; + memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); + atomic_add_int(&stcb->asoc.sending_seq, 1); + /* save off historical data for retrans */ + stcb->asoc.last_sending_seq[1] = stcb->asoc.last_sending_seq[0]; + stcb->asoc.last_sending_seq[0] = stcb->asoc.sending_seq; + stcb->asoc.last_base_tsnsent[1] = stcb->asoc.last_base_tsnsent[0]; + stcb->asoc.last_base_tsnsent[0] = stcb->asoc.mapping_array_base_tsn; + + sctp_add_stream_reset_result_tsn(chk, + ntohl(req->request_seq), + SCTP_STREAM_RESET_PERFORMED, + stcb->asoc.sending_seq, + stcb->asoc.mapping_array_base_tsn); + sctp_reset_out_streams(stcb, 0, (uint16_t *) NULL); + sctp_reset_in_stream(stcb, 0, (uint16_t *) NULL); + stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; + stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED; + + asoc->str_reset_seq_in++; + } else if (asoc->str_reset_seq_in - 1 == seq) { + sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], + stcb->asoc.last_sending_seq[0], + stcb->asoc.last_base_tsnsent[0] + ); + } else if (asoc->str_reset_seq_in - 2 == seq) { + sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1], + stcb->asoc.last_sending_seq[1], + stcb->asoc.last_base_tsnsent[1] + ); + } else { + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO); + } + return (0); +} + +static void +sctp_handle_str_reset_request_out(struct sctp_tcb *stcb, + struct sctp_tmit_chunk *chk, + struct sctp_stream_reset_out_request *req, int trunc) +{ + uint32_t seq, tsn; + int number_entries, len; + struct sctp_association *asoc = &stcb->asoc; + + seq = ntohl(req->request_seq); + + /* now if its not a duplicate we process it */ + if (asoc->str_reset_seq_in == seq) { + len = ntohs(req->ph.param_length); + number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)); + /* + * the sender is resetting, handle the list issue.. we must + * a) verify if we can do the reset, if so no problem b) If + * we can't do the reset we must copy the request. c) queue + * it, and setup the data in processor to trigger it off + * when needed and dequeue all the queued data. 
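+	 * The "can we do it now" test is plain TSN order (sketch, same + * names as below); the reset applies only once everything up to + * send_reset_at_tsn has arrived: + * + *   if (tsn == asoc->cumulative_tsn || + *       compare_with_wrap(asoc->cumulative_tsn, tsn, MAX_TSN)) + *       reset the in-streams now; + *   else + *       queue a sctp_stream_reset_list entry keyed on tsn;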
+ */ + tsn = ntohl(req->send_reset_at_tsn); + + /* move the reset action back one */ + asoc->last_reset_action[1] = asoc->last_reset_action[0]; + if (trunc) { + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED); + asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED; + } else if ((tsn == asoc->cumulative_tsn) || + (compare_with_wrap(asoc->cumulative_tsn, tsn, MAX_TSN))) { + /* we can do it now */ + sctp_reset_in_stream(stcb, number_entries, req->list_of_streams); + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED); + asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED; + } else { + /* + * we must queue it up and thus wait for the TSN's + * to arrive that are at or before tsn + */ + struct sctp_stream_reset_list *liste; + int siz; + + siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t)); + SCTP_MALLOC(liste, struct sctp_stream_reset_list *, + siz, SCTP_M_STRESET); + if (liste == NULL) { + /* gak out of memory */ + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED); + asoc->last_reset_action[0] = SCTP_STREAM_RESET_DENIED; + return; + } + liste->tsn = tsn; + liste->number_entries = number_entries; + memcpy(&liste->req, req, + (sizeof(struct sctp_stream_reset_out_request) + (number_entries * sizeof(uint16_t)))); + TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp); + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED); + asoc->last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED; + } + asoc->str_reset_seq_in++; + } else if ((asoc->str_reset_seq_in - 1) == seq) { + /* + * one seq back, just echo back last action since my + * response was lost. + */ + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); + } else if ((asoc->str_reset_seq_in - 2) == seq) { + /* + * two seq back, just echo back last action since my + * response was lost. + */ + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); + } else { + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO); + } +} + +static void +sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, + struct sctp_stream_reset_add_strm *str_add) +{ + /* + * Peer is requesting to add more streams. If its within our + * max-streams we will allow it. + */ + uint16_t num_stream, i; + uint32_t seq; + struct sctp_association *asoc = &stcb->asoc; + struct sctp_queued_to_read *ctl; + + /* Get the number. */ + seq = ntohl(str_add->request_seq); + num_stream = ntohs(str_add->number_of_streams); + /* Now what would be the new total? 
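+	 * For illustration (hypothetical counts): with streamincnt == 10 + * and a request for 5 more, num_stream becomes 15; against + * max_inbound_streams == 12 that is denied, against 16 the add + * proceeds.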
*/ + if (asoc->str_reset_seq_in == seq) { + num_stream += stcb->asoc.streamincnt; + if (num_stream > stcb->asoc.max_inbound_streams) { + /* We must reject it if they ask for too many */ + denied: + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_DENIED); + stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; + stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_DENIED; + } else { + /* Ok, we can do that :-) */ + struct sctp_stream_in *oldstrm; + + /* save off the old */ + oldstrm = stcb->asoc.strmin; + SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *, + (num_stream * sizeof(struct sctp_stream_in)), + SCTP_M_STRMI); + if (stcb->asoc.strmin == NULL) { + stcb->asoc.strmin = oldstrm; + goto denied; + } + /* copy off the old data */ + for (i = 0; i < stcb->asoc.streamincnt; i++) { + TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); + stcb->asoc.strmin[i].stream_no = i; + stcb->asoc.strmin[i].last_sequence_delivered = oldstrm[i].last_sequence_delivered; + stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started; + /* now anything on those queues? */ + while (TAILQ_EMPTY(&oldstrm[i].inqueue) == 0) { + ctl = TAILQ_FIRST(&oldstrm[i].inqueue); + TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next); + TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next); + } + } + /* Init the new streams */ + for (i = stcb->asoc.streamincnt; i < num_stream; i++) { + TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); + stcb->asoc.strmin[i].stream_no = i; + stcb->asoc.strmin[i].last_sequence_delivered = 0xffff; + stcb->asoc.strmin[i].delivery_started = 0; + } + SCTP_FREE(oldstrm, SCTP_M_STRMI); + /* update the size */ + stcb->asoc.streamincnt = num_stream; + /* Send the ack */ + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_PERFORMED); + stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; + stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_PERFORMED; + sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK, stcb, + (uint32_t) stcb->asoc.streamincnt, NULL, SCTP_SO_NOT_LOCKED); + } + } else if ((asoc->str_reset_seq_in - 1) == seq) { + /* + * one seq back, just echo back last action since my + * response was lost. + */ + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); + } else if ((asoc->str_reset_seq_in - 2) == seq) { + /* + * two seq back, just echo back last action since my + * response was lost.
+ */ + sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); + } else { + sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_BAD_SEQNO); + + } +} + +#ifdef __GNUC__ +__attribute__((noinline)) +#endif + static int + sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset, + struct sctp_stream_reset_out_req *sr_req) +{ + int chk_length, param_len, ptype; + struct sctp_paramhdr pstore; + uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE]; + + uint32_t seq; + int num_req = 0; + int trunc = 0; + struct sctp_tmit_chunk *chk; + struct sctp_chunkhdr *ch; + struct sctp_paramhdr *ph; + int ret_code = 0; + int num_param = 0; + + /* now it may be a reset or a reset-response */ + chk_length = ntohs(sr_req->ch.chunk_length); + + /* setup for adding the response */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + return (ret_code); + } + chk->rec.chunk_id.id = SCTP_STREAM_RESET; + chk->rec.chunk_id.can_take_data = 0; + chk->asoc = &stcb->asoc; + chk->no_fr_allowed = 0; + chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr); + chk->book_size_scale = 0; + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (chk->data == NULL) { +strres_nochunk: + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk); + return (ret_code); + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + + /* setup chunk parameters */ + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = stcb->asoc.primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + + ch = mtod(chk->data, struct sctp_chunkhdr *); + ch->chunk_type = SCTP_STREAM_RESET; + ch->chunk_flags = 0; + ch->chunk_length = htons(chk->send_size); + SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); + offset += sizeof(struct sctp_chunkhdr); + while ((size_t)chk_length >= sizeof(struct sctp_stream_reset_tsn_request)) { + ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *) & pstore); + if (ph == NULL) + break; + param_len = ntohs(ph->param_length); + if (param_len < (int)sizeof(struct sctp_stream_reset_tsn_request)) { + /* bad param */ + break; + } + ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, (int)sizeof(cstore)), + (uint8_t *) & cstore); + ptype = ntohs(ph->param_type); + num_param++; + if (param_len > (int)sizeof(cstore)) { + trunc = 1; + } else { + trunc = 0; + } + + if (num_param > SCTP_MAX_RESET_PARAMS) { + /* hit the max of parameters already sorry.. 
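+	 * Each iteration advances by the 32-bit padded parameter length + * (sketch, same arithmetic as this loop), e.g. param_len == 6 + * advances by SCTP_SIZE32(6) == 8: + * + *   offset += SCTP_SIZE32(param_len); + *   chk_length -= SCTP_SIZE32(param_len);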
*/ + break; + } + if (ptype == SCTP_STR_RESET_OUT_REQUEST) { + struct sctp_stream_reset_out_request *req_out; + + req_out = (struct sctp_stream_reset_out_request *)ph; + num_req++; + if (stcb->asoc.stream_reset_outstanding) { + seq = ntohl(req_out->response_seq); + if (seq == stcb->asoc.str_reset_seq_out) { + /* implicit ack */ + (void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_PERFORMED, NULL); + } + } + sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc); + } else if (ptype == SCTP_STR_RESET_ADD_STREAMS) { + struct sctp_stream_reset_add_strm *str_add; + + str_add = (struct sctp_stream_reset_add_strm *)ph; + num_req++; + sctp_handle_str_reset_add_strm(stcb, chk, str_add); + } else if (ptype == SCTP_STR_RESET_IN_REQUEST) { + struct sctp_stream_reset_in_request *req_in; + + num_req++; + + req_in = (struct sctp_stream_reset_in_request *)ph; + + sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc); + } else if (ptype == SCTP_STR_RESET_TSN_REQUEST) { + struct sctp_stream_reset_tsn_request *req_tsn; + + num_req++; + req_tsn = (struct sctp_stream_reset_tsn_request *)ph; + + if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) { + ret_code = 1; + goto strres_nochunk; + } + /* no more */ + break; + } else if (ptype == SCTP_STR_RESET_RESPONSE) { + struct sctp_stream_reset_response *resp; + uint32_t result; + + resp = (struct sctp_stream_reset_response *)ph; + seq = ntohl(resp->response_seq); + result = ntohl(resp->result); + if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) { + ret_code = 1; + goto strres_nochunk; + } + } else { + break; + } + offset += SCTP_SIZE32(param_len); + chk_length -= SCTP_SIZE32(param_len); + } + if (num_req == 0) { + /* we have no response, free the stuff */ + goto strres_nochunk; + } + /* ok we have a chunk to link in */ + TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, + chk, + sctp_next); + stcb->asoc.ctrl_queue_cnt++; + return (ret_code); +} + +/* + * Handle a router or endpoint's report of a packet loss; there are two ways + * to handle this: either we get the whole packet and must dissect it + * ourselves (possibly with truncation and/or corruption) or it is a summary + * from a middle box that did the dissecting for us. + */ +static void +sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, + struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit) +{ + uint32_t bottle_bw, on_queue; + uint16_t trunc_len; + unsigned int chlen; + unsigned int at; + struct sctp_chunk_desc desc; + struct sctp_chunkhdr *ch; + + chlen = ntohs(cp->ch.chunk_length); + chlen -= sizeof(struct sctp_pktdrop_chunk); + /* XXX possible chlen underflow */ + if (chlen == 0) { + ch = NULL; + if (cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) + SCTP_STAT_INCR(sctps_pdrpbwrpt); + } else { + ch = (struct sctp_chunkhdr *)(cp->data + sizeof(struct sctphdr)); + chlen -= sizeof(struct sctphdr); + /* XXX possible chlen underflow */ + memset(&desc, 0, sizeof(desc)); + } + trunc_len = (uint16_t) ntohs(cp->trunc_len); + if (trunc_len > limit) { + trunc_len = limit; + } + /* now the chunks themselves */ + while ((ch != NULL) && (chlen >= sizeof(struct sctp_chunkhdr))) { + desc.chunk_type = ch->chunk_type; + /* get amount we need to move */ + at = ntohs(ch->chunk_length); + if (at < sizeof(struct sctp_chunkhdr)) { + /* corrupt chunk, maybe at the end?
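+	 * For illustration of the checks below: trunc_len == 0 means the + * whole dropped packet was reflected, so an embedded chunk_length + * larger than the remaining bytes (at > chlen) can only be + * corruption; a nonzero trunc_len means only the first trunc_len + * bytes came back and short trailing chunks are expected.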
*/ + SCTP_STAT_INCR(sctps_pdrpcrupt); + break; + } + if (trunc_len == 0) { + /* we are supposed to have all of it */ + if (at > chlen) { + /* corrupt, skip it */ + SCTP_STAT_INCR(sctps_pdrpcrupt); + break; + } + } else { + /* is there enough of it left? */ + if (desc.chunk_type == SCTP_DATA) { + if (chlen < (sizeof(struct sctp_data_chunk) + + sizeof(desc.data_bytes))) { + break; + } + } else { + if (chlen < sizeof(struct sctp_chunkhdr)) { + break; + } + } + } + if (desc.chunk_type == SCTP_DATA) { + /* can we get out the tsn? */ + if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) + SCTP_STAT_INCR(sctps_pdrpmbda); + + if (chlen >= (sizeof(struct sctp_data_chunk) + sizeof(uint32_t))) { + /* yep */ + struct sctp_data_chunk *dcp; + uint8_t *ddp; + unsigned int iii; + + dcp = (struct sctp_data_chunk *)ch; + ddp = (uint8_t *) (dcp + 1); + for (iii = 0; iii < sizeof(desc.data_bytes); iii++) { + desc.data_bytes[iii] = ddp[iii]; + } + desc.tsn_ifany = dcp->dp.tsn; + } else { + /* nope we are done. */ + SCTP_STAT_INCR(sctps_pdrpnedat); + break; + } + } else { + if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX)) + SCTP_STAT_INCR(sctps_pdrpmbct); + } + + if (process_chunk_drop(stcb, &desc, net, cp->ch.chunk_flags)) { + SCTP_STAT_INCR(sctps_pdrppdbrk); + break; + } + if (SCTP_SIZE32(at) > chlen) { + break; + } + chlen -= SCTP_SIZE32(at); + if (chlen < sizeof(struct sctp_chunkhdr)) { + /* done, none left */ + break; + } + ch = (struct sctp_chunkhdr *)((caddr_t)ch + SCTP_SIZE32(at)); + } + /* Now update any rwnd --- possibly */ + if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) == 0) { + /* From a peer, we get a rwnd report */ + uint32_t a_rwnd; + + SCTP_STAT_INCR(sctps_pdrpfehos); + + bottle_bw = ntohl(cp->bottle_bw); + on_queue = ntohl(cp->current_onq); + if (bottle_bw && on_queue) { + /* a rwnd report is in here */ + if (bottle_bw > on_queue) + a_rwnd = bottle_bw - on_queue; + else + a_rwnd = 0; + + if (a_rwnd == 0) + stcb->asoc.peers_rwnd = 0; + else { + if (a_rwnd > stcb->asoc.total_flight) { + stcb->asoc.peers_rwnd = + a_rwnd - stcb->asoc.total_flight; + } else { + stcb->asoc.peers_rwnd = 0; + } + if (stcb->asoc.peers_rwnd < + stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + stcb->asoc.peers_rwnd = 0; + } + } + } + } else { + SCTP_STAT_INCR(sctps_pdrpfmbox); + } + + /* now middle boxes in sat networks get a cwnd bump */ + if ((cp->ch.chunk_flags & SCTP_FROM_MIDDLE_BOX) && + (stcb->asoc.sat_t3_loss_recovery == 0) && + (stcb->asoc.sat_network)) { + /* + * This is debatable, but for sat networks it makes sense. + * Note that if a T3 timer has gone off, we will prohibit + * any changes to cwnd until we exit the t3 loss recovery. + */ + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb, + net, cp, &bottle_bw, &on_queue); + } +} + +/* + * Handles all control chunks in a packet. Inputs: - m: mbuf chain, assumed + * to still contain the IP/SCTP header - stcb: the tcb found for this packet + * - offset: offset into the mbuf chain to the first chunkhdr - length: the + * length of the complete packet. Outputs: - length: modified to the + * remaining length after control processing - netp: modified to the new + * sctp_nets after cookie-echo processing - return NULL to discard the + * packet (ie. no asoc, bad packet,...)
otherwise return the tcb for this packet + */ +#ifdef __GNUC__ +__attribute__((noinline)) +#endif + static struct sctp_tcb * + sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, + struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen, + uint32_t vrf_id, uint16_t port) +{ + struct sctp_association *asoc; + uint32_t vtag_in; + int num_chunks = 0; /* number of control chunks processed */ + uint32_t chk_length; + int ret; + int abort_no_unlock = 0; + + /* + * How big should this be, and should it be alloc'd? Lets try the + * d-mtu-ceiling for now (2k) and that should hopefully work ... + * until we get into jumbo grams and such.. + */ + uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE]; + struct sctp_tcb *locked_tcb = stcb; + int got_auth = 0; + uint32_t auth_offset = 0, auth_len = 0; + int auth_skipped = 0; + int asconf_cnt = 0; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n", + iphlen, *offset, length, stcb); + + /* validate chunk header length... */ + if (ntohs(ch->chunk_length) < sizeof(*ch)) { + SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n", + ntohs(ch->chunk_length)); + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + /* + * validate the verification tag + */ + vtag_in = ntohl(sh->v_tag); + + if (locked_tcb) { + SCTP_TCB_LOCK_ASSERT(locked_tcb); + } + if (ch->chunk_type == SCTP_INITIATION) { + SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n", + ntohs(ch->chunk_length), vtag_in); + if (vtag_in != 0) { + /* protocol error- silently discard... */ + SCTP_STAT_INCR(sctps_badvtag); + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + } else if (ch->chunk_type != SCTP_COOKIE_ECHO) { + /* + * If there is no stcb, skip the AUTH chunk and process + * later after a stcb is found (to validate the lookup was + * valid. 
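+	 * In sketch form (same variables as below): the chunk's place is + * remembered, the lookup proceeds, and the saved AUTH is verified + * once an stcb exists: + * + *   auth_skipped = 1; + *   auth_offset = *offset; + *   auth_len = ntohs(ch->chunk_length); + *   ... locate the stcb (e.g. via ASCONF contents) ... + *   if (auth == NULL || sctp_handle_auth(stcb, auth, m, auth_offset)) + *       drop the packet;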
+ */ + if ((ch->chunk_type == SCTP_AUTHENTICATION) && + (stcb == NULL) && + !SCTP_BASE_SYSCTL(sctp_auth_disable)) { + /* save this chunk for later processing */ + auth_skipped = 1; + auth_offset = *offset; + auth_len = ntohs(ch->chunk_length); + + /* (temporarily) move past this chunk */ + *offset += SCTP_SIZE32(auth_len); + if (*offset >= length) { + /* no more data left in the mbuf chain */ + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_chunkhdr), chunk_buf); + } + if (ch == NULL) { + /* Help */ + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if (ch->chunk_type == SCTP_COOKIE_ECHO) { + goto process_control_chunks; + } + /* + * first check if it's an ASCONF with an unknown src addr we + * need to look inside to find the association + */ + if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) { + struct sctp_chunkhdr *asconf_ch = ch; + uint32_t asconf_offset = 0, asconf_len = 0; + + /* inp's refcount may be reduced */ + SCTP_INP_INCR_REF(inp); + + asconf_offset = *offset; + do { + asconf_len = ntohs(asconf_ch->chunk_length); + if (asconf_len < sizeof(struct sctp_asconf_paramhdr)) + break; + stcb = sctp_findassociation_ep_asconf(m, iphlen, + *offset, sh, &inp, netp, vrf_id); + if (stcb != NULL) + break; + asconf_offset += SCTP_SIZE32(asconf_len); + asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset, + sizeof(struct sctp_chunkhdr), chunk_buf); + } while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF); + if (stcb == NULL) { + /* + * reduce inp's refcount if not reduced in + * sctp_findassociation_ep_asconf(). + */ + SCTP_INP_DECR_REF(inp); + } else { + locked_tcb = stcb; + } + + /* now go back and verify any auth chunk to be sure */ + if (auth_skipped && (stcb != NULL)) { + struct sctp_auth_chunk *auth; + + auth = (struct sctp_auth_chunk *) + sctp_m_getptr(m, auth_offset, + auth_len, chunk_buf); + got_auth = 1; + auth_skipped = 0; + if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, + auth_offset)) { + /* auth HMAC failed so dump it */ + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } else { + /* remaining chunks are HMAC checked */ + stcb->asoc.authenticated = 1; + } + } + } + if (stcb == NULL) { + /* no association, so it's out of the blue... */ + sctp_handle_ootb(m, iphlen, *offset, sh, inp, NULL, + vrf_id, port); + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + asoc = &stcb->asoc; + /* ABORT and SHUTDOWN can use either v_tag... */ + if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) || + (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) || + (ch->chunk_type == SCTP_PACKET_DROPPED)) { + if ((vtag_in == asoc->my_vtag) || + ((ch->chunk_flags & SCTP_HAD_NO_TCB) && + (vtag_in == asoc->peer_vtag))) { + /* this is valid */ + } else { + /* drop this packet... 
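+	 * The rule being applied, schematically: ABORT, SHUTDOWN-COMPLETE + * and PACKET-DROPPED may carry either tag, + * + *   vtag_in == asoc->my_vtag                   -> accept + *   (ch->chunk_flags & SCTP_HAD_NO_TCB) && + *       vtag_in == asoc->peer_vtag             -> accept + *   otherwise                                  -> sctps_badvtag, discard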
*/ + SCTP_STAT_INCR(sctps_badvtag); + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { + if (vtag_in != asoc->my_vtag) { + /* + * this could be a stale SHUTDOWN-ACK or the + * peer never got the SHUTDOWN-COMPLETE and + * is still hung; we have started a new asoc + * but it won't complete until the shutdown + * is completed + */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + sctp_handle_ootb(m, iphlen, *offset, sh, inp, + NULL, vrf_id, port); + return (NULL); + } + } else { + /* for all other chunks, vtag must match */ + if (vtag_in != asoc->my_vtag) { + /* invalid vtag... */ + SCTPDBG(SCTP_DEBUG_INPUT3, + "invalid vtag: %xh, expect %xh\n", + vtag_in, asoc->my_vtag); + SCTP_STAT_INCR(sctps_badvtag); + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + } + } /* end if !SCTP_COOKIE_ECHO */ + /* + * process all control chunks... + */ + if (((ch->chunk_type == SCTP_SELECTIVE_ACK) || + /* EY */ + (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) || + (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) && + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED)) { + /* implied cookie-ack.. we must have lost the ack */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, + *netp); + } +process_control_chunks: + while (IS_SCTP_CONTROL(ch)) { + /* validate chunk length */ + chk_length = ntohs(ch->chunk_length); + SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n", + ch->chunk_type, chk_length); + SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length); + if (chk_length < sizeof(*ch) || + (*offset + (int)chk_length) > length) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks); + /* + * INIT-ACK only gets the init ack "header" portion only + * because we don't have to process the peer's COOKIE. All + * others get a complete chunk. + */ + if ((ch->chunk_type == SCTP_INITIATION_ACK) || + (ch->chunk_type == SCTP_INITIATION)) { + /* get an init-ack chunk */ + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_init_ack_chunk), chunk_buf); + if (ch == NULL) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + } else { + /* For cookies and all other chunks. */ + if (chk_length > sizeof(chunk_buf)) { + /* + * use just the size of the chunk buffer so + * the front part of our chunks fit in + * contiguous space up to the chunk buffer + * size (508 bytes). For chunks that need to + * get more than that they must use the + * sctp_m_getptr() function or other means + * (e.g. know how to parse mbuf chains). + * Cookies do this already. 
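+	 * For illustration: with the 508 byte chunk_buf, an oversized + * chunk is pulled in as at most sizeof(chunk_buf) - 4 == 504 + * contiguous bytes via sctp_m_getptr(), and a parser wanting the + * remainder must keep walking the mbuf chain at larger offsets + * itself.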
+ */ + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + (sizeof(chunk_buf) - 4), + chunk_buf); + if (ch == NULL) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + } else { + /* We can fit it all */ + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + chk_length, chunk_buf); + if (ch == NULL) { + SCTP_PRINTF("sctp_process_control: Can't get the all data....\n"); + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + } + } + num_chunks++; + /* Save off the last place we got a control from */ + if (stcb != NULL) { + if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) { + /* + * allow last_control to be NULL if + * ASCONF... ASCONF processing will find the + * right net later + */ + if ((netp != NULL) && (*netp != NULL)) + stcb->asoc.last_control_chunk_from = *netp; + } + } +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xB0, ch->chunk_type); +#endif + + /* check to see if this chunk required auth, but isn't */ + if ((stcb != NULL) && + !SCTP_BASE_SYSCTL(sctp_auth_disable) && + sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && + !stcb->asoc.authenticated) { + /* "silently" ignore */ + SCTP_STAT_INCR(sctps_recvauthmissing); + goto next_chunk; + } + switch (ch->chunk_type) { + case SCTP_INITIATION: + /* must be first and only chunk */ + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n"); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore? */ + if ((stcb) && (stcb->asoc.total_output_queue_size)) { + /* + * collision case where we are + * sending to them too + */ + ; + } else { + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + } + if ((chk_length > SCTP_LARGEST_INIT_ACCEPTED) || + (num_chunks > 1) || + (SCTP_BASE_SYSCTL(sctp_strict_init) && (length - *offset > (int)SCTP_SIZE32(chk_length)))) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if ((stcb != NULL) && + (SCTP_GET_STATE(&stcb->asoc) == + SCTP_STATE_SHUTDOWN_ACK_SENT)) { + sctp_send_shutdown_ack(stcb, + stcb->asoc.primary_destination); + *offset = length; + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if (netp) { + sctp_handle_init(m, iphlen, *offset, sh, + (struct sctp_init_chunk *)ch, inp, + stcb, *netp, &abort_no_unlock, vrf_id, port); + } + if (abort_no_unlock) + return (NULL); + + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + break; + case SCTP_PAD_CHUNK: + break; + case SCTP_INITIATION_ACK: + /* must be first and only chunk */ + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT-ACK\n"); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ + if ((stcb) && (stcb->asoc.total_output_queue_size)) { + ; + } else { + if (locked_tcb != stcb) { + /* Very unlikely */ + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + if (stcb) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + return 
(NULL); + } + } + if ((num_chunks > 1) || + (SCTP_BASE_SYSCTL(sctp_strict_init) && (length - *offset > (int)SCTP_SIZE32(chk_length)))) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if ((netp) && (*netp)) { + ret = sctp_handle_init_ack(m, iphlen, *offset, sh, + (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, vrf_id); + } else { + ret = -1; + } + /* + * Special case, I must call the output routine to + * get the cookie echoed + */ + if (abort_no_unlock) + return (NULL); + + if ((stcb) && ret == 0) + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + break; + case SCTP_SELECTIVE_ACK: + { + struct sctp_sack_chunk *sack; + int abort_now = 0; + uint32_t a_rwnd, cum_ack; + uint16_t num_seg, num_dup; + uint8_t flags; + int offset_seg, offset_dup; + int nonce_sum_flag; + + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK\n"); + SCTP_STAT_INCR(sctps_recvsacks); + if (stcb == NULL) { + SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing SACK chunk\n"); + break; + } + if (chk_length < sizeof(struct sctp_sack_chunk)) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n"); + break; + } + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { + /*- + * If we have sent a shutdown-ack, we will pay no + * attention to a sack sent in to us since + * we don't care anymore. + */ + break; + } + sack = (struct sctp_sack_chunk *)ch; + flags = ch->chunk_flags; + nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM; + cum_ack = ntohl(sack->sack.cum_tsn_ack); + num_seg = ntohs(sack->sack.num_gap_ack_blks); + num_dup = ntohs(sack->sack.num_dup_tsns); + a_rwnd = (uint32_t) ntohl(sack->sack.a_rwnd); + if (sizeof(struct sctp_sack_chunk) + + num_seg * sizeof(struct sctp_gap_ack_block) + + num_dup * sizeof(uint32_t) != chk_length) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n"); + break; + } + offset_seg = *offset + sizeof(struct sctp_sack_chunk); + offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", + cum_ack, num_seg, a_rwnd); + stcb->asoc.seen_a_sack_this_pkt = 1; + if ((stcb->asoc.pr_sctp_cnt == 0) && + (num_seg == 0) && + ((compare_with_wrap(cum_ack, stcb->asoc.last_acked_seq, MAX_TSN)) || + (cum_ack == stcb->asoc.last_acked_seq)) && + (stcb->asoc.saw_sack_with_frags == 0) && + (stcb->asoc.saw_sack_with_nr_frags == 0) && + (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) + ) { + /* + * We have a SIMPLE sack having no + * prior segments and data on sent + * queue to be acked.. Use the + * faster path sack processing. We + * also allow window update sacks + * with no missing segments to go + * this way too. 
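The SACK length test above is exact rather than a lower bound: a well-formed SACK is the 16-byte fixed part (chunk header, cumulative TSN ack, a_rwnd, and the two counts) followed by one 4-byte gap-ack block per num_seg and one 4-byte duplicate TSN per num_dup, and nothing else. Spelled out as arithmetic (the struct here is an illustrative stand-in for the sctp_header.h definition):

    #include <stddef.h>
    #include <stdint.h>

    struct gap_ack_block {         /* offsets relative to cum_tsn_ack */
            uint16_t start;
            uint16_t end;
    };

    /* Does the advertised chunk length match its contents exactly? */
    static int
    sack_length_ok(uint16_t chk_length, uint16_t num_seg, uint16_t num_dup)
    {
            size_t need = 16 +                          /* chunk hdr + fixed part */
                (size_t)num_seg * sizeof(struct gap_ack_block) +
                (size_t)num_dup * sizeof(uint32_t);     /* duplicate TSNs */

            return (need == chk_length);
    }

Requiring an exact match also guards against a hostile num_seg/num_dup pair that would otherwise walk gap blocks out of the received data.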
+ */ + sctp_express_handle_sack(stcb, cum_ack, a_rwnd, nonce_sum_flag, + &abort_now); + } else { + if (netp && *netp) + sctp_handle_sack(m, offset_seg, offset_dup, + stcb, *netp, + num_seg, 0, num_dup, &abort_now, flags, + cum_ack, a_rwnd); + } + if (abort_now) { + /* ABORT signal from sack processing */ + *offset = length; + return (NULL); + } + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + } + } + break; + /* + * EY - nr_sack: If the received chunk is an + * nr_sack chunk + */ + case SCTP_NR_SELECTIVE_ACK: + { + struct sctp_nr_sack_chunk *nr_sack; + int abort_now = 0; + uint32_t a_rwnd, cum_ack; + uint16_t num_seg, num_nr_seg, num_dup; + uint8_t flags; + int offset_seg, offset_dup; + int nonce_sum_flag; + + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK\n"); + SCTP_STAT_INCR(sctps_recvsacks); + if (stcb == NULL) { + SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing NR-SACK chunk\n"); + break; + } + if ((stcb->asoc.sctp_nr_sack_on_off == 0) || + (stcb->asoc.peer_supports_nr_sack == 0)) { + goto unknown_chunk; + } + if (chk_length < sizeof(struct sctp_nr_sack_chunk)) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR-SACK chunk, too small\n"); + break; + } + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) { + /*- + * If we have sent a shutdown-ack, we will pay no + * attention to a sack sent in to us since + * we don't care anymore. + */ + break; + } + nr_sack = (struct sctp_nr_sack_chunk *)ch; + flags = ch->chunk_flags; + nonce_sum_flag = flags & SCTP_SACK_NONCE_SUM; + + cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack); + num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks); + num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks); + num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns); + a_rwnd = (uint32_t) ntohl(nr_sack->nr_sack.a_rwnd); + if (sizeof(struct sctp_nr_sack_chunk) + + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) + + num_dup * sizeof(uint32_t) != chk_length) { + SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n"); + break; + } + offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk); + offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_NR_SACK process cum_ack:%x num_seg:%d a_rwnd:%d\n", + cum_ack, num_seg, a_rwnd); + stcb->asoc.seen_a_sack_this_pkt = 1; + if ((stcb->asoc.pr_sctp_cnt == 0) && + (num_seg == 0) && (num_nr_seg == 0) && + ((compare_with_wrap(cum_ack, stcb->asoc.last_acked_seq, MAX_TSN)) || + (cum_ack == stcb->asoc.last_acked_seq)) && + (stcb->asoc.saw_sack_with_frags == 0) && + (stcb->asoc.saw_sack_with_nr_frags == 0) && + (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) { + /* + * We have a SIMPLE sack having no + * prior segments and data on sent + * queue to be acked. Use the faster + * path sack processing. We also + * allow window update sacks with no + * missing segments to go this way + * too. 
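Both the SACK and NR-SACK paths order TSNs with compare_with_wrap() instead of a plain greater-than, because TSNs are 32-bit serial numbers that wrap around. The convention is the usual one: a is "after" b when the forward distance from b to a is less than half the number space. A self-contained equivalent (a sketch, not the upstream implementation):

    #include <stdint.h>

    /* Nonzero when TSN a is logically after TSN b, allowing for wraparound. */
    static int
    tsn_after(uint32_t a, uint32_t b)
    {
            return (a != b && (uint32_t)(a - b) < (1U << 31));
    }

With this, the fast-path guard above reads naturally: take the express route when no gap blocks are present and the cumulative ack equals or advances last_acked_seq, i.e. `tsn_after(cum_ack, last_acked_seq) || cum_ack == last_acked_seq`.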
+ */ + sctp_express_handle_sack(stcb, cum_ack, a_rwnd, nonce_sum_flag, + &abort_now); + } else { + if (netp && *netp) + sctp_handle_sack(m, offset_seg, offset_dup, + stcb, *netp, + num_seg, num_nr_seg, num_dup, &abort_now, flags, + cum_ack, a_rwnd); + } + if (abort_now) { + /* ABORT signal from sack processing */ + *offset = length; + return (NULL); + } + if (TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); + } + } + break; + + case SCTP_HEARTBEAT_REQUEST: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n"); + if ((stcb) && netp && *netp) { + SCTP_STAT_INCR(sctps_recvheartbeat); + sctp_send_heartbeat_ack(stcb, m, *offset, + chk_length, *netp); + + /* He's alive so give him credit */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + } + break; + case SCTP_HEARTBEAT_ACK: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT-ACK\n"); + if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { + /* Its not ours */ + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + /* He's alive so give him credit */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + SCTP_STAT_INCR(sctps_recvheartbeatack); + if (netp && *netp) + sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, + stcb, *netp); + break; + case SCTP_ABORT_ASSOCIATION: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n", + stcb); + if ((stcb) && netp && *netp) + sctp_handle_abort((struct sctp_abort_chunk *)ch, + stcb, *netp); + *offset = length; + return (NULL); + break; + case SCTP_SHUTDOWN: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", + stcb); + if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if (netp && *netp) { + int abort_flag = 0; + + sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, + stcb, *netp, &abort_flag); + if (abort_flag) { + *offset = length; + return (NULL); + } + } + break; + case SCTP_SHUTDOWN_ACK: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-ACK, stcb %p\n", stcb); + if ((stcb) && (netp) && (*netp)) + sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); + *offset = length; + return (NULL); + break; + + case SCTP_OPERATION_ERROR: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP-ERR\n"); + if ((stcb) && netp && *netp && sctp_handle_error(ch, stcb, *netp) < 0) { + + *offset = length; + return (NULL); + } + break; + case SCTP_COOKIE_ECHO: + SCTPDBG(SCTP_DEBUG_INPUT3, + "SCTP_COOKIE-ECHO, stcb %p\n", stcb); + if ((stcb) && (stcb->asoc.total_output_queue_size)) { + ; + } else { + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ + abend: + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + *offset = length; + return (NULL); + } + } + /* + * First are we accepting? We do this again here + * since it is possible that a previous endpoint WAS + * listening responded to a INIT-ACK and then + * closed. We opened and bound.. and are now no + * longer listening. 
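The acceptance test that follows is the classic listen-backlog check: with no TCB yet, a COOKIE-ECHO may set up an association only while the listening socket's accept queue still has room. In outline (a kernel-context sketch; so_qlen and so_qlimit are the standard socket fields consulted below):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/socketvar.h>

    struct sctp_tcb;               /* opaque here; only the pointer matters */

    /* Existing associations always pass; new ones need backlog room. */
    static int
    may_accept_cookie(const struct socket *so, const struct sctp_tcb *stcb)
    {
            return (stcb != NULL || so->so_qlen < so->so_qlimit);
    }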
+ */ + + if ((stcb == NULL) && (inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && + (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { + struct mbuf *oper; + struct sctp_paramhdr *phdr; + + oper = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), + 0, M_DONTWAIT, 1, MT_DATA); + if (oper) { + SCTP_BUF_LEN(oper) = + sizeof(struct sctp_paramhdr); + phdr = mtod(oper, + struct sctp_paramhdr *); + phdr->param_type = + htons(SCTP_CAUSE_OUT_OF_RESC); + phdr->param_length = + htons(sizeof(struct sctp_paramhdr)); + } + sctp_abort_association(inp, stcb, m, + iphlen, sh, oper, vrf_id, port); + } + *offset = length; + return (NULL); + } else { + struct mbuf *ret_buf; + struct sctp_inpcb *linp; + + if (stcb) { + linp = NULL; + } else { + linp = inp; + } + + if (linp) { + SCTP_ASOC_CREATE_LOCK(linp); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + SCTP_ASOC_CREATE_UNLOCK(linp); + goto abend; + } + } + if (netp) { + ret_buf = + sctp_handle_cookie_echo(m, iphlen, + *offset, sh, + (struct sctp_cookie_echo_chunk *)ch, + &inp, &stcb, netp, + auth_skipped, + auth_offset, + auth_len, + &locked_tcb, + vrf_id, + port); + } else { + ret_buf = NULL; + } + if (linp) { + SCTP_ASOC_CREATE_UNLOCK(linp); + } + if (ret_buf == NULL) { + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + SCTPDBG(SCTP_DEBUG_INPUT3, + "GAK, null buffer\n"); + auth_skipped = 0; + *offset = length; + return (NULL); + } + /* if AUTH skipped, see if it verified... */ + if (auth_skipped) { + got_auth = 1; + auth_skipped = 0; + } + if (!TAILQ_EMPTY(&stcb->asoc.sent_queue)) { + /* + * Restart the timer if we have + * pending data + */ + struct sctp_tmit_chunk *chk; + + chk = TAILQ_FIRST(&stcb->asoc.sent_queue); + if (chk) { + sctp_timer_start(SCTP_TIMER_TYPE_SEND, + stcb->sctp_ep, stcb, + chk->whoTo); + } + } + } + break; + case SCTP_COOKIE_ACK: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE-ACK, stcb %p\n", stcb); + if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ + if ((stcb) && (stcb->asoc.total_output_queue_size)) { + ; + } else if (stcb) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + *offset = length; + return (NULL); + } + } + /* He's alive so give him credit */ + if ((stcb) && netp && *netp) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); + } + break; + case SCTP_ECN_ECHO: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-ECHO\n"); + /* He's alive so give him credit */ + if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if (stcb) { + if 
(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, + stcb); + } + break; + case SCTP_ECN_CWR: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN-CWR\n"); + /* He's alive so give him credit */ + if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if (stcb) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb); + } + break; + case SCTP_SHUTDOWN_COMPLETE: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN-COMPLETE, stcb %p\n", stcb); + /* must be first and only chunk */ + if ((num_chunks > 1) || + (length - *offset > (int)SCTP_SIZE32(chk_length))) { + *offset = length; + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + return (NULL); + } + if ((stcb) && netp && *netp) { + sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch, + stcb, *netp); + } + *offset = length; + return (NULL); + break; + case SCTP_ASCONF: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); + /* He's alive so give him credit */ + if (stcb) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_asconf(m, *offset, + (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); + asconf_cnt++; + } + break; + case SCTP_ASCONF_ACK: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF-ACK\n"); + if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if ((stcb) && netp && *netp) { + /* He's alive so give him credit */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_asconf_ack(m, *offset, + (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); + if (abort_no_unlock) + return (NULL); + } + break; + case SCTP_FORWARD_CUM_TSN: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_FWD-TSN\n"); + if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + /* He's alive so give him credit */ + if (stcb) { + int abort_flag = 0; + + stcb->asoc.overall_error_count = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + *fwd_tsn_seen = 1; + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, 
SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + *offset = length; + return (NULL); + } + sctp_handle_forward_tsn(stcb, + (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); + if (abort_flag) { + *offset = length; + return (NULL); + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + } + + } + break; + case SCTP_STREAM_RESET: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); + if (((stcb == NULL) || (ch == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req)))) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* We are not interested anymore */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_30); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + *offset = length; + return (NULL); + } + if (stcb->asoc.peer_supports_strreset == 0) { + /* + * hmm, peer should have announced this, but + * we will turn it on since he is sending us + * a stream reset. + */ + stcb->asoc.peer_supports_strreset = 1; + } + if (sctp_handle_stream_reset(stcb, m, *offset, (struct sctp_stream_reset_out_req *)ch)) { + /* stop processing */ + *offset = length; + return (NULL); + } + break; + case SCTP_PACKET_DROPPED: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); + /* re-get it all please */ + if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if (ch && (stcb) && netp && (*netp)) { + sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, + stcb, *netp, + min(chk_length, (sizeof(chunk_buf) - 4))); + + } + break; + + case SCTP_AUTHENTICATION: + SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n"); + if (SCTP_BASE_SYSCTL(sctp_auth_disable)) + goto unknown_chunk; + + if (stcb == NULL) { + /* save the first AUTH for later processing */ + if (auth_skipped == 0) { + auth_offset = *offset; + auth_len = chk_length; + auth_skipped = 1; + } + /* skip this chunk (temporarily) */ + goto next_chunk; + } + if ((chk_length < (sizeof(struct sctp_auth_chunk))) || + (chk_length > (sizeof(struct sctp_auth_chunk) + + SCTP_AUTH_DIGEST_LEN_MAX))) { + /* Its not ours */ + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + if (got_auth == 1) { + /* skip this chunk... it's already auth'd */ + goto next_chunk; + } + got_auth = 1; + if ((ch == NULL) || sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, + m, *offset)) { + /* auth HMAC failed so dump the packet */ + *offset = length; + return (stcb); + } else { + /* remaining chunks are HMAC checked */ + stcb->asoc.authenticated = 1; + } + break; + + default: + unknown_chunk: + /* it's an unknown chunk! 
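For the default case reached here, the two high-order bits of the unknown type byte encode the required action (RFC 4960, section 3.2): bit 0x80 set means skip the chunk and continue, clear means stop processing the packet; bit 0x40 set additionally asks for an "unrecognized chunk" error report. Those are exactly the 0x40 and 0x80 tests applied in the handling that follows; as a table in code form:

    #include <stdint.h>

    enum unknown_action {
            STOP_NO_REPORT,        /* 00xxxxxx: discard rest of packet     */
            STOP_AND_REPORT,       /* 01xxxxxx: discard, send ERROR cause  */
            SKIP_NO_REPORT,        /* 10xxxxxx: skip chunk, keep going     */
            SKIP_AND_REPORT        /* 11xxxxxx: skip chunk, send ERROR     */
    };

    static enum unknown_action
    classify_unknown(uint8_t chunk_type)
    {
            int report = (chunk_type & 0x40) != 0;

            if (chunk_type & 0x80)
                    return (report ? SKIP_AND_REPORT : SKIP_NO_REPORT);
            return (report ? STOP_AND_REPORT : STOP_NO_REPORT);
    }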
*/ + if ((ch->chunk_type & 0x40) && (stcb != NULL)) { + struct mbuf *mm; + struct sctp_paramhdr *phd; + + mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), + 0, M_DONTWAIT, 1, MT_DATA); + if (mm) { + phd = mtod(mm, struct sctp_paramhdr *); + /* + * We cheat and use param type since + * we did not bother to define a + * error cause struct. They are the + * same basic format with different + * names. + */ + phd->param_type = htons(SCTP_CAUSE_UNRECOG_CHUNK); + phd->param_length = htons(chk_length + sizeof(*phd)); + SCTP_BUF_LEN(mm) = sizeof(*phd); + SCTP_BUF_NEXT(mm) = SCTP_M_COPYM(m, *offset, SCTP_SIZE32(chk_length), + M_DONTWAIT); + if (SCTP_BUF_NEXT(mm)) { +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = SCTP_BUF_NEXT(mm); + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + sctp_queue_op_err(stcb, mm); + } else { + sctp_m_freem(mm); + } + } + } + if ((ch->chunk_type & 0x80) == 0) { + /* discard this packet */ + *offset = length; + return (stcb); + } /* else skip this bad chunk and continue... */ + break; + } /* switch (ch->chunk_type) */ + + +next_chunk: + /* get the next chunk */ + *offset += SCTP_SIZE32(chk_length); + if (*offset >= length) { + /* no more data left in the mbuf chain */ + break; + } + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, + sizeof(struct sctp_chunkhdr), chunk_buf); + if (ch == NULL) { + if (locked_tcb) { + SCTP_TCB_UNLOCK(locked_tcb); + } + *offset = length; + return (NULL); + } + } /* while */ + + if (asconf_cnt > 0 && stcb != NULL) { + sctp_send_asconf_ack(stcb); + } + return (stcb); +} + + +/* + * Process the ECN bits we have something set so we must look to see if it is + * ECN(0) or ECN(1) or CE + */ +static void +sctp_process_ecn_marked_a(struct sctp_tcb *stcb, struct sctp_nets *net, + uint8_t ecn_bits) +{ + if ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS) { + ; + } else if ((ecn_bits & SCTP_ECT1_BIT) == SCTP_ECT1_BIT) { + /* + * we only add to the nonce sum for ECT1, ECT0 does not + * change the NS bit (that we have yet to find a way to send + * it yet). + */ + + /* ECN Nonce stuff */ + stcb->asoc.receiver_nonce_sum++; + stcb->asoc.receiver_nonce_sum &= SCTP_SACK_NONCE_SUM; + + /* + * Drag up the last_echo point if cumack is larger since we + * don't want the point falling way behind by more than + * 2^^31 and then having it be incorrect. + */ + if (compare_with_wrap(stcb->asoc.cumulative_tsn, + stcb->asoc.last_echo_tsn, MAX_TSN)) { + stcb->asoc.last_echo_tsn = stcb->asoc.cumulative_tsn; + } + } else if ((ecn_bits & SCTP_ECT0_BIT) == SCTP_ECT0_BIT) { + /* + * Drag up the last_echo point if cumack is larger since we + * don't want the point falling way behind by more than + * 2^^31 and then having it be incorrect. + */ + if (compare_with_wrap(stcb->asoc.cumulative_tsn, + stcb->asoc.last_echo_tsn, MAX_TSN)) { + stcb->asoc.last_echo_tsn = stcb->asoc.cumulative_tsn; + } + } +} + +static void +sctp_process_ecn_marked_b(struct sctp_tcb *stcb, struct sctp_nets *net, + uint32_t high_tsn, uint8_t ecn_bits) +{ + if ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS) { + /* + * we possibly must notify the sender that a congestion + * window reduction is in order. We do this by adding a ECNE + * chunk to the output chunk queue. The incoming CWR will + * remove this chunk. 
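These two helpers split ECN work into a pair of passes: the first classifies the mark on each arriving datagram, the second decides whether a congestion echo must be queued. The mark lives in the low two bits of the IP TOS byte (RFC 3168); assuming the SCTP_CE_BITS/SCTP_ECT0_BIT/SCTP_ECT1_BIT masks carry the RFC codepoint values, a compact restatement:

    #include <stdint.h>

    #define ECN_MASK   0x03        /* low two bits of the TOS byte           */
    #define ECN_ECT1   0x01        /* ECN-capable transport, codepoint 1     */
    #define ECN_ECT0   0x02        /* ECN-capable transport, codepoint 0     */
    #define ECN_CE     0x03        /* a router marked Congestion Experienced */

    /* Nonzero when the peer must be told to slow down (queue an ECN-Echo). */
    static int
    ecn_needs_echo(uint8_t tos)
    {
            return ((tos & ECN_MASK) == ECN_CE);
    }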
+ */ + if (compare_with_wrap(high_tsn, stcb->asoc.last_echo_tsn, + MAX_TSN)) { + /* Yep, we need to add a ECNE */ + sctp_send_ecn_echo(stcb, net, high_tsn); + stcb->asoc.last_echo_tsn = high_tsn; + } + } +} + +#ifdef INVARIANTS +#ifdef __GNUC__ +__attribute__((noinline)) +#endif + void + sctp_validate_no_locks(struct sctp_inpcb *inp) +{ + struct sctp_tcb *lstcb; + + LIST_FOREACH(lstcb, &inp->sctp_asoc_list, sctp_tcblist) { + if (mtx_owned(&lstcb->tcb_mtx)) { + panic("Own lock on stcb at return from input"); + } + } + if (mtx_owned(&inp->inp_create_mtx)) { + panic("Own create lock on inp"); + } + if (mtx_owned(&inp->inp_mtx)) { + panic("Own inp lock on inp"); + } +} + +#endif + +/* + * common input chunk processing (v4 and v6) + */ +void +sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, + int length, struct sctphdr *sh, struct sctp_chunkhdr *ch, + struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, + uint8_t ecn_bits, uint32_t vrf_id, uint16_t port) +{ + /* + * Control chunk processing + */ + uint32_t high_tsn; + int fwd_tsn_seen = 0, data_processed = 0; + struct mbuf *m = *mm; + int abort_flag = 0; + int un_sent; + + SCTP_STAT_INCR(sctps_recvdatagrams); +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xE0, 1); + sctp_auditing(0, inp, stcb, net); +#endif + + SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n", + m, iphlen, offset, length, stcb); + if (stcb) { + /* always clear this before beginning a packet */ + stcb->asoc.authenticated = 0; + stcb->asoc.seen_a_sack_this_pkt = 0; + SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n", + stcb, stcb->asoc.state); + + if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) || + (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { + /*- + * If we hit here, we had a ref count + * up when the assoc was aborted and the + * timer is clearing out the assoc, we should + * NOT respond to any packet.. its OOTB. + */ + SCTP_TCB_UNLOCK(stcb); + sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL, + vrf_id, port); + goto out_now; + } + } + if (IS_SCTP_CONTROL(ch)) { + /* process the control portion of the SCTP packet */ + /* sa_ignore NO_NULL_CHK */ + stcb = sctp_process_control(m, iphlen, &offset, length, sh, ch, + inp, stcb, &net, &fwd_tsn_seen, vrf_id, port); + if (stcb) { + /* + * This covers us if the cookie-echo was there and + * it changes our INP. + */ + inp = stcb->sctp_ep; + if ((net) && (port)) { + if (net->port == 0) { + sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr)); + } + net->port = port; + } + } + } else { + /* + * no control chunks, so pre-process DATA chunks (these + * checks are taken care of by control processing) + */ + + /* + * if DATA only packet, and auth is required, then punt... + * can't have authenticated without any AUTH (control) + * chunks + */ + if ((stcb != NULL) && + !SCTP_BASE_SYSCTL(sctp_auth_disable) && + sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { + /* "silently" ignore */ + SCTP_STAT_INCR(sctps_recvauthmissing); + SCTP_TCB_UNLOCK(stcb); + goto out_now; + } + if (stcb == NULL) { + /* out of the blue DATA chunk */ + sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL, + vrf_id, port); + goto out_now; + } + if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) { + /* v_tag mismatch! 
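The rule enforced at this point is RFC 4960, section 8.5: once an association exists, every inbound packet's common-header verification tag must equal the tag this endpoint assigned at setup, and data carrying any other tag is treated as not ours. As a predicate (a sketch; my_vtag is held in host order in the association, the wire tag in network order, and the INIT/SHUTDOWN special tag rules are screened earlier):

    #include <stdint.h>
    #include <arpa/inet.h>

    static int
    vtag_ok(uint32_t v_tag_wire, uint32_t my_vtag)
    {
            return (ntohl(v_tag_wire) == my_vtag);
    }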
*/ + SCTP_STAT_INCR(sctps_badvtag); + SCTP_TCB_UNLOCK(stcb); + goto out_now; + } + } + + if (stcb == NULL) { + /* + * no valid TCB for this packet, or we found it's a bad + * packet while processing control, or we're done with this + * packet (done or skip rest of data), so we drop it... + */ + goto out_now; + } + /* + * DATA chunk processing + */ + /* plow through the data chunks while length > offset */ + + /* + * Rest should be DATA only. Check authentication state if AUTH for + * DATA is required. + */ + if ((length > offset) && + (stcb != NULL) && + !SCTP_BASE_SYSCTL(sctp_auth_disable) && + sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && + !stcb->asoc.authenticated) { + /* "silently" ignore */ + SCTP_STAT_INCR(sctps_recvauthmissing); + SCTPDBG(SCTP_DEBUG_AUTH1, + "Data chunk requires AUTH, skipped\n"); + goto trigger_send; + } + if (length > offset) { + int retval; + + /* + * First check to make sure our state is correct. We would + * not get here unless we really did have a tag, so we don't + * abort if this happens, just dump the chunk silently. + */ + switch (SCTP_GET_STATE(&stcb->asoc)) { + case SCTP_STATE_COOKIE_ECHOED: + /* + * we consider data with valid tags in this state + * shows us the cookie-ack was lost. Imply it was + * there. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { + sctp_misc_ints(SCTP_THRESHOLD_CLEAR, + stcb->asoc.overall_error_count, + 0, + SCTP_FROM_SCTP_INPUT, + __LINE__); + } + stcb->asoc.overall_error_count = 0; + sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net); + break; + case SCTP_STATE_COOKIE_WAIT: + /* + * We consider OOTB any data sent during asoc setup. + */ + sctp_handle_ootb(m, iphlen, offset, sh, inp, NULL, + vrf_id, port); + SCTP_TCB_UNLOCK(stcb); + goto out_now; + /* sa_ignore NOTREACHED */ + break; + case SCTP_STATE_EMPTY: /* should not happen */ + case SCTP_STATE_INUSE: /* should not happen */ + case SCTP_STATE_SHUTDOWN_RECEIVED: /* This is a peer error */ + case SCTP_STATE_SHUTDOWN_ACK_SENT: + default: + SCTP_TCB_UNLOCK(stcb); + goto out_now; + /* sa_ignore NOTREACHED */ + break; + case SCTP_STATE_OPEN: + case SCTP_STATE_SHUTDOWN_SENT: + break; + } + /* take care of ECN, part 1. */ + if (stcb->asoc.ecn_allowed && + (ecn_bits & (SCTP_ECT0_BIT | SCTP_ECT1_BIT))) { + sctp_process_ecn_marked_a(stcb, net, ecn_bits); + } + /* plow through the data chunks while length > offset */ + retval = sctp_process_data(mm, iphlen, &offset, length, sh, + inp, stcb, net, &high_tsn); + if (retval == 2) { + /* + * The association aborted, NO UNLOCK needed since + * the association is destroyed. + */ + goto out_now; + } + data_processed = 1; + if (retval == 0) { + /* take care of ecn part 2. 
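The switch above is the association-state gate for inbound DATA: only OPEN (established) and SHUTDOWN-SENT process data normally; COOKIE-ECHOED first treats the data as an implied COOKIE-ACK and then proceeds; COOKIE-WAIT treats it as out of the blue; every other state drops the packet. Reduced to a predicate (state names abbreviated, illustrative):

    enum assoc_state {
            COOKIE_WAIT, COOKIE_ECHOED, OPEN, SHUTDOWN_SENT, OTHER
    };

    /* May DATA chunks be processed at all in this state? */
    static int
    data_allowed(enum assoc_state s)
    {
            return (s == OPEN || s == SHUTDOWN_SENT || s == COOKIE_ECHOED);
    }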
*/ + if (stcb->asoc.ecn_allowed && + (ecn_bits & (SCTP_ECT0_BIT | SCTP_ECT1_BIT))) { + sctp_process_ecn_marked_b(stcb, net, high_tsn, + ecn_bits); + } + } + /* + * Anything important needs to have been m_copy'ed in + * process_data + */ + } + if ((data_processed == 0) && (fwd_tsn_seen)) { + int was_a_gap; + uint32_t highest_tsn; + + if (compare_with_wrap(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map, MAX_TSN)) { + highest_tsn = stcb->asoc.highest_tsn_inside_nr_map; + } else { + highest_tsn = stcb->asoc.highest_tsn_inside_map; + } + was_a_gap = compare_with_wrap(highest_tsn, stcb->asoc.cumulative_tsn, MAX_TSN); + stcb->asoc.send_sack = 1; + sctp_sack_check(stcb, was_a_gap, &abort_flag); + if (abort_flag) { + /* Again, we aborted so NO UNLOCK needed */ + goto out_now; + } + } else if (fwd_tsn_seen) { + stcb->asoc.send_sack = 1; + } + /* trigger send of any chunks in queue... */ +trigger_send: +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xE0, 2); + sctp_auditing(1, inp, stcb, net); +#endif + SCTPDBG(SCTP_DEBUG_INPUT1, + "Check for chunk output prw:%d tqe:%d tf=%d\n", + stcb->asoc.peers_rwnd, + TAILQ_EMPTY(&stcb->asoc.control_send_queue), + stcb->asoc.total_flight); + un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); + + if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue) || + ((un_sent) && + (stcb->asoc.peers_rwnd > 0 || + (stcb->asoc.peers_rwnd <= 0 && stcb->asoc.total_flight == 0)))) { + SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n"); + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); + SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n"); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xE0, 3); + sctp_auditing(2, inp, stcb, net); +#endif + SCTP_TCB_UNLOCK(stcb); +out_now: +#ifdef INVARIANTS + sctp_validate_no_locks(inp); +#endif + return; +} + +#if 0 +static void +sctp_print_mbuf_chain(struct mbuf *m) +{ + for (; m; m = SCTP_BUF_NEXT(m)) { + printf("%p: m_len = %ld\n", m, SCTP_BUF_LEN(m)); + if (SCTP_BUF_IS_EXTENDED(m)) + printf("%p: extend_size = %d\n", m, SCTP_BUF_EXTEND_SIZE(m)); + } +} + +#endif + +void +sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) +{ +#ifdef SCTP_MBUF_LOGGING + struct mbuf *mat; + +#endif + struct mbuf *m; + int iphlen; + uint32_t vrf_id = 0; + uint8_t ecn_bits; + struct ip *ip; + struct sctphdr *sh; + struct sctp_inpcb *inp = NULL; + struct sctp_nets *net; + struct sctp_tcb *stcb = NULL; + struct sctp_chunkhdr *ch; + int refcount_up = 0; + int length, mlen, offset; + +#if !defined(SCTP_WITH_NO_CSUM) + uint32_t check, calc_check; + +#endif + + if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) { + SCTP_RELEASE_PKT(i_pak); + return; + } + mlen = SCTP_HEADER_LEN(i_pak); + iphlen = off; + m = SCTP_HEADER_TO_CHAIN(i_pak); + + net = NULL; + SCTP_STAT_INCR(sctps_recvpackets); + SCTP_STAT_INCR_COUNTER64(sctps_inpackets); + + +#ifdef SCTP_MBUF_LOGGING + /* Log in any input mbufs */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + mat = m; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_INPUT); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(m, mlen); +#endif + /* + * Must take out the iphlen, since mlen expects this (only effect lb + * case) + */ + mlen -= iphlen; + + /* + * Get IP, SCTP, and first chunk header together in first mbuf. 
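Before any parsing starts, the input path needs the IP header, the 12-byte SCTP common header, and the first 4-byte chunk header contiguous in the first mbuf, hence the m_pullup() to exactly that length below. The arithmetic, with illustrative stand-ins for the real structures:

    #include <stddef.h>
    #include <stdint.h>

    struct sctp_common_hdr {       /* struct sctphdr: 12 bytes */
            uint16_t src_port, dest_port;
            uint32_t v_tag;
            uint32_t checksum;
    };

    struct chunk_hdr {             /* struct sctp_chunkhdr: 4 bytes */
            uint8_t  type, flags;
            uint16_t length;
    };

    /* Bytes that must be contiguous before header parsing can begin. */
    static size_t
    min_pullup(size_t iphlen)
    {
            return (iphlen + sizeof(struct sctp_common_hdr) +
                sizeof(struct chunk_hdr));
    }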
+ */ + ip = mtod(m, struct ip *); + offset = iphlen + sizeof(*sh) + sizeof(*ch); + if (SCTP_BUF_LEN(m) < offset) { + if ((m = m_pullup(m, offset)) == 0) { + SCTP_STAT_INCR(sctps_hdrops); + return; + } + ip = mtod(m, struct ip *); + } + /* validate mbuf chain length with IP payload length */ + if (mlen < (SCTP_GET_IPV4_LENGTH(ip) - iphlen)) { + SCTP_STAT_INCR(sctps_hdrops); + goto bad; + } + sh = (struct sctphdr *)((caddr_t)ip + iphlen); + ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(*sh)); + SCTPDBG(SCTP_DEBUG_INPUT1, + "sctp_input() length:%d iphlen:%d\n", mlen, iphlen); + + /* SCTP does not allow broadcasts or multicasts */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + goto bad; + } + if (SCTP_IS_IT_BROADCAST(ip->ip_dst, m)) { + /* + * We only look at broadcast if its a front state, All + * others we will not have a tcb for anyway. + */ + goto bad; + } + /* validate SCTP checksum */ + SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, + "sctp_input(): Packet of length %d received on %s with csum_flags 0x%x.\n", + m->m_pkthdr.len, + if_name(m->m_pkthdr.rcvif), + m->m_pkthdr.csum_flags); +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_recvnocrc); +#else + if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { + SCTP_STAT_INCR(sctps_recvhwcrc); + goto sctp_skip_csum_4; + } + check = sh->checksum; /* save incoming checksum */ + sh->checksum = 0; /* prepare for calc */ + calc_check = sctp_calculate_cksum(m, iphlen); + sh->checksum = check; + SCTP_STAT_INCR(sctps_recvswcrc); + if (calc_check != check) { + SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x m:%p mlen:%d iphlen:%d\n", + calc_check, check, m, mlen, iphlen); + + stcb = sctp_findassociation_addr(m, iphlen, + offset - sizeof(*ch), + sh, ch, &inp, &net, + vrf_id); + if ((net) && (port)) { + if (net->port == 0) { + sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr)); + } + net->port = port; + } + if ((inp) && (stcb)) { + sctp_send_packet_dropped(stcb, net, m, iphlen, 1); + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED); + } else if ((inp != NULL) && (stcb == NULL)) { + refcount_up = 1; + } + SCTP_STAT_INCR(sctps_badsum); + SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors); + goto bad; + } +sctp_skip_csum_4: +#endif + /* destination port of 0 is illegal, based on RFC2960. */ + if (sh->dest_port == 0) { + SCTP_STAT_INCR(sctps_hdrops); + goto bad; + } + /* + * Locate pcb and tcb for datagram sctp_findassociation_addr() wants + * IP/SCTP/first chunk header... + */ + stcb = sctp_findassociation_addr(m, iphlen, offset - sizeof(*ch), + sh, ch, &inp, &net, vrf_id); + if ((net) && (port)) { + if (net->port == 0) { + sctp_pathmtu_adjustment(inp, stcb, net, net->mtu - sizeof(struct udphdr)); + } + net->port = port; + } + /* inp's ref-count increased && stcb locked */ + if (inp == NULL) { + struct sctp_init_chunk *init_chk, chunk_buf; + + SCTP_STAT_INCR(sctps_noport); +#ifdef ICMP_BANDLIM + /* + * we use the bandwidth limiting to protect against sending + * too many ABORTS all at once. In this case these count the + * same as an ICMP message. + */ + if (badport_bandlim(0) < 0) + goto bad; +#endif /* ICMP_BANDLIM */ + SCTPDBG(SCTP_DEBUG_INPUT1, + "Sending a ABORT from packet entry!\n"); + if (ch->chunk_type == SCTP_INITIATION) { + /* + * we do a trick here to get the INIT tag, dig in + * and get the tag from the INIT and put it in the + * common header. 
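The trick described here exists because an ABORT must carry a verification tag the sender of the INIT will accept, and with no association there is no tag of our own: RFC 4960, section 8.4 (rule 3) says to reflect the INIT's own Initiate Tag. Copying it into the saved common header lets the generic abort path below do that without a special case. In outline (illustrative struct; both fields stay in network order):

    #include <stdint.h>

    struct init_fixed {            /* leading fields of an INIT chunk body */
            uint32_t initiate_tag;
            uint32_t a_rwnd;
            uint16_t num_outbound_streams;
            uint16_t num_inbound_streams;
            uint32_t initial_tsn;
    };

    static void
    borrow_init_tag(uint32_t *sh_v_tag, const struct init_fixed *init)
    {
            *sh_v_tag = init->initiate_tag;    /* no byte swap needed */
    }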
+ */ + init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m, + iphlen + sizeof(*sh), sizeof(*init_chk), + (uint8_t *) & chunk_buf); + if (init_chk != NULL) + sh->v_tag = init_chk->init.initiate_tag; + } + if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { + sctp_send_shutdown_complete2(m, iphlen, sh, vrf_id, port); + goto bad; + } + if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) { + goto bad; + } + if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) + sctp_send_abort(m, iphlen, sh, 0, NULL, vrf_id, port); + goto bad; + } else if (stcb == NULL) { + refcount_up = 1; + } +#ifdef IPSEC + /* + * I very much doubt any of the IPSEC stuff will work but I have no + * idea, so I will leave it in place. + */ + if (inp && ipsec4_in_reject(m, &inp->ip_inp.inp)) { + MODULE_GLOBAL(ipsec4stat).in_polvio++; + SCTP_STAT_INCR(sctps_hdrops); + goto bad; + } +#endif /* IPSEC */ + + /* + * common chunk processing + */ + length = ip->ip_len + iphlen; + offset -= sizeof(struct sctp_chunkhdr); + + ecn_bits = ip->ip_tos; + + /* sa_ignore NO_NULL_CHK */ + sctp_common_input_processing(&m, iphlen, offset, length, sh, ch, + inp, stcb, net, ecn_bits, vrf_id, port); + /* inp's ref-count reduced && stcb unlocked */ + if (m) { + sctp_m_freem(m); + } + if ((inp) && (refcount_up)) { + /* reduce ref-count */ + SCTP_INP_DECR_REF(inp); + } + return; +bad: + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + if ((inp) && (refcount_up)) { + /* reduce ref-count */ + SCTP_INP_DECR_REF(inp); + } + if (m) { + sctp_m_freem(m); + } + return; +} +void +sctp_input(i_pak, off) + struct mbuf *i_pak; + int off; +{ + sctp_input_with_port(i_pak, off, 0); +} diff --git a/freebsd/sys/netinet/sctp_input.h b/freebsd/sys/netinet/sctp_input.h new file mode 100644 index 00000000..90cd098a --- /dev/null +++ b/freebsd/sys/netinet/sctp_input.h @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_input.h,v 1.6 2005/03/06 16:04:17 itojun Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_input_h__ +#define __sctp_input_h__ + +#if defined(_KERNEL) || defined(__Userspace__) +void +sctp_common_input_processing(struct mbuf **, int, int, int, + struct sctphdr *, struct sctp_chunkhdr *, struct sctp_inpcb *, + struct sctp_tcb *, struct sctp_nets *, uint8_t, uint32_t, uint16_t); + +struct sctp_stream_reset_out_request * +sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, + struct sctp_tmit_chunk **bchk); + +void +sctp_reset_in_stream(struct sctp_tcb *stcb, int number_entries, + uint16_t * list); + + +int sctp_is_there_unsent_data(struct sctp_tcb *stcb); + +#endif +#endif diff --git a/freebsd/sys/netinet/sctp_lock_bsd.h b/freebsd/sys/netinet/sctp_lock_bsd.h new file mode 100644 index 00000000..81e4a35f --- /dev/null +++ b/freebsd/sys/netinet/sctp_lock_bsd.h @@ -0,0 +1,430 @@ +#ifndef __sctp_lock_bsd_h__ +#define __sctp_lock_bsd_h__ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * General locking concepts: The goal of our locking is, of course, to provide + * consistency and yet minimize overhead. We will attempt to use + * non-recursive locks, which are supposed to be quite inexpensive. In + * order to do this, the goal is that most functions are not aware of locking. + * Once we have a TCB we lock it and unlock when we are through. This means + * that the TCB lock is kind of a "global" lock when working on an + * association. Caution must be used when asserting a TCB_LOCK since if we + * recurse we deadlock. + * + * Most other locks (INP and INFO) attempt to localize the locking, i.e. we try + * to contain the lock and unlock within the function that needs to lock it. + * This sometimes means we do extra locks and unlocks and lose a bit of + * efficiency, but if the performance statements about non-recursive locks are + * true this should not be a problem. One issue that arises with this + * lock-only-when-needed approach is that if an implicit association setup is done we have + * a problem. If at the time I look up an association I have NULL in the tcb + * return, by the time I call to create the association some other processor + * could have created it. This is what the CREATE lock on the endpoint is for. + * Places where we will be implicitly creating the association OR just + * creating an association (the connect call) will assert the CREATE_INP + * lock. This assures us that during all the lookup of INP and INFO, if + * another creator is also locking/looking up, we can gate the two to + * synchronize. So the CREATE_INP lock is also another one we must use + * extreme caution in locking to make sure we don't hit a re-entrancy issue. + * + * For non-FreeBSD-5.x platforms we provide a bunch of EMPTY lock macros so we can + * blatantly put locks everywhere; they reduce to nothing on + * NetBSD/OpenBSD and FreeBSD 4.x. + * + */ + +/* + * When working with the global SCTP lists we lock and unlock the INP_INFO + * lock. So when we go to look up an association we will want to do a + * SCTP_INP_INFO_RLOCK(), and when we want to add a new association to + * the SCTP_BASE_INFO() lists we will do a SCTP_INP_INFO_WLOCK(). + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + + +extern struct sctp_foo_stuff sctp_logoff[]; +extern int sctp_logoff_stuff; + +#define SCTP_IPI_COUNT_INIT() + +#define SCTP_STATLOG_INIT_LOCK() +#define SCTP_STATLOG_LOCK() +#define SCTP_STATLOG_UNLOCK() +#define SCTP_STATLOG_DESTROY() + +#define SCTP_INP_INFO_LOCK_DESTROY() do { \ + if(rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx))) { \ + rw_wunlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \ + } \ + rw_destroy(&SCTP_BASE_INFO(ipi_ep_mtx)); \ + } while (0) + +#define SCTP_INP_INFO_LOCK_INIT() \ + rw_init(&SCTP_BASE_INFO(ipi_ep_mtx), "sctp-info"); + + +#define SCTP_INP_INFO_RLOCK() do { \ + rw_rlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \ +} while (0) + + +#define SCTP_INP_INFO_WLOCK() do { \ + rw_wlock(&SCTP_BASE_INFO(ipi_ep_mtx)); \ +} while (0) + + +#define SCTP_INP_INFO_RUNLOCK() rw_runlock(&SCTP_BASE_INFO(ipi_ep_mtx)) +#define SCTP_INP_INFO_WUNLOCK() rw_wunlock(&SCTP_BASE_INFO(ipi_ep_mtx)) + + +#define SCTP_IPI_ADDR_INIT() \ + rw_init(&SCTP_BASE_INFO(ipi_addr_mtx), "sctp-addr") +#define SCTP_IPI_ADDR_DESTROY() do { \ + if(rw_wowned(&SCTP_BASE_INFO(ipi_addr_mtx))) { \ + rw_wunlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \ + } \ + rw_destroy(&SCTP_BASE_INFO(ipi_addr_mtx)); \ + } while (0) +#define SCTP_IPI_ADDR_RLOCK() do { \ + rw_rlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \ +} while (0) +#define SCTP_IPI_ADDR_WLOCK() do { \ + rw_wlock(&SCTP_BASE_INFO(ipi_addr_mtx)); \ +} while (0) + +#define SCTP_IPI_ADDR_RUNLOCK() rw_runlock(&SCTP_BASE_INFO(ipi_addr_mtx)) +#define SCTP_IPI_ADDR_WUNLOCK() rw_wunlock(&SCTP_BASE_INFO(ipi_addr_mtx)) + + +#define SCTP_IPI_ITERATOR_WQ_INIT() \ + mtx_init(&sctp_it_ctl.ipi_iterator_wq_mtx, "sctp-it-wq", "sctp_it_wq", MTX_DEF) + +#define SCTP_IPI_ITERATOR_WQ_DESTROY() \ + mtx_destroy(&sctp_it_ctl.ipi_iterator_wq_mtx) + +#define SCTP_IPI_ITERATOR_WQ_LOCK() do { \ + mtx_lock(&sctp_it_ctl.ipi_iterator_wq_mtx); \ +} while (0) + +#define SCTP_IPI_ITERATOR_WQ_UNLOCK() mtx_unlock(&sctp_it_ctl.ipi_iterator_wq_mtx) + + +#define SCTP_IP_PKTLOG_INIT() \ + mtx_init(&SCTP_BASE_INFO(ipi_pktlog_mtx), "sctp-pktlog", "packetlog", MTX_DEF) + + +#define SCTP_IP_PKTLOG_LOCK() do { \ + mtx_lock(&SCTP_BASE_INFO(ipi_pktlog_mtx)); \ +} while (0) + +#define SCTP_IP_PKTLOG_UNLOCK() mtx_unlock(&SCTP_BASE_INFO(ipi_pktlog_mtx)) + +#define
SCTP_IP_PKTLOG_DESTROY() \ + mtx_destroy(&SCTP_BASE_INFO(ipi_pktlog_mtx)) + + + + + +/* + * The INP locks we will use for locking an SCTP endpoint, so for example if + * we want to change something at the endpoint level for example random_store + * or cookie secrets we lock the INP level. + */ + +#define SCTP_INP_READ_INIT(_inp) \ + mtx_init(&(_inp)->inp_rdata_mtx, "sctp-read", "inpr", MTX_DEF | MTX_DUPOK) + +#define SCTP_INP_READ_DESTROY(_inp) \ + mtx_destroy(&(_inp)->inp_rdata_mtx) + +#define SCTP_INP_READ_LOCK(_inp) do { \ + mtx_lock(&(_inp)->inp_rdata_mtx); \ +} while (0) + + +#define SCTP_INP_READ_UNLOCK(_inp) mtx_unlock(&(_inp)->inp_rdata_mtx) + + +#define SCTP_INP_LOCK_INIT(_inp) \ + mtx_init(&(_inp)->inp_mtx, "sctp-inp", "inp", MTX_DEF | MTX_DUPOK) +#define SCTP_ASOC_CREATE_LOCK_INIT(_inp) \ + mtx_init(&(_inp)->inp_create_mtx, "sctp-create", "inp_create", \ + MTX_DEF | MTX_DUPOK) + +#define SCTP_INP_LOCK_DESTROY(_inp) \ + mtx_destroy(&(_inp)->inp_mtx) + +#define SCTP_INP_LOCK_CONTENDED(_inp) ((_inp)->inp_mtx.mtx_lock & MTX_CONTESTED) + +#define SCTP_INP_READ_CONTENDED(_inp) ((_inp)->inp_rdata_mtx.mtx_lock & MTX_CONTESTED) + +#define SCTP_ASOC_CREATE_LOCK_CONTENDED(_inp) ((_inp)->inp_create_mtx.mtx_lock & MTX_CONTESTED) + + +#define SCTP_ASOC_CREATE_LOCK_DESTROY(_inp) \ + mtx_destroy(&(_inp)->inp_create_mtx) + + +#ifdef SCTP_LOCK_LOGGING +#define SCTP_INP_RLOCK(_inp) do { \ + if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_INP);\ + mtx_lock(&(_inp)->inp_mtx); \ +} while (0) + +#define SCTP_INP_WLOCK(_inp) do { \ + if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_INP);\ + mtx_lock(&(_inp)->inp_mtx); \ +} while (0) + +#else + +#define SCTP_INP_RLOCK(_inp) do { \ + mtx_lock(&(_inp)->inp_mtx); \ +} while (0) + +#define SCTP_INP_WLOCK(_inp) do { \ + mtx_lock(&(_inp)->inp_mtx); \ +} while (0) + +#endif + + +#define SCTP_TCB_SEND_LOCK_INIT(_tcb) \ + mtx_init(&(_tcb)->tcb_send_mtx, "sctp-send-tcb", "tcbs", MTX_DEF | MTX_DUPOK) + +#define SCTP_TCB_SEND_LOCK_DESTROY(_tcb) mtx_destroy(&(_tcb)->tcb_send_mtx) + +#define SCTP_TCB_SEND_LOCK(_tcb) do { \ + mtx_lock(&(_tcb)->tcb_send_mtx); \ +} while (0) + +#define SCTP_TCB_SEND_UNLOCK(_tcb) mtx_unlock(&(_tcb)->tcb_send_mtx) + +#define SCTP_INP_INCR_REF(_inp) atomic_add_int(&((_inp)->refcount), 1) +#define SCTP_INP_DECR_REF(_inp) atomic_add_int(&((_inp)->refcount), -1) + + +#ifdef SCTP_LOCK_LOGGING +#define SCTP_ASOC_CREATE_LOCK(_inp) \ + do { \ + if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_CREATE); \ + mtx_lock(&(_inp)->inp_create_mtx); \ + } while (0) +#else + +#define SCTP_ASOC_CREATE_LOCK(_inp) \ + do { \ + mtx_lock(&(_inp)->inp_create_mtx); \ + } while (0) +#endif + +#define SCTP_INP_RUNLOCK(_inp) mtx_unlock(&(_inp)->inp_mtx) +#define SCTP_INP_WUNLOCK(_inp) mtx_unlock(&(_inp)->inp_mtx) +#define SCTP_ASOC_CREATE_UNLOCK(_inp) mtx_unlock(&(_inp)->inp_create_mtx) + +/* + * For the majority of things (once we have found the association) we will + * lock the actual association mutex. This will protect all the assoiciation + * level queues and streams and such. We will need to lock the socket layer + * when we stuff data up into the receiving sb_mb. I.e. we will need to do an + * extra SOCKBUF_LOCK(&so->so_rcv) even though the association is locked. 
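Under these rules the per-association mutex is taken once and held across the whole operation, and the socket receive-buffer lock nests briefly inside it only for the append, so the order is always TCB before SOCKBUF. Illustrative ordering only (error paths and real arguments omitted; SOCKBUF_LOCK is the stock FreeBSD socket-buffer lock):

    /* Kernel-context sketch; assumes the usual SCTP kernel includes. */
    #include <sys/param.h>
    #include <sys/socketvar.h>     /* SOCKBUF_LOCK */
    #include <netinet/sctp_os.h>
    #include <netinet/sctp_pcb.h>  /* SCTP_TCB_LOCK via sctp_lock_bsd.h */

    static void
    deliver_to_socket(struct sctp_tcb *stcb, struct socket *so)
    {
            SCTP_TCB_LOCK(stcb);            /* association-wide, non-recursive */
            /* ... reassemble, update association queues ... */
            SOCKBUF_LOCK(&so->so_rcv);      /* nested: buffer append only */
            /* ... append the read-queue entry for the application ... */
            SOCKBUF_UNLOCK(&so->so_rcv);
            SCTP_TCB_UNLOCK(stcb);
    }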
+ */ + +#define SCTP_TCB_LOCK_INIT(_tcb) \ + mtx_init(&(_tcb)->tcb_mtx, "sctp-tcb", "tcb", MTX_DEF | MTX_DUPOK) + +#define SCTP_TCB_LOCK_DESTROY(_tcb) mtx_destroy(&(_tcb)->tcb_mtx) + +#ifdef SCTP_LOCK_LOGGING +#define SCTP_TCB_LOCK(_tcb) do { \ + if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) sctp_log_lock(_tcb->sctp_ep, _tcb, SCTP_LOG_LOCK_TCB); \ + mtx_lock(&(_tcb)->tcb_mtx); \ +} while (0) + +#else +#define SCTP_TCB_LOCK(_tcb) do { \ + mtx_lock(&(_tcb)->tcb_mtx); \ +} while (0) + +#endif + + +#define SCTP_TCB_TRYLOCK(_tcb) mtx_trylock(&(_tcb)->tcb_mtx) + +#define SCTP_TCB_UNLOCK(_tcb) mtx_unlock(&(_tcb)->tcb_mtx) + +#define SCTP_TCB_UNLOCK_IFOWNED(_tcb) do { \ + if (mtx_owned(&(_tcb)->tcb_mtx)) \ + mtx_unlock(&(_tcb)->tcb_mtx); \ + } while (0) + + + +#ifdef INVARIANTS +#define SCTP_TCB_LOCK_ASSERT(_tcb) do { \ + if (mtx_owned(&(_tcb)->tcb_mtx) == 0) \ + panic("Don't own TCB lock"); \ + } while (0) +#else +#define SCTP_TCB_LOCK_ASSERT(_tcb) +#endif + +#define SCTP_ITERATOR_LOCK_INIT() \ + mtx_init(&sctp_it_ctl.it_mtx, "sctp-it", "iterator", MTX_DEF) + +#ifdef INVARIANTS +#define SCTP_ITERATOR_LOCK() \ + do { \ + if (mtx_owned(&sctp_it_ctl.it_mtx)) \ + panic("Iterator Lock"); \ + mtx_lock(&sctp_it_ctl.it_mtx); \ + } while (0) +#else +#define SCTP_ITERATOR_LOCK() \ + do { \ + mtx_lock(&sctp_it_ctl.it_mtx); \ + } while (0) + +#endif + +#define SCTP_ITERATOR_UNLOCK() mtx_unlock(&sctp_it_ctl.it_mtx) +#define SCTP_ITERATOR_LOCK_DESTROY() mtx_destroy(&sctp_it_ctl.it_mtx) + + +#define SCTP_WQ_ADDR_INIT() do { \ + mtx_init(&SCTP_BASE_INFO(wq_addr_mtx), "sctp-addr-wq","sctp_addr_wq",MTX_DEF); \ + } while (0) + +#define SCTP_WQ_ADDR_DESTROY() do { \ + if(mtx_owned(&SCTP_BASE_INFO(wq_addr_mtx))) { \ + mtx_unlock(&SCTP_BASE_INFO(wq_addr_mtx)); \ + } \ + mtx_destroy(&SCTP_BASE_INFO(wq_addr_mtx)); \ + } while (0) + +#define SCTP_WQ_ADDR_LOCK() do { \ + mtx_lock(&SCTP_BASE_INFO(wq_addr_mtx)); \ +} while (0) +#define SCTP_WQ_ADDR_UNLOCK() do { \ + mtx_unlock(&SCTP_BASE_INFO(wq_addr_mtx)); \ +} while (0) + + + +#define SCTP_INCR_EP_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_ep), 1); \ + } while (0) + +#define SCTP_DECR_EP_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ep), 1); \ + } while (0) + +#define SCTP_INCR_ASOC_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_asoc), 1); \ + } while (0) + +#define SCTP_DECR_ASOC_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_asoc), 1); \ + } while (0) + +#define SCTP_INCR_LADDR_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_laddr), 1); \ + } while (0) + +#define SCTP_DECR_LADDR_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_laddr), 1); \ + } while (0) + +#define SCTP_INCR_RADDR_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_raddr), 1); \ + } while (0) + +#define SCTP_DECR_RADDR_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_raddr),1); \ + } while (0) + +#define SCTP_INCR_CHK_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \ + } while (0) +#ifdef INVARIANTS +#define SCTP_DECR_CHK_COUNT() \ + do { \ + if(SCTP_BASE_INFO(ipi_count_chunk) == 0) \ + panic("chunk count to 0?"); \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \ + } while (0) +#else +#define SCTP_DECR_CHK_COUNT() \ + do { \ + if(SCTP_BASE_INFO(ipi_count_chunk) != 0) \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_chunk), 1); \ + } while (0) +#endif +#define SCTP_INCR_READQ_COUNT() \ + do { \ + 
atomic_add_int(&SCTP_BASE_INFO(ipi_count_readq),1); \ + } while (0) + +#define SCTP_DECR_READQ_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_readq), 1); \ + } while (0) + +#define SCTP_INCR_STRMOQ_COUNT() \ + do { \ + atomic_add_int(&SCTP_BASE_INFO(ipi_count_strmoq), 1); \ + } while (0) + +#define SCTP_DECR_STRMOQ_COUNT() \ + do { \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_strmoq), 1); \ + } while (0) + + +#if defined(SCTP_SO_LOCK_TESTING) +#define SCTP_INP_SO(sctpinp) (sctpinp)->ip_inp.inp.inp_socket +#define SCTP_SOCKET_LOCK(so, refcnt) +#define SCTP_SOCKET_UNLOCK(so, refcnt) +#endif + +#endif diff --git a/freebsd/sys/netinet/sctp_os.h b/freebsd/sys/netinet/sctp_os.h new file mode 100644 index 00000000..c1a392f0 --- /dev/null +++ b/freebsd/sys/netinet/sctp_os.h @@ -0,0 +1,72 @@ +/*- + * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); +#ifndef __sctp_os_h__ +#define __sctp_os_h__ + +/* + * General kernel memory allocation: + * SCTP_MALLOC(element, type, size, name) + * SCTP_FREE(element) + * Kernel memory allocation for "soname"- memory must be zeroed. + * SCTP_MALLOC_SONAME(name, type, size) + * SCTP_FREE_SONAME(name) + */ + +/* + * Zone(pool) allocation routines: MUST be defined for each OS. + * zone = zone/pool pointer. + * name = string name of the zone/pool. + * size = size of each zone/pool element. + * number = number of elements in zone/pool. + * type = structure type to allocate + * + * sctp_zone_t + * SCTP_ZONE_INIT(zone, name, size, number) + * SCTP_ZONE_GET(zone, type) + * SCTP_ZONE_FREE(zone, element) + * SCTP_ZONE_DESTROY(zone) + */ + +#include + + + + + +/* All os's must implement this address gatherer. If + * no VRF's exist, then vrf 0 is the only one and all + * addresses and ifn's live here. 
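On FreeBSD the zone/pool contract listed in the comment above is commonly satisfied with uma(9); one way the four macros can be rendered under that assumption (a sketch, not necessarily the definitions in sctp_os_bsd.h):

    #include <sys/param.h>
    #include <sys/malloc.h>        /* M_NOWAIT */
    #include <vm/uma.h>

    typedef uma_zone_t sctp_zone_t;

    #define SCTP_ZONE_INIT(zone, name, size, number) \
            (zone) = uma_zcreate(name, size, NULL, NULL, NULL, NULL, \
                UMA_ALIGN_PTR, 0)
    #define SCTP_ZONE_GET(zone, type) \
            (type *)uma_zalloc(zone, M_NOWAIT)
    #define SCTP_ZONE_FREE(zone, element) \
            uma_zfree(zone, element)
    #define SCTP_ZONE_DESTROY(zone) \
            uma_zdestroy(zone)

In this rendering the "number" argument goes unused; uma sizes its caches on demand.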
+ */ +#define SCTP_DEFAULT_VRF 0 +void sctp_init_vrf_list(int vrfid); + +#endif diff --git a/freebsd/sys/netinet/sctp_os_bsd.h b/freebsd/sys/netinet/sctp_os_bsd.h new file mode 100644 index 00000000..cf29776f --- /dev/null +++ b/freebsd/sys/netinet/sctp_os_bsd.h @@ -0,0 +1,503 @@ +/*- + * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__FBSDID("$FreeBSD$"); +#ifndef __sctp_os_bsd_h__ +#define __sctp_os_bsd_h__ +/* + * includes + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef IPSEC +#include +#include +#endif /* IPSEC */ + +#ifdef INET6 +#include +#ifdef IPSEC +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#endif /* INET6 */ + + +#include + +#ifndef in6pcb +#define in6pcb inpcb +#endif +/* Declare all the malloc names for all the various mallocs */ +MALLOC_DECLARE(SCTP_M_MAP); +MALLOC_DECLARE(SCTP_M_STRMI); +MALLOC_DECLARE(SCTP_M_STRMO); +MALLOC_DECLARE(SCTP_M_ASC_ADDR); +MALLOC_DECLARE(SCTP_M_ASC_IT); +MALLOC_DECLARE(SCTP_M_AUTH_CL); +MALLOC_DECLARE(SCTP_M_AUTH_KY); +MALLOC_DECLARE(SCTP_M_AUTH_HL); +MALLOC_DECLARE(SCTP_M_AUTH_IF); +MALLOC_DECLARE(SCTP_M_STRESET); +MALLOC_DECLARE(SCTP_M_CMSG); +MALLOC_DECLARE(SCTP_M_COPYAL); +MALLOC_DECLARE(SCTP_M_VRF); +MALLOC_DECLARE(SCTP_M_IFA); +MALLOC_DECLARE(SCTP_M_IFN); +MALLOC_DECLARE(SCTP_M_TIMW); +MALLOC_DECLARE(SCTP_M_MVRF); +MALLOC_DECLARE(SCTP_M_ITER); +MALLOC_DECLARE(SCTP_M_SOCKOPT); + +#if defined(SCTP_LOCAL_TRACE_BUF) + +#define SCTP_GET_CYCLECOUNT get_cyclecount() +#define SCTP_CTR6 sctp_log_trace + +#else +#define SCTP_CTR6 CTR6 +#endif + +/* + * Macros to expand out globals defined by various modules + * to either a real global or a virtualized instance of one, + * depending on whether VIMAGE is defined. + */ +/* then define the macro(s) that hook into the vimage macros */ +#define MODULE_GLOBAL(__SYMBOL) V_##__SYMBOL + +#define V_system_base_info VNET(system_base_info) +#define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m +#define SCTP_BASE_STATS V_system_base_info.sctpstat +#define SCTP_BASE_STATS_SYSCTL VNET_NAME(system_base_info.sctpstat) +#define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m +#define SCTP_BASE_SYSCTL(__m) VNET_NAME(system_base_info.sctpsysctl.__m) +#define SCTP_BASE_VAR(__m) V_system_base_info.__m + +/* + * + */ +#define USER_ADDR_NULL (NULL) /* FIX ME: temp */ + +#if defined(SCTP_DEBUG) +#define SCTPDBG(level, params...) \ +{ \ + do { \ + if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ + printf(params); \ + } \ + } while (0); \ +} +#define SCTPDBG_ADDR(level, addr) \ +{ \ + do { \ + if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ + sctp_print_address(addr); \ + } \ + } while (0); \ +} +#define SCTPDBG_PKT(level, iph, sh) \ +{ \ + do { \ + if (SCTP_BASE_SYSCTL(sctp_debug_on) & level) { \ + sctp_print_address_pkt(iph, sh); \ + } \ + } while (0); \ +} +#else +#define SCTPDBG(level, params...) +#define SCTPDBG_ADDR(level, addr) +#define SCTPDBG_PKT(level, iph, sh) +#endif +#define SCTP_PRINTF(params...) 
printf(params) + +#ifdef SCTP_LTRACE_CHUNKS +#define SCTP_LTRACE_CHK(a, b, c, d) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_CHUNK_ENABLE) SCTP_CTR6(KTR_SUBSYS, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_CHUNK_PROC, 0, a, b, c, d) +#else +#define SCTP_LTRACE_CHK(a, b, c, d) +#endif + +#ifdef SCTP_LTRACE_ERRORS +#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ + printf("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ + m, inp, stcb, net, file, __LINE__, err); +#define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ + printf("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ + inp, stcb, net, file, __LINE__, err); +#else +#define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) +#define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) +#endif + + +/* + * Local address and interface list handling + */ +#define SCTP_MAX_VRF_ID 0 +#define SCTP_SIZE_OF_VRF_HASH 3 +#define SCTP_IFNAMSIZ IFNAMSIZ +#define SCTP_DEFAULT_VRFID 0 +#define SCTP_VRF_ADDR_HASH_SIZE 16 +#define SCTP_VRF_IFN_HASH_SIZE 3 +#define SCTP_INIT_VRF_TABLEID(vrf) + +#define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP) +#define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifa && (ro)->ro_rt->rt_ifa->ifa_ifp && (ro)->ro_rt->rt_ifa->ifa_ifp->if_type == IFT_LOOP) + +/* + * Access to IFN's to help with src-addr-selection + */ +/* This could return VOID if the index works but for BSD we provide both. */ +#define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_rt->rt_ifp +#define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_rt->rt_ifp->if_index +#define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_rt && (ro)->ro_rt->rt_ifp) + +/* + * general memory allocation + */ +#define SCTP_MALLOC(var, type, size, name) \ + do { \ + var = (type)malloc(size, name, M_NOWAIT); \ + } while (0) + +#define SCTP_FREE(var, type) free(var, type) + +#define SCTP_MALLOC_SONAME(var, type, size) \ + do { \ + var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \ + } while (0) + +#define SCTP_FREE_SONAME(var) free(var, M_SONAME) + +#define SCTP_PROCESS_STRUCT struct proc * + +/* + * zone allocation functions + */ +#include + +/* SCTP_ZONE_INIT: initialize the zone */ +typedef struct uma_zone *sctp_zone_t; + +#define SCTP_ZONE_INIT(zone, name, size, number) { \ + zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,\ + 0); \ + uma_zone_set_max(zone, number); \ +} + +#define SCTP_ZONE_DESTROY(zone) uma_zdestroy(zone) + +/* SCTP_ZONE_GET: allocate element from the zone */ +#define SCTP_ZONE_GET(zone, type) \ + (type *)uma_zalloc(zone, M_NOWAIT); + +/* SCTP_ZONE_FREE: free element from the zone */ +#define SCTP_ZONE_FREE(zone, element) \ + uma_zfree(zone, element); + +#define SCTP_HASH_INIT(size, hashmark) hashinit_flags(size, M_PCB, hashmark, HASH_NOWAIT) +#define SCTP_HASH_FREE(table, hashmark) hashdestroy(table, M_PCB, hashmark) + +#define SCTP_M_COPYM m_copym + +/* + * timers + */ +#include +typedef struct callout sctp_os_timer_t; + + +#define SCTP_OS_TIMER_INIT(tmr) callout_init(tmr, 1) +#define SCTP_OS_TIMER_START callout_reset +#define SCTP_OS_TIMER_STOP callout_stop +#define SCTP_OS_TIMER_STOP_DRAIN callout_drain +#define SCTP_OS_TIMER_PENDING callout_pending +#define SCTP_OS_TIMER_ACTIVE callout_active +#define SCTP_OS_TIMER_DEACTIVATE callout_deactivate + +#define sctp_get_tick_count() (ticks) + +#define SCTP_UNUSED __attribute__((unused)) + +/* + * 
Functions + */ +/* Mbuf manipulation and access macros */ +#define SCTP_BUF_LEN(m) (m->m_len) +#define SCTP_BUF_NEXT(m) (m->m_next) +#define SCTP_BUF_NEXT_PKT(m) (m->m_nextpkt) +#define SCTP_BUF_RESV_UF(m, size) m->m_data += size +#define SCTP_BUF_AT(m, size) m->m_data + size +#define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT) +#define SCTP_BUF_EXTEND_SIZE(m) (m->m_ext.ext_size) +#define SCTP_BUF_TYPE(m) (m->m_type) +#define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif) +#define SCTP_BUF_PREPEND M_PREPEND + +#define SCTP_ALIGN_TO_END(m, len) if(m->m_flags & M_PKTHDR) { \ + MH_ALIGN(m, len); \ + } else if ((m->m_flags & M_EXT) == 0) { \ + M_ALIGN(m, len); \ + } + +/* We make it so if you have up to 4 threads + * writing based on the default size of + * the packet log 65 k, that would be + * 4 16k packets before we would hit + * a problem. + */ +#define SCTP_PKTLOG_WRITERS_NEED_LOCK 3 + +/*************************/ +/* MTU */ +/*************************/ +#define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, af) ((struct ifnet *)ifn)->if_mtu +#define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, rt) ((rt != NULL) ? rt->rt_rmx.rmx_mtu : 0) +#define SCTP_GATHER_MTU_FROM_INTFC(sctp_ifn) ((sctp_ifn->ifn_p != NULL) ? ((struct ifnet *)(sctp_ifn->ifn_p))->if_mtu : 0) +#define SCTP_SET_MTU_OF_ROUTE(sa, rt, mtu) do { \ + if (rt != NULL) \ + rt->rt_rmx.rmx_mtu = mtu; \ + } while(0) + +/* (de-)register interface event notifications */ +#define SCTP_REGISTER_INTERFACE(ifhandle, af) +#define SCTP_DEREGISTER_INTERFACE(ifhandle, af) + + +/*************************/ +/* These are for logging */ +/*************************/ +/* return the base ext data pointer */ +#define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf) + /* return the refcnt of the data pointer */ +#define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ref_cnt) +/* return any buffer related flags, this is + * used beyond logging for apple only. + */ +#define SCTP_BUF_GET_FLAGS(m) (m->m_flags) + +/* For BSD this just accesses the M_PKTHDR length + * so it operates on an mbuf with hdr flag. Other + * O/S's may have separate packet header and mbuf + * chain pointers.. thus the macro. + */ +#define SCTP_HEADER_TO_CHAIN(m) (m) +#define SCTP_DETACH_HEADER_FROM_CHAIN(m) +#define SCTP_HEADER_LEN(m) (m->m_pkthdr.len) +#define SCTP_GET_HEADER_FOR_OUTPUT(o_pak) 0 +#define SCTP_RELEASE_HEADER(m) +#define SCTP_RELEASE_PKT(m) sctp_m_freem(m) +#define SCTP_ENABLE_UDP_CSUM(m) do { \ + m->m_pkthdr.csum_flags = CSUM_UDP; \ + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); \ + } while (0) + +#define SCTP_GET_PKT_VRFID(m, vrf_id) ((vrf_id = SCTP_DEFAULT_VRFID) != SCTP_DEFAULT_VRFID) + + + +/* Attach the chain of data into the sendable packet. */ +#define SCTP_ATTACH_CHAIN(pak, m, packet_length) do { \ + pak = m; \ + pak->m_pkthdr.len = packet_length; \ + } while(0) + +/* Other m_pkthdr type things */ +#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0) +#define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP))) + + +/* This converts any input packet header + * into the chain of data holders, for BSD + * its a NOP. + */ + +/* Macro's for getting length from V6/V4 header */ +#define SCTP_GET_IPV4_LENGTH(iph) (iph->ip_len) +#define SCTP_GET_IPV6_LENGTH(ip6) (ntohs(ip6->ip6_plen)) + +/* get the v6 hop limit */ +#define SCTP_GET_HLIM(inp, ro) in6_selecthlim((struct in6pcb *)&inp->ip_inp.inp, (ro ? (ro->ro_rt ? 
(ro->ro_rt->rt_ifp) : (NULL)) : (NULL))); + +/* is the endpoint v6only? */ +#define SCTP_IPV6_V6ONLY(inp) (((struct inpcb *)inp)->inp_flags & IN6P_IPV6_V6ONLY) +/* is the socket non-blocking? */ +#define SCTP_SO_IS_NBIO(so) ((so)->so_state & SS_NBIO) +#define SCTP_SET_SO_NBIO(so) ((so)->so_state |= SS_NBIO) +#define SCTP_CLEAR_SO_NBIO(so) ((so)->so_state &= ~SS_NBIO) +/* get the socket type */ +#define SCTP_SO_TYPE(so) ((so)->so_type) +/* reserve sb space for a socket */ +#define SCTP_SORESERVE(so, send, recv) soreserve(so, send, recv) +/* wakeup a socket */ +#define SCTP_SOWAKEUP(so) wakeup(&(so)->so_timeo) +/* clear the socket buffer state */ +#define SCTP_SB_CLEAR(sb) \ + (sb).sb_cc = 0; \ + (sb).sb_mb = NULL; \ + (sb).sb_mbcnt = 0; + +#define SCTP_SB_LIMIT_RCV(so) so->so_rcv.sb_hiwat +#define SCTP_SB_LIMIT_SND(so) so->so_snd.sb_hiwat + +/* + * routes, output, etc. + */ +typedef struct route sctp_route_t; +typedef struct rtentry sctp_rtentry_t; + +#define SCTP_RTALLOC(ro, vrf_id) rtalloc_ign((struct route *)ro, 0UL) + +/* Future zero copy wakeup/send function */ +#define SCTP_ZERO_COPY_EVENT(inp, so) +/* This is re-pulse ourselves for sendbuf */ +#define SCTP_ZERO_COPY_SENDQ_EVENT(inp, so) + +/* + * IP output routines + */ +#define SCTP_IP_OUTPUT(result, o_pak, ro, stcb, vrf_id) \ +{ \ + int o_flgs = IP_RAWOUTPUT; \ + struct sctp_tcb *local_stcb = stcb; \ + if (local_stcb && \ + local_stcb->sctp_ep && \ + local_stcb->sctp_ep->sctp_socket) \ + o_flgs |= local_stcb->sctp_ep->sctp_socket->so_options & SO_DONTROUTE; \ + result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \ +} + +#define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, stcb, vrf_id) \ +{ \ + struct sctp_tcb *local_stcb = stcb; \ + if (local_stcb && local_stcb->sctp_ep) \ + result = ip6_output(o_pak, \ + ((struct in6pcb *)(local_stcb->sctp_ep))->in6p_outputopts, \ + (ro), 0, 0, ifp, NULL); \ + else \ + result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \ +} + +struct mbuf * +sctp_get_mbuf_for_msg(unsigned int space_needed, + int want_header, int how, int allonebuf, int type); + + +/* + * SCTP AUTH + */ +#define HAVE_SHA2 + +#define SCTP_READ_RANDOM(buf, len) read_random(buf, len) + +#ifdef USE_SCTP_SHA1 +#include +#else +#include +/* map standard crypto API names */ +#define SHA1_Init SHA1Init +#define SHA1_Update SHA1Update +#define SHA1_Final(x,y) SHA1Final((caddr_t)x, y) +#endif + +#if defined(HAVE_SHA2) +#include +#endif + +#endif + +#define SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr) (atomic_fetchadd_int(addr, -1) == 1) +#if defined(INVARIANTS) +#define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \ +{ \ + int32_t oldval; \ + oldval = atomic_fetchadd_int(addr, -val); \ + if (oldval < val) { \ + panic("Counter goes negative"); \ + } \ +} +#else +#define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \ +{ \ + int32_t oldval; \ + oldval = atomic_fetchadd_int(addr, -val); \ + if (oldval < val) { \ + *addr = 0; \ + } \ +} +#endif diff --git a/freebsd/sys/netinet/sctp_output.c b/freebsd/sys/netinet/sctp_output.c new file mode 100644 index 00000000..9acd3288 --- /dev/null +++ b/freebsd/sys/netinet/sctp_output.c @@ -0,0 +1,13539 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_output.c,v 1.46 2005/03/06 16:04:17 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define SCTP_MAX_GAPS_INARRAY 4 +struct sack_track { + uint8_t right_edge; /* mergable on the right edge */ + uint8_t left_edge; /* mergable on the left edge */ + uint8_t num_entries; + uint8_t spare; + struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY]; +}; + +struct sack_track sack_array[256] = { + {0, 0, 0, 0, /* 0x00 */ + {{0, 0}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x01 */ + {{0, 0}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x02 */ + {{1, 1}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x03 */ + {{0, 1}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x04 */ + {{2, 2}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x05 */ + {{0, 0}, + {2, 2}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x06 */ + {{1, 2}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x07 */ + {{0, 2}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x08 */ + {{3, 3}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x09 */ + {{0, 0}, + {3, 3}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x0a */ + {{1, 1}, + {3, 3}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x0b */ + {{0, 1}, + {3, 3}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x0c */ + {{2, 3}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x0d */ + {{0, 0}, + {2, 3}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x0e */ + {{1, 3}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x0f */ + {{0, 3}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x10 */ + {{4, 4}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x11 */ + {{0, 0}, + {4, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x12 */ + {{1, 1}, + {4, 4}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x13 */ + {{0, 1}, + {4, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x14 */ + {{2, 2}, + {4, 4}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x15 */ + {{0, 0}, + {2, 2}, + {4, 4}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x16 */ + {{1, 2}, + {4, 4}, + {0, 0}, + 
{0, 0} + } + }, + {1, 0, 2, 0, /* 0x17 */ + {{0, 2}, + {4, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x18 */ + {{3, 4}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x19 */ + {{0, 0}, + {3, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x1a */ + {{1, 1}, + {3, 4}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x1b */ + {{0, 1}, + {3, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x1c */ + {{2, 4}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x1d */ + {{0, 0}, + {2, 4}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x1e */ + {{1, 4}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x1f */ + {{0, 4}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x20 */ + {{5, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x21 */ + {{0, 0}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x22 */ + {{1, 1}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x23 */ + {{0, 1}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x24 */ + {{2, 2}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x25 */ + {{0, 0}, + {2, 2}, + {5, 5}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x26 */ + {{1, 2}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x27 */ + {{0, 2}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x28 */ + {{3, 3}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x29 */ + {{0, 0}, + {3, 3}, + {5, 5}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x2a */ + {{1, 1}, + {3, 3}, + {5, 5}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x2b */ + {{0, 1}, + {3, 3}, + {5, 5}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x2c */ + {{2, 3}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x2d */ + {{0, 0}, + {2, 3}, + {5, 5}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x2e */ + {{1, 3}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x2f */ + {{0, 3}, + {5, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x30 */ + {{4, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x31 */ + {{0, 0}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x32 */ + {{1, 1}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x33 */ + {{0, 1}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x34 */ + {{2, 2}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x35 */ + {{0, 0}, + {2, 2}, + {4, 5}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x36 */ + {{1, 2}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x37 */ + {{0, 2}, + {4, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x38 */ + {{3, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x39 */ + {{0, 0}, + {3, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x3a */ + {{1, 1}, + {3, 5}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x3b */ + {{0, 1}, + {3, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x3c */ + {{2, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x3d */ + {{0, 0}, + {2, 5}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x3e */ + {{1, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x3f */ + {{0, 5}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x40 */ + {{6, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x41 */ + {{0, 0}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x42 */ + {{1, 1}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x43 */ + {{0, 1}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x44 */ + {{2, 2}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x45 */ + {{0, 0}, + {2, 2}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 
0, /* 0x46 */ + {{1, 2}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x47 */ + {{0, 2}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x48 */ + {{3, 3}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x49 */ + {{0, 0}, + {3, 3}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x4a */ + {{1, 1}, + {3, 3}, + {6, 6}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x4b */ + {{0, 1}, + {3, 3}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x4c */ + {{2, 3}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x4d */ + {{0, 0}, + {2, 3}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x4e */ + {{1, 3}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x4f */ + {{0, 3}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x50 */ + {{4, 4}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x51 */ + {{0, 0}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x52 */ + {{1, 1}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x53 */ + {{0, 1}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x54 */ + {{2, 2}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {1, 0, 4, 0, /* 0x55 */ + {{0, 0}, + {2, 2}, + {4, 4}, + {6, 6} + } + }, + {0, 0, 3, 0, /* 0x56 */ + {{1, 2}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x57 */ + {{0, 2}, + {4, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x58 */ + {{3, 4}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x59 */ + {{0, 0}, + {3, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x5a */ + {{1, 1}, + {3, 4}, + {6, 6}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x5b */ + {{0, 1}, + {3, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x5c */ + {{2, 4}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x5d */ + {{0, 0}, + {2, 4}, + {6, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x5e */ + {{1, 4}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x5f */ + {{0, 4}, + {6, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x60 */ + {{5, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x61 */ + {{0, 0}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x62 */ + {{1, 1}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x63 */ + {{0, 1}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x64 */ + {{2, 2}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x65 */ + {{0, 0}, + {2, 2}, + {5, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x66 */ + {{1, 2}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x67 */ + {{0, 2}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x68 */ + {{3, 3}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x69 */ + {{0, 0}, + {3, 3}, + {5, 6}, + {0, 0} + } + }, + {0, 0, 3, 0, /* 0x6a */ + {{1, 1}, + {3, 3}, + {5, 6}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x6b */ + {{0, 1}, + {3, 3}, + {5, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x6c */ + {{2, 3}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x6d */ + {{0, 0}, + {2, 3}, + {5, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x6e */ + {{1, 3}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x6f */ + {{0, 3}, + {5, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x70 */ + {{4, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x71 */ + {{0, 0}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x72 */ + {{1, 1}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x73 */ + {{0, 1}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x74 */ + {{2, 2}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 3, 0, /* 0x75 */ + {{0, 0}, + 
{2, 2}, + {4, 6}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x76 */ + {{1, 2}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x77 */ + {{0, 2}, + {4, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x78 */ + {{3, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x79 */ + {{0, 0}, + {3, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 2, 0, /* 0x7a */ + {{1, 1}, + {3, 6}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x7b */ + {{0, 1}, + {3, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x7c */ + {{2, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 2, 0, /* 0x7d */ + {{0, 0}, + {2, 6}, + {0, 0}, + {0, 0} + } + }, + {0, 0, 1, 0, /* 0x7e */ + {{1, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 0, 1, 0, /* 0x7f */ + {{0, 6}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0x80 */ + {{7, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0x81 */ + {{0, 0}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x82 */ + {{1, 1}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0x83 */ + {{0, 1}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x84 */ + {{2, 2}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x85 */ + {{0, 0}, + {2, 2}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x86 */ + {{1, 2}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0x87 */ + {{0, 2}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x88 */ + {{3, 3}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x89 */ + {{0, 0}, + {3, 3}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0x8a */ + {{1, 1}, + {3, 3}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x8b */ + {{0, 1}, + {3, 3}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x8c */ + {{2, 3}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x8d */ + {{0, 0}, + {2, 3}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x8e */ + {{1, 3}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0x8f */ + {{0, 3}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x90 */ + {{4, 4}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x91 */ + {{0, 0}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0x92 */ + {{1, 1}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x93 */ + {{0, 1}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0x94 */ + {{2, 2}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 4, 0, /* 0x95 */ + {{0, 0}, + {2, 2}, + {4, 4}, + {7, 7} + } + }, + {0, 1, 3, 0, /* 0x96 */ + {{1, 2}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x97 */ + {{0, 2}, + {4, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x98 */ + {{3, 4}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x99 */ + {{0, 0}, + {3, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0x9a */ + {{1, 1}, + {3, 4}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x9b */ + {{0, 1}, + {3, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x9c */ + {{2, 4}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0x9d */ + {{0, 0}, + {2, 4}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0x9e */ + {{1, 4}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0x9f */ + {{0, 4}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xa0 */ + {{5, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xa1 */ + {{0, 0}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xa2 */ + {{1, 1}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xa3 */ + {{0, 1}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xa4 */ + {{2, 2}, + {5, 5}, + {7, 7}, + {0, 0} + 
} + }, + {1, 1, 4, 0, /* 0xa5 */ + {{0, 0}, + {2, 2}, + {5, 5}, + {7, 7} + } + }, + {0, 1, 3, 0, /* 0xa6 */ + {{1, 2}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xa7 */ + {{0, 2}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xa8 */ + {{3, 3}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 4, 0, /* 0xa9 */ + {{0, 0}, + {3, 3}, + {5, 5}, + {7, 7} + } + }, + {0, 1, 4, 0, /* 0xaa */ + {{1, 1}, + {3, 3}, + {5, 5}, + {7, 7} + } + }, + {1, 1, 4, 0, /* 0xab */ + {{0, 1}, + {3, 3}, + {5, 5}, + {7, 7} + } + }, + {0, 1, 3, 0, /* 0xac */ + {{2, 3}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 4, 0, /* 0xad */ + {{0, 0}, + {2, 3}, + {5, 5}, + {7, 7} + } + }, + {0, 1, 3, 0, /* 0xae */ + {{1, 3}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xaf */ + {{0, 3}, + {5, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xb0 */ + {{4, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xb1 */ + {{0, 0}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xb2 */ + {{1, 1}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xb3 */ + {{0, 1}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xb4 */ + {{2, 2}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 4, 0, /* 0xb5 */ + {{0, 0}, + {2, 2}, + {4, 5}, + {7, 7} + } + }, + {0, 1, 3, 0, /* 0xb6 */ + {{1, 2}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xb7 */ + {{0, 2}, + {4, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xb8 */ + {{3, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xb9 */ + {{0, 0}, + {3, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xba */ + {{1, 1}, + {3, 5}, + {7, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xbb */ + {{0, 1}, + {3, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xbc */ + {{2, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xbd */ + {{0, 0}, + {2, 5}, + {7, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xbe */ + {{1, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xbf */ + {{0, 5}, + {7, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xc0 */ + {{6, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xc1 */ + {{0, 0}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xc2 */ + {{1, 1}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xc3 */ + {{0, 1}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xc4 */ + {{2, 2}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xc5 */ + {{0, 0}, + {2, 2}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xc6 */ + {{1, 2}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xc7 */ + {{0, 2}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xc8 */ + {{3, 3}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xc9 */ + {{0, 0}, + {3, 3}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xca */ + {{1, 1}, + {3, 3}, + {6, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xcb */ + {{0, 1}, + {3, 3}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xcc */ + {{2, 3}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xcd */ + {{0, 0}, + {2, 3}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xce */ + {{1, 3}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xcf */ + {{0, 3}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xd0 */ + {{4, 4}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xd1 */ + {{0, 0}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xd2 */ + {{1, 1}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xd3 */ + {{0, 1}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xd4 
*/ + {{2, 2}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {1, 1, 4, 0, /* 0xd5 */ + {{0, 0}, + {2, 2}, + {4, 4}, + {6, 7} + } + }, + {0, 1, 3, 0, /* 0xd6 */ + {{1, 2}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xd7 */ + {{0, 2}, + {4, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xd8 */ + {{3, 4}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xd9 */ + {{0, 0}, + {3, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xda */ + {{1, 1}, + {3, 4}, + {6, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xdb */ + {{0, 1}, + {3, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xdc */ + {{2, 4}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xdd */ + {{0, 0}, + {2, 4}, + {6, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xde */ + {{1, 4}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xdf */ + {{0, 4}, + {6, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xe0 */ + {{5, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xe1 */ + {{0, 0}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xe2 */ + {{1, 1}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xe3 */ + {{0, 1}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xe4 */ + {{2, 2}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xe5 */ + {{0, 0}, + {2, 2}, + {5, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xe6 */ + {{1, 2}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xe7 */ + {{0, 2}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xe8 */ + {{3, 3}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xe9 */ + {{0, 0}, + {3, 3}, + {5, 7}, + {0, 0} + } + }, + {0, 1, 3, 0, /* 0xea */ + {{1, 1}, + {3, 3}, + {5, 7}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xeb */ + {{0, 1}, + {3, 3}, + {5, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xec */ + {{2, 3}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xed */ + {{0, 0}, + {2, 3}, + {5, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xee */ + {{1, 3}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xef */ + {{0, 3}, + {5, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xf0 */ + {{4, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xf1 */ + {{0, 0}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xf2 */ + {{1, 1}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xf3 */ + {{0, 1}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xf4 */ + {{2, 2}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 3, 0, /* 0xf5 */ + {{0, 0}, + {2, 2}, + {4, 7}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xf6 */ + {{1, 2}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xf7 */ + {{0, 2}, + {4, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xf8 */ + {{3, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xf9 */ + {{0, 0}, + {3, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 2, 0, /* 0xfa */ + {{1, 1}, + {3, 7}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xfb */ + {{0, 1}, + {3, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xfc */ + {{2, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 2, 0, /* 0xfd */ + {{0, 0}, + {2, 7}, + {0, 0}, + {0, 0} + } + }, + {0, 1, 1, 0, /* 0xfe */ + {{1, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + }, + {1, 1, 1, 0, /* 0xff */ + {{0, 7}, + {0, 0}, + {0, 0}, + {0, 0} + } + } +}; + + +int +sctp_is_address_in_scope(struct sctp_ifa *ifa, + int ipv4_addr_legal, + int ipv6_addr_legal, + int loopback_scope, + int ipv4_local_scope, + int local_scope, + int site_scope, + int do_update) +{ + if ((loopback_scope == 0) && + (ifa->ifn_p) && 
SCTP_IFN_IS_IFT_LOOP(ifa->ifn_p)) { + /* + * skip loopback if not in scope * + */ + return (0); + } + switch (ifa->address.sa.sa_family) { + case AF_INET: + if (ipv4_addr_legal) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ifa->address.sin; + if (sin->sin_addr.s_addr == 0) { + /* not in scope , unspecified */ + return (0); + } + if ((ipv4_local_scope == 0) && + (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { + /* private address not in scope */ + return (0); + } + } else { + return (0); + } + break; +#ifdef INET6 + case AF_INET6: + if (ipv6_addr_legal) { + struct sockaddr_in6 *sin6; + + /* + * Must update the flags, bummer, which means any + * IFA locks must now be applied HERE <-> + */ + if (do_update) { + sctp_gather_internal_ifa_flags(ifa); + } + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + return (0); + } + /* ok to use deprecated addresses? */ + sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* skip unspecifed addresses */ + return (0); + } + if ( /* (local_scope == 0) && */ + (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) { + return (0); + } + if ((site_scope == 0) && + (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { + return (0); + } + } else { + return (0); + } + break; +#endif + default: + return (0); + } + return (1); +} + +static struct mbuf * +sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa) +{ + struct sctp_paramhdr *parmh; + struct mbuf *mret; + int len; + + if (ifa->address.sa.sa_family == AF_INET) { + len = sizeof(struct sctp_ipv4addr_param); + } else if (ifa->address.sa.sa_family == AF_INET6) { + len = sizeof(struct sctp_ipv6addr_param); + } else { + /* unknown type */ + return (m); + } + if (M_TRAILINGSPACE(m) >= len) { + /* easy side we just drop it on the end */ + parmh = (struct sctp_paramhdr *)(SCTP_BUF_AT(m, SCTP_BUF_LEN(m))); + mret = m; + } else { + /* Need more space */ + mret = m; + while (SCTP_BUF_NEXT(mret) != NULL) { + mret = SCTP_BUF_NEXT(mret); + } + SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(len, 0, M_DONTWAIT, 1, MT_DATA); + if (SCTP_BUF_NEXT(mret) == NULL) { + /* We are hosed, can't add more addresses */ + return (m); + } + mret = SCTP_BUF_NEXT(mret); + parmh = mtod(mret, struct sctp_paramhdr *); + } + /* now add the parameter */ + switch (ifa->address.sa.sa_family) { + case AF_INET: + { + struct sctp_ipv4addr_param *ipv4p; + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&ifa->address.sin; + ipv4p = (struct sctp_ipv4addr_param *)parmh; + parmh->param_type = htons(SCTP_IPV4_ADDRESS); + parmh->param_length = htons(len); + ipv4p->addr = sin->sin_addr.s_addr; + SCTP_BUF_LEN(mret) += len; + break; + } +#ifdef INET6 + case AF_INET6: + { + struct sctp_ipv6addr_param *ipv6p; + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&ifa->address.sin6; + ipv6p = (struct sctp_ipv6addr_param *)parmh; + parmh->param_type = htons(SCTP_IPV6_ADDRESS); + parmh->param_length = htons(len); + memcpy(ipv6p->addr, &sin6->sin6_addr, + sizeof(ipv6p->addr)); + /* clear embedded scope in the address */ + in6_clearscope((struct in6_addr *)ipv6p->addr); + SCTP_BUF_LEN(mret) += len; + break; + } +#endif + default: + return (m); + } + return (mret); +} + + +struct mbuf * +sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, struct sctp_scoping *scope, + struct mbuf *m_at, int cnt_inits_to) +{ + struct sctp_vrf *vrf = NULL; + int cnt, limit_out = 0, total_count; + uint32_t vrf_id; + + vrf_id = inp->def_vrf_id; + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + 
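+		/* Unknown VRF: drop the address-list read lock and hand
+		 * the INIT/INIT-ACK chain back unchanged.
+		 */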
SCTP_IPI_ADDR_RUNLOCK(); + return (m_at); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + struct sctp_ifa *sctp_ifap; + struct sctp_ifn *sctp_ifnp; + + cnt = cnt_inits_to; + if (vrf->total_ifa_count > SCTP_COUNT_LIMIT) { + limit_out = 1; + cnt = SCTP_ADDRESS_LIMIT; + goto skip_count; + } + LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { + if ((scope->loopback_scope == 0) && + SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { + /* + * Skip loopback devices if loopback_scope + * not set + */ + continue; + } + LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { + if (sctp_is_address_in_scope(sctp_ifap, + scope->ipv4_addr_legal, + scope->ipv6_addr_legal, + scope->loopback_scope, + scope->ipv4_local_scope, + scope->local_scope, + scope->site_scope, 1) == 0) { + continue; + } + cnt++; + if (cnt > SCTP_ADDRESS_LIMIT) { + break; + } + } + if (cnt > SCTP_ADDRESS_LIMIT) { + break; + } + } +skip_count: + if (cnt > 1) { + total_count = 0; + LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { + cnt = 0; + if ((scope->loopback_scope == 0) && + SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { + /* + * Skip loopback devices if + * loopback_scope not set + */ + continue; + } + LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { + if (sctp_is_address_in_scope(sctp_ifap, + scope->ipv4_addr_legal, + scope->ipv6_addr_legal, + scope->loopback_scope, + scope->ipv4_local_scope, + scope->local_scope, + scope->site_scope, 0) == 0) { + continue; + } + m_at = sctp_add_addr_to_mbuf(m_at, sctp_ifap); + if (limit_out) { + cnt++; + total_count++; + if (cnt >= 2) { + /* + * two from each + * address + */ + break; + } + if (total_count > SCTP_ADDRESS_LIMIT) { + /* No more addresses */ + break; + } + } + } + } + } + } else { + struct sctp_laddr *laddr; + + cnt = cnt_inits_to; + /* First, how many ? */ + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + continue; + } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) + /* + * Address being deleted by the system, dont + * list. + */ + continue; + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* + * Address being deleted on this ep don't + * list. + */ + continue; + } + if (sctp_is_address_in_scope(laddr->ifa, + scope->ipv4_addr_legal, + scope->ipv6_addr_legal, + scope->loopback_scope, + scope->ipv4_local_scope, + scope->local_scope, + scope->site_scope, 1) == 0) { + continue; + } + cnt++; + } + if (cnt > SCTP_ADDRESS_LIMIT) { + limit_out = 1; + } + /* + * To get through a NAT we only list addresses if we have + * more than one. That way if you just bind a single address + * we let the source of the init dictate our address. 
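+	 * For example, an endpoint bound only to 10.0.0.1 behind a NAT
+	 * advertises no addresses at all; the peer then replies to the
+	 * packet's (translated) source address, the only one that is
+	 * actually reachable from outside.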
+ */ + if (cnt > 1) { + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + cnt = 0; + if (laddr->ifa == NULL) { + continue; + } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) + continue; + + if (sctp_is_address_in_scope(laddr->ifa, + scope->ipv4_addr_legal, + scope->ipv6_addr_legal, + scope->loopback_scope, + scope->ipv4_local_scope, + scope->local_scope, + scope->site_scope, 0) == 0) { + continue; + } + m_at = sctp_add_addr_to_mbuf(m_at, laddr->ifa); + cnt++; + if (cnt >= SCTP_ADDRESS_LIMIT) { + break; + } + } + } + } + SCTP_IPI_ADDR_RUNLOCK(); + return (m_at); +} + +static struct sctp_ifa * +sctp_is_ifa_addr_preferred(struct sctp_ifa *ifa, + uint8_t dest_is_loop, + uint8_t dest_is_priv, + sa_family_t fam) +{ + uint8_t dest_is_global = 0; + + /* dest_is_priv is true if destination is a private address */ + /* dest_is_loop is true if destination is a loopback addresses */ + + /** + * Here we determine if its a preferred address. A preferred address + * means it is the same scope or higher scope then the destination. + * L = loopback, P = private, G = global + * ----------------------------------------- + * src | dest | result + * ---------------------------------------- + * L | L | yes + * ----------------------------------------- + * P | L | yes-v4 no-v6 + * ----------------------------------------- + * G | L | yes-v4 no-v6 + * ----------------------------------------- + * L | P | no + * ----------------------------------------- + * P | P | yes + * ----------------------------------------- + * G | P | no + * ----------------------------------------- + * L | G | no + * ----------------------------------------- + * P | G | no + * ----------------------------------------- + * G | G | yes + * ----------------------------------------- + */ + + if (ifa->address.sa.sa_family != fam) { + /* forget mis-matched family */ + return (NULL); + } + if ((dest_is_priv == 0) && (dest_is_loop == 0)) { + dest_is_global = 1; + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Is destination preferred:"); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ifa->address.sa); + /* Ok the address may be ok */ + if (fam == AF_INET6) { + /* ok to use deprecated addresses? no lets not! 
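+		 * (a deprecated IPv6 source may expire while the
+		 * association is still up, so anything flagged
+		 * SCTP_ADDR_IFA_UNUSEABLE is refused outright)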
*/ + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:1\n"); + return (NULL); + } + if (ifa->src_is_priv && !ifa->src_is_loop) { + if (dest_is_loop) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:2\n"); + return (NULL); + } + } + if (ifa->src_is_glob) { + if (dest_is_loop) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:3\n"); + return (NULL); + } + } + } + /* + * Now that we know what is what, implement or table this could in + * theory be done slicker (it used to be), but this is + * straightforward and easier to validate :-) + */ + SCTPDBG(SCTP_DEBUG_OUTPUT3, "src_loop:%d src_priv:%d src_glob:%d\n", + ifa->src_is_loop, ifa->src_is_priv, ifa->src_is_glob); + SCTPDBG(SCTP_DEBUG_OUTPUT3, "dest_loop:%d dest_priv:%d dest_glob:%d\n", + dest_is_loop, dest_is_priv, dest_is_global); + + if ((ifa->src_is_loop) && (dest_is_priv)) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:4\n"); + return (NULL); + } + if ((ifa->src_is_glob) && (dest_is_priv)) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:5\n"); + return (NULL); + } + if ((ifa->src_is_loop) && (dest_is_global)) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:6\n"); + return (NULL); + } + if ((ifa->src_is_priv) && (dest_is_global)) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:7\n"); + return (NULL); + } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "YES\n"); + /* its a preferred address */ + return (ifa); +} + +static struct sctp_ifa * +sctp_is_ifa_addr_acceptable(struct sctp_ifa *ifa, + uint8_t dest_is_loop, + uint8_t dest_is_priv, + sa_family_t fam) +{ + uint8_t dest_is_global = 0; + + /* + * Here we determine if its a acceptable address. A acceptable + * address means it is the same scope or higher scope but we can + * allow for NAT which means its ok to have a global dest and a + * private src. + * + * L = loopback, P = private, G = global + * ----------------------------------------- src | dest | result + * ----------------------------------------- L | L | yes + * ----------------------------------------- P | L | + * yes-v4 no-v6 ----------------------------------------- G | + * L | yes ----------------------------------------- L | + * P | no ----------------------------------------- P | P + * | yes ----------------------------------------- G | P + * | yes - May not work ----------------------------------------- + * L | G | no ----------------------------------------- P + * | G | yes - May not work + * ----------------------------------------- G | G | yes + * ----------------------------------------- + */ + + if (ifa->address.sa.sa_family != fam) { + /* forget non matching family */ + return (NULL); + } + /* Ok the address may be ok */ + if ((dest_is_loop == 0) && (dest_is_priv == 0)) { + dest_is_global = 1; + } + if (fam == AF_INET6) { + /* ok to use deprecated addresses? */ + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + return (NULL); + } + if (ifa->src_is_priv) { + /* Special case, linklocal to loop */ + if (dest_is_loop) + return (NULL); + } + } + /* + * Now that we know what is what, implement our table. 
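+	 * Restated legibly (same matrix as in the header comment above):
+	 *	L->L yes   P->L yes(v4)/no(v6)       G->L yes
+	 *	L->P no    P->P yes                  G->P yes (may not work)
+	 *	L->G no    P->G yes (may not work)   G->G yes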
This could in + * theory be done slicker (it used to be), but this is + * straightforward and easier to validate :-) + */ + if ((ifa->src_is_loop == 1) && (dest_is_priv)) { + return (NULL); + } + if ((ifa->src_is_loop == 1) && (dest_is_global)) { + return (NULL); + } + /* its an acceptable address */ + return (ifa); +} + +int +sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) +{ + struct sctp_laddr *laddr; + + if (stcb == NULL) { + /* There are no restrictions, no TCB :-) */ + return (0); + } + LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", + __FUNCTION__); + continue; + } + if (laddr->ifa == ifa) { + /* Yes it is on the list */ + return (1); + } + } + return (0); +} + + +int +sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) +{ + struct sctp_laddr *laddr; + + if (ifa == NULL) + return (0); + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", + __FUNCTION__); + continue; + } + if ((laddr->ifa == ifa) && laddr->action == 0) + /* same pointer */ + return (1); + } + return (0); +} + + + +static struct sctp_ifa * +sctp_choose_boundspecific_inp(struct sctp_inpcb *inp, + sctp_route_t * ro, + uint32_t vrf_id, + int non_asoc_addr_ok, + uint8_t dest_is_priv, + uint8_t dest_is_loop, + sa_family_t fam) +{ + struct sctp_laddr *laddr, *starting_point; + void *ifn; + int resettotop = 0; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa, *sifa; + struct sctp_vrf *vrf; + uint32_t ifn_index; + + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) + return (NULL); + + ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); + ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); + sctp_ifn = sctp_find_ifn(ifn, ifn_index); + /* + * first question, is the ifn we will emit on in our list, if so, we + * want such an address. Note that we first looked for a preferred + * address. + */ + if (sctp_ifn) { + /* is a preferred one on the interface we route out? */ + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && + (non_asoc_addr_ok == 0)) + continue; + sifa = sctp_is_ifa_addr_preferred(sctp_ifa, + dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + if (sctp_is_addr_in_ep(inp, sifa)) { + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + } + } + /* + * ok, now we now need to find one on the list of the addresses. We + * can't get one on the emitting interface so let's find first a + * preferred one. If not that an acceptable one otherwise... we + * return NULL. 
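+	 * Each pass starts at inp->next_addr_touse and, on reaching the
+	 * end of sctp_addr_list, wraps to the head exactly once (the
+	 * once_again and once_again_too labels below), so every bound
+	 * address is considered before we fall through to the next pass.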
+ */ + starting_point = inp->next_addr_touse; +once_again: + if (inp->next_addr_touse == NULL) { + inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); + resettotop = 1; + } + for (laddr = inp->next_addr_touse; laddr; + laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { + if (laddr->ifa == NULL) { + /* address has been removed */ + continue; + } + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* address is being deleted */ + continue; + } + sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + if (resettotop == 0) { + inp->next_addr_touse = NULL; + goto once_again; + } + inp->next_addr_touse = starting_point; + resettotop = 0; +once_again_too: + if (inp->next_addr_touse == NULL) { + inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); + resettotop = 1; + } + /* ok, what about an acceptable address in the inp */ + for (laddr = inp->next_addr_touse; laddr; + laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { + if (laddr->ifa == NULL) { + /* address has been removed */ + continue; + } + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* address is being deleted */ + continue; + } + sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + if (resettotop == 0) { + inp->next_addr_touse = NULL; + goto once_again_too; + } + /* + * no address bound can be a source for the destination we are in + * trouble + */ + return (NULL); +} + + + +static struct sctp_ifa * +sctp_choose_boundspecific_stcb(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net, + sctp_route_t * ro, + uint32_t vrf_id, + uint8_t dest_is_priv, + uint8_t dest_is_loop, + int non_asoc_addr_ok, + sa_family_t fam) +{ + struct sctp_laddr *laddr, *starting_point; + void *ifn; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa, *sifa; + uint8_t start_at_beginning = 0; + struct sctp_vrf *vrf; + uint32_t ifn_index; + + /* + * first question, is the ifn we will emit on in our list, if so, we + * want that one. + */ + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) + return (NULL); + + ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); + ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); + sctp_ifn = sctp_find_ifn(ifn, ifn_index); + + /* + * first question, is the ifn we will emit on in our list? If so, + * we want that one. First we look for a preferred. Second, we go + * for an acceptable. 
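+	 * In every sweep a restricted address is skipped, unless
+	 * non_asoc_addr_ok is set and the address is still pending,
+	 * i.e. its ASCONF add has not yet completed.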
+ */ + if (sctp_ifn) { + /* first try for a preferred address on the ep */ + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) + continue; + if (sctp_is_addr_in_ep(inp, sctp_ifa)) { + sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam); + if (sifa == NULL) + continue; + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* on the no-no list */ + continue; + } + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + } + /* next try for an acceptable address on the ep */ + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) + continue; + if (sctp_is_addr_in_ep(inp, sctp_ifa)) { + sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); + if (sifa == NULL) + continue; + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* on the no-no list */ + continue; + } + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + } + + } + /* + * if we can't find one like that then we must look at all addresses + * bound to pick one at first preferable then secondly acceptable. + */ + starting_point = stcb->asoc.last_used_address; +sctp_from_the_top: + if (stcb->asoc.last_used_address == NULL) { + start_at_beginning = 1; + stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); + } + /* search beginning with the last used address */ + for (laddr = stcb->asoc.last_used_address; laddr; + laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { + if (laddr->ifa == NULL) { + /* address has been removed */ + continue; + } + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* address is being deleted */ + continue; + } + sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam); + if (sifa == NULL) + continue; + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* on the no-no list */ + continue; + } + stcb->asoc.last_used_address = laddr; + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + if (start_at_beginning == 0) { + stcb->asoc.last_used_address = NULL; + goto sctp_from_the_top; + } + /* now try for any higher scope than the destination */ + stcb->asoc.last_used_address = starting_point; + start_at_beginning = 0; +sctp_from_the_top2: + if (stcb->asoc.last_used_address == NULL) { + start_at_beginning = 1; + stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); + } + /* search beginning with the last used address */ + for (laddr = stcb->asoc.last_used_address; laddr; + laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { + if (laddr->ifa == NULL) { + /* address has been removed */ + continue; + } + if (laddr->action == SCTP_DEL_IP_ADDRESS) { + /* address is being deleted */ + continue; + } + sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* on the no-no list */ + continue; + } + stcb->asoc.last_used_address = 
laddr; + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + if (start_at_beginning == 0) { + stcb->asoc.last_used_address = NULL; + goto sctp_from_the_top2; + } + return (NULL); +} + +static struct sctp_ifa * +sctp_select_nth_preferred_addr_from_ifn_boundall(struct sctp_ifn *ifn, + struct sctp_tcb *stcb, + int non_asoc_addr_ok, + uint8_t dest_is_loop, + uint8_t dest_is_priv, + int addr_wanted, + sa_family_t fam, + sctp_route_t * ro +) +{ + struct sctp_ifa *ifa, *sifa; + int num_eligible_addr = 0; + +#ifdef INET6 + struct sockaddr_in6 sin6, lsa6; + + if (fam == AF_INET6) { + memcpy(&sin6, &ro->ro_dst, sizeof(struct sockaddr_in6)); + (void)sa6_recoverscope(&sin6); + } +#endif /* INET6 */ + LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { + if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && + (non_asoc_addr_ok == 0)) + continue; + sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; +#ifdef INET6 + if (fam == AF_INET6 && + dest_is_loop && + sifa->src_is_loop && sifa->src_is_priv) { + /* + * don't allow fe80::1 to be a src on loop ::1, we + * don't list it to the peer so we will get an + * abort. + */ + continue; + } + if (fam == AF_INET6 && + IN6_IS_ADDR_LINKLOCAL(&sifa->address.sin6.sin6_addr) && + IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { + /* + * link-local <-> link-local must belong to the same + * scope. + */ + memcpy(&lsa6, &sifa->address.sin6, sizeof(struct sockaddr_in6)); + (void)sa6_recoverscope(&lsa6); + if (sin6.sin6_scope_id != lsa6.sin6_scope_id) { + continue; + } + } +#endif /* INET6 */ + + /* + * Check if the IPv6 address matches to next-hop. In the + * mobile case, old IPv6 address may be not deleted from the + * interface. Then, the interface has previous and new + * addresses. We should use one corresponding to the + * next-hop. (by micchie) + */ +#ifdef INET6 + if (stcb && fam == AF_INET6 && + sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { + if (sctp_v6src_match_nexthop(&sifa->address.sin6, ro) + == 0) { + continue; + } + } +#endif + /* Avoid topologically incorrect IPv4 address */ + if (stcb && fam == AF_INET && + sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { + if (sctp_v4src_match_nexthop(sifa, ro) == 0) { + continue; + } + } + if (stcb) { + if (sctp_is_address_in_scope(ifa, + stcb->asoc.ipv4_addr_legal, + stcb->asoc.ipv6_addr_legal, + stcb->asoc.loopback_scope, + stcb->asoc.ipv4_local_scope, + stcb->asoc.local_scope, + stcb->asoc.site_scope, 0) == 0) { + continue; + } + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* + * It is restricted for some reason.. + * probably not yet added. 
+ */ + continue; + } + } + if (num_eligible_addr >= addr_wanted) { + return (sifa); + } + num_eligible_addr++; + } + return (NULL); +} + + +static int +sctp_count_num_preferred_boundall(struct sctp_ifn *ifn, + struct sctp_tcb *stcb, + int non_asoc_addr_ok, + uint8_t dest_is_loop, + uint8_t dest_is_priv, + sa_family_t fam) +{ + struct sctp_ifa *ifa, *sifa; + int num_eligible_addr = 0; + + LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { + if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && + (non_asoc_addr_ok == 0)) { + continue; + } + sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) { + continue; + } + if (stcb) { + if (sctp_is_address_in_scope(ifa, + stcb->asoc.ipv4_addr_legal, + stcb->asoc.ipv6_addr_legal, + stcb->asoc.loopback_scope, + stcb->asoc.ipv4_local_scope, + stcb->asoc.local_scope, + stcb->asoc.site_scope, 0) == 0) { + continue; + } + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* + * It is restricted for some reason.. + * probably not yet added. + */ + continue; + } + } + num_eligible_addr++; + } + return (num_eligible_addr); +} + +static struct sctp_ifa * +sctp_choose_boundall(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net, + sctp_route_t * ro, + uint32_t vrf_id, + uint8_t dest_is_priv, + uint8_t dest_is_loop, + int non_asoc_addr_ok, + sa_family_t fam) +{ + int cur_addr_num = 0, num_preferred = 0; + void *ifn; + struct sctp_ifn *sctp_ifn, *looked_at = NULL, *emit_ifn; + struct sctp_ifa *sctp_ifa, *sifa; + uint32_t ifn_index; + struct sctp_vrf *vrf; + + /*- + * For boundall we can use any address in the association. + * If non_asoc_addr_ok is set we can use any address (at least in + * theory). So we look for preferred addresses first. If we find one, + * we use it. Otherwise we next try to get an address on the + * interface, which we should be able to do (unless non_asoc_addr_ok + * is false and we are routed out that way). In these cases where we + * can't use the address of the interface we go through all the + * ifn's looking for an address we can use and fill that in. Punting + * means we send back address 0, which will probably cause problems + * actually since then IP will fill in the address of the route ifn, + * which means we probably already rejected it.. i.e. here comes an + * abort :-<. + */ + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) + return (NULL); + + ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); + ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); + emit_ifn = looked_at = sctp_ifn = sctp_find_ifn(ifn, ifn_index); + if (sctp_ifn == NULL) { + /* ?? We don't have this guy ?? */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "No ifn emit interface?\n"); + goto bound_all_plan_b; + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn_index:%d name:%s is emit interface\n", + ifn_index, sctp_ifn->ifn_name); + + if (net) { + cur_addr_num = net->indx_of_eligible_next_to_use; + } + num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, + stcb, + non_asoc_addr_ok, + dest_is_loop, + dest_is_priv, fam); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found %d preferred source addresses for intf:%s\n", + num_preferred, sctp_ifn->ifn_name); + if (num_preferred == 0) { + /* + * no eligible addresses, we must use some other interface + * address if we can find one. 
+ */ + goto bound_all_plan_b; + } + /* + * Ok we have num_eligible_addr set with how many we can use, this + * may vary from call to call due to addresses being deprecated + * etc.. + */ + if (cur_addr_num >= num_preferred) { + cur_addr_num = 0; + } + /* + * select the nth address from the list (where cur_addr_num is the + * nth) and 0 is the first one, 1 is the second one etc... + */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "cur_addr_num:%d\n", cur_addr_num); + + sctp_ifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, stcb, non_asoc_addr_ok, dest_is_loop, + dest_is_priv, cur_addr_num, fam, ro); + + /* if sctp_ifa is NULL something changed??, fall to plan b. */ + if (sctp_ifa) { + atomic_add_int(&sctp_ifa->refcount, 1); + if (net) { + /* save off where the next one we will want */ + net->indx_of_eligible_next_to_use = cur_addr_num + 1; + } + return (sctp_ifa); + } + /* + * plan_b: Look at all interfaces and find a preferred address. If + * no preferred fall through to plan_c. + */ +bound_all_plan_b: + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan B\n"); + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Examine interface %s\n", + sctp_ifn->ifn_name); + if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + /* wrong base scope */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "skip\n"); + continue; + } + if ((sctp_ifn == looked_at) && looked_at) { + /* already looked at this guy */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "already seen\n"); + continue; + } + num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, stcb, non_asoc_addr_ok, + dest_is_loop, dest_is_priv, fam); + SCTPDBG(SCTP_DEBUG_OUTPUT2, + "Found ifn:%p %d preferred source addresses\n", + ifn, num_preferred); + if (num_preferred == 0) { + /* None on this interface. */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefered -- skipping to next\n"); + continue; + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, + "num preferred:%d on interface:%p cur_addr_num:%d\n", + num_preferred, sctp_ifn, cur_addr_num); + + /* + * Ok we have num_eligible_addr set with how many we can + * use, this may vary from call to call due to addresses + * being deprecated etc.. 
+ */ + if (cur_addr_num >= num_preferred) { + cur_addr_num = 0; + } + sifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, stcb, non_asoc_addr_ok, dest_is_loop, + dest_is_priv, cur_addr_num, fam, ro); + if (sifa == NULL) + continue; + if (net) { + net->indx_of_eligible_next_to_use = cur_addr_num + 1; + SCTPDBG(SCTP_DEBUG_OUTPUT2, "we selected %d\n", + cur_addr_num); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Source:"); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Dest:"); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &net->ro._l_addr.sa); + } + atomic_add_int(&sifa->refcount, 1); + return (sifa); + + } + + /* plan_c: do we have an acceptable address on the emit interface */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan C: find acceptable on interface\n"); + if (emit_ifn == NULL) { + goto plan_d; + } + LIST_FOREACH(sctp_ifa, &emit_ifn->ifalist, next_ifa) { + if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && + (non_asoc_addr_ok == 0)) + continue; + sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + if (stcb) { + if (sctp_is_address_in_scope(sifa, + stcb->asoc.ipv4_addr_legal, + stcb->asoc.ipv6_addr_legal, + stcb->asoc.loopback_scope, + stcb->asoc.ipv4_local_scope, + stcb->asoc.local_scope, + stcb->asoc.site_scope, 0) == 0) { + continue; + } + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* + * It is restricted for some reason.. + * probably not yet added. + */ + continue; + } + } + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } +plan_d: + /* + * plan_d: We are in trouble. No preferred address on the emit + * interface. And not even a preferred address on all interfaces. Go + * out and see if we can find an acceptable address somewhere + * amongst all interfaces. + */ + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan D\n"); + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + /* wrong base scope */ + continue; + } + if ((sctp_ifn == looked_at) && looked_at) + /* already looked at this guy */ + continue; + + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && + (non_asoc_addr_ok == 0)) + continue; + sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, + dest_is_loop, + dest_is_priv, fam); + if (sifa == NULL) + continue; + if (stcb) { + if (sctp_is_address_in_scope(sifa, + stcb->asoc.ipv4_addr_legal, + stcb->asoc.ipv6_addr_legal, + stcb->asoc.loopback_scope, + stcb->asoc.ipv4_local_scope, + stcb->asoc.local_scope, + stcb->asoc.site_scope, 0) == 0) { + continue; + } + if (((non_asoc_addr_ok == 0) && + (sctp_is_addr_restricted(stcb, sifa))) || + (non_asoc_addr_ok && + (sctp_is_addr_restricted(stcb, sifa)) && + (!sctp_is_addr_pending(stcb, sifa)))) { + /* + * It is restricted for some + * reason.. probably not yet added. + */ + continue; + } + } + atomic_add_int(&sifa->refcount, 1); + return (sifa); + } + } + /* + * Ok we can find NO address to source from that is not on our + * restricted list and non_asoc_address is NOT ok, or it is on our + * restricted list. 
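Stripped of the logging, scoping, and rotation details, the bound-all selection above is a four-step fallback ladder. The sketch below only names the steps; all four helpers are hypothetical stand-ins for the passes coded in this function:

struct sctp_ifa;			/* opaque for this sketch */

/* hypothetical stand-ins for the four passes above */
extern struct sctp_ifa *nth_preferred_on_emit_ifn(void);
extern struct sctp_ifa *first_preferred_on_other_ifns(void);
extern struct sctp_ifa *first_acceptable_on_emit_ifn(void);
extern struct sctp_ifa *first_acceptable_anywhere(void);

static struct sctp_ifa *
choose_boundall_sketch(void)
{
	struct sctp_ifa *a;

	if ((a = nth_preferred_on_emit_ifn()) != NULL)		/* plan A */
		return (a);
	if ((a = first_preferred_on_other_ifns()) != NULL)	/* plan B */
		return (a);
	if ((a = first_acceptable_on_emit_ifn()) != NULL)	/* plan C */
		return (a);
	return (first_acceptable_anywhere());			/* plan D: may be NULL */
}
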
We can't source to it :-(
+ */
+	return (NULL);
+}
+
+
+
+/* tcb may be NULL */
+struct sctp_ifa *
+sctp_source_address_selection(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb,
+    sctp_route_t * ro,
+    struct sctp_nets *net,
+    int non_asoc_addr_ok, uint32_t vrf_id)
+{
+	struct sockaddr_in *to = (struct sockaddr_in *)&ro->ro_dst;
+
+#ifdef INET6
+	struct sockaddr_in6 *to6 = (struct sockaddr_in6 *)&ro->ro_dst;
+
+#endif
+	struct sctp_ifa *answer;
+	uint8_t dest_is_priv, dest_is_loop;
+	sa_family_t fam;
+
+	/*-
+	 * Rules:
+	 * - Find the route if needed, cache if I can.
+	 * - Look at the interface address in the route: is it in the bound
+	 *   list? If so we have the best source.
+	 * - If not we must rotate amongst the addresses.
+	 *
+	 * Caveats and issues:
+	 *
+	 * Do we need to pay attention to scope? We can have a private address
+	 * or a global address we are sourcing or sending to. So if we draw
+	 * it out:
+	 *
+	 * For V4:
+	 * ------------------------------------------
+	 *   source    *   dest     *  result
+	 * ------------------------------------------
+	 *   Private   *  Global    *  NAT
+	 * ------------------------------------------
+	 *   Private   *  Private   *  No problem
+	 * ------------------------------------------
+	 *   Global    *  Private   *  Huh, how will this work?
+	 * ------------------------------------------
+	 *   Global    *  Global    *  No problem
+	 * ------------------------------------------
+	 *
+	 * For V6:
+	 * ------------------------------------------
+	 *   source    *   dest     *  result
+	 * ------------------------------------------
+	 *   Linklocal *  Global    *
+	 * ------------------------------------------
+	 *   Linklocal *  Linklocal *  No problem
+	 * ------------------------------------------
+	 *   Global    *  Linklocal *  Huh, how will this work?
+	 * ------------------------------------------
+	 *   Global    *  Global    *  No problem
+	 * ------------------------------------------
+	 *
+	 * And then we add to that what happens if there are multiple addresses
+	 * assigned to an interface. Remember the ifa on an ifn is a linked
+	 * list of addresses. So one interface can have more than one IP
+	 * address. What happens if we have both a private and a global
+	 * address? Do we then use the context of the destination to sort out
+	 * which one is best? And what about NATs: sending Private->Global may
+	 * get you a NAT translation, or should you prefer the global address
+	 * that is on the interface?
+	 *
+	 * Decisions:
+	 *
+	 * - count the number of addresses on the interface.
+	 * - if it is one, no problem except the Private->Global case; for
+	 *   that we will assume a NAT out there.
+	 * - if there are more than one, then we need to worry about scope P
+	 *   or G. We should prefer G -> G and P -> P if possible.
+	 *   Then as a secondary fall back to mixed types G->P being a last
+	 *   ditch one.
+	 * - The above all works for bound all, but bound specific we need to
+	 *   use the same concept but instead only consider the bound
+	 *   addresses. If the bound set is NOT assigned to the interface then
+	 *   we must use rotation amongst the bound addresses.
+	 */
+	if (ro->ro_rt == NULL) {
+		/*
+		 * Need a route to cache.
+ */ + SCTP_RTALLOC(ro, vrf_id); + } + if (ro->ro_rt == NULL) { + return (NULL); + } + fam = to->sin_family; + dest_is_priv = dest_is_loop = 0; + /* Setup our scopes for the destination */ + switch (fam) { + case AF_INET: + /* Scope based on outbound address */ + if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { + dest_is_loop = 1; + if (net != NULL) { + /* mark it as local */ + net->addr_is_local = 1; + } + } else if ((IN4_ISPRIVATE_ADDRESS(&to->sin_addr))) { + dest_is_priv = 1; + } + break; +#ifdef INET6 + case AF_INET6: + /* Scope based on outbound address */ + if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr) || + SCTP_ROUTE_IS_REAL_LOOP(ro)) { + /* + * If the address is a loopback address, which + * consists of "::1" OR "fe80::1%lo0", we are + * loopback scope. But we don't use dest_is_priv + * (link local addresses). + */ + dest_is_loop = 1; + if (net != NULL) { + /* mark it as local */ + net->addr_is_local = 1; + } + } else if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { + dest_is_priv = 1; + } + break; +#endif + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "Select source addr for:"); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)to); + SCTP_IPI_ADDR_RLOCK(); + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* + * Bound all case + */ + answer = sctp_choose_boundall(inp, stcb, net, ro, vrf_id, + dest_is_priv, dest_is_loop, + non_asoc_addr_ok, fam); + SCTP_IPI_ADDR_RUNLOCK(); + return (answer); + } + /* + * Subset bound case + */ + if (stcb) { + answer = sctp_choose_boundspecific_stcb(inp, stcb, net, ro, + vrf_id, dest_is_priv, + dest_is_loop, + non_asoc_addr_ok, fam); + } else { + answer = sctp_choose_boundspecific_inp(inp, ro, vrf_id, + non_asoc_addr_ok, + dest_is_priv, + dest_is_loop, fam); + } + SCTP_IPI_ADDR_RUNLOCK(); + return (answer); +} + +static int +sctp_find_cmsg(int c_type, void *data, struct mbuf *control, int cpsize) +{ + struct cmsghdr cmh; + int tlen, at; + + tlen = SCTP_BUF_LEN(control); + at = 0; + /* + * Independent of how many mbufs, find the c_type inside the control + * structure and copy out the data. + */ + while (at < tlen) { + if ((tlen - at) < (int)CMSG_ALIGN(sizeof(cmh))) { + /* not enough room for one more we are done. */ + return (0); + } + m_copydata(control, at, sizeof(cmh), (caddr_t)&cmh); + if (((int)cmh.cmsg_len + at) > tlen) { + /* + * this is real messed up since there is not enough + * data here to cover the cmsg header. We are done. 
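sctp_find_cmsg() below does its scan by hand with m_copydata() because the control data lives in an mbuf chain; in userland the same scan is normally written with the standard CMSG macros. A sketch under that assumption (find_cmsg is a hypothetical name, not part of this source):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Scan a msghdr's control buffer for one IPPROTO_SCTP cmsg type and
 * copy out cpsize bytes of its data.  Returns 1 on success, 0 if the
 * cmsg is absent or its payload is too short.
 */
static int
find_cmsg(int c_type, void *data, size_t cpsize, struct msghdr *msg)
{
	struct cmsghdr *cmh;

	for (cmh = CMSG_FIRSTHDR(msg); cmh != NULL;
	    cmh = CMSG_NXTHDR(msg, cmh)) {
		if (cmh->cmsg_level == IPPROTO_SCTP &&
		    cmh->cmsg_type == c_type) {
			if (cmh->cmsg_len < CMSG_LEN(cpsize))
				return (0);	/* truncated payload */
			memcpy(data, CMSG_DATA(cmh), cpsize);
			return (1);
		}
	}
	return (0);
}
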
+ */ + return (0); + } + if ((cmh.cmsg_level == IPPROTO_SCTP) && + (c_type == cmh.cmsg_type)) { + /* found the one we want, copy it out */ + at += CMSG_ALIGN(sizeof(struct cmsghdr)); + if ((int)(cmh.cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr))) < cpsize) { + /* + * space of cmsg_len after header not big + * enough + */ + return (0); + } + m_copydata(control, at, cpsize, data); + return (1); + } else { + at += CMSG_ALIGN(cmh.cmsg_len); + if (cmh.cmsg_len == 0) { + break; + } + } + } + /* not found */ + return (0); +} + +static struct mbuf * +sctp_add_cookie(struct sctp_inpcb *inp, struct mbuf *init, int init_offset, + struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t ** signature) +{ + struct mbuf *copy_init, *copy_initack, *m_at, *sig, *mret; + struct sctp_state_cookie *stc; + struct sctp_paramhdr *ph; + uint8_t *foo; + int sig_offset; + uint16_t cookie_sz; + + mret = NULL; + mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) + + sizeof(struct sctp_paramhdr)), 0, + M_DONTWAIT, 1, MT_DATA); + if (mret == NULL) { + return (NULL); + } + copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_DONTWAIT); + if (copy_init == NULL) { + sctp_m_freem(mret); + return (NULL); + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = copy_init; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL, + M_DONTWAIT); + if (copy_initack == NULL) { + sctp_m_freem(mret); + sctp_m_freem(copy_init); + return (NULL); + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = copy_initack; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + /* easy side we just drop it on the end */ + ph = mtod(mret, struct sctp_paramhdr *); + SCTP_BUF_LEN(mret) = sizeof(struct sctp_state_cookie) + + sizeof(struct sctp_paramhdr); + stc = (struct sctp_state_cookie *)((caddr_t)ph + + sizeof(struct sctp_paramhdr)); + ph->param_type = htons(SCTP_STATE_COOKIE); + ph->param_length = 0; /* fill in at the end */ + /* Fill in the stc cookie data */ + memcpy(stc, stc_in, sizeof(struct sctp_state_cookie)); + + /* tack the INIT and then the INIT-ACK onto the chain */ + cookie_sz = 0; + m_at = mret; + for (m_at = mret; m_at; m_at = SCTP_BUF_NEXT(m_at)) { + cookie_sz += SCTP_BUF_LEN(m_at); + if (SCTP_BUF_NEXT(m_at) == NULL) { + SCTP_BUF_NEXT(m_at) = copy_init; + break; + } + } + + for (m_at = copy_init; m_at; m_at = SCTP_BUF_NEXT(m_at)) { + cookie_sz += SCTP_BUF_LEN(m_at); + if (SCTP_BUF_NEXT(m_at) == NULL) { + SCTP_BUF_NEXT(m_at) = copy_initack; + break; + } + } + + for (m_at = copy_initack; m_at; m_at = SCTP_BUF_NEXT(m_at)) { + cookie_sz += SCTP_BUF_LEN(m_at); + if (SCTP_BUF_NEXT(m_at) == NULL) { + break; + } + } + sig = sctp_get_mbuf_for_msg(SCTP_SECRET_SIZE, 0, M_DONTWAIT, 1, MT_DATA); + if (sig == NULL) { + /* no space, so free the entire chain */ + sctp_m_freem(mret); + return (NULL); + } + SCTP_BUF_LEN(sig) = 0; + SCTP_BUF_NEXT(m_at) = sig; + sig_offset = 0; + foo = (uint8_t *) (mtod(sig, caddr_t)+sig_offset); + memset(foo, 0, SCTP_SIGNATURE_SIZE); + *signature = foo; + SCTP_BUF_LEN(sig) += SCTP_SIGNATURE_SIZE; + cookie_sz += SCTP_SIGNATURE_SIZE; + ph->param_length = htons(cookie_sz); + return (mret); +} + + 
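The chain assembled by sctp_add_cookie() has a fixed shape: the cookie parameter header and state cookie in the first mbuf, then the copied INIT, the copied INIT-ACK, and finally the zeroed signature mbuf, with param_length patched at the very end to cover the whole concatenation. A sketch of that length computation (cookie_param_len is a hypothetical helper; 4 is sizeof(struct sctp_paramhdr), two uint16_t fields):

#include <stdint.h>

/*
 * Total param_length for the state-cookie parameter:
 *   [ paramhdr | state cookie | INIT copy | INIT-ACK copy | signature ]
 */
static uint16_t
cookie_param_len(uint16_t cookie_len, uint16_t init_len,
    uint16_t initack_len, uint16_t sig_len)
{
	return ((uint16_t)(4 + cookie_len + init_len + initack_len + sig_len));
}
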
+static uint8_t +sctp_get_ect(struct sctp_tcb *stcb, + struct sctp_tmit_chunk *chk) +{ + uint8_t this_random; + + /* Huh? */ + if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 0) + return (0); + + if (SCTP_BASE_SYSCTL(sctp_ecn_nonce) == 0) + /* no nonce, always return ECT0 */ + return (SCTP_ECT0_BIT); + + if (stcb->asoc.peer_supports_ecn_nonce == 0) { + /* Peer does NOT support it, so we send a ECT0 only */ + return (SCTP_ECT0_BIT); + } + if (chk == NULL) + return (SCTP_ECT0_BIT); + + if ((stcb->asoc.hb_random_idx > 3) || + ((stcb->asoc.hb_random_idx == 3) && + (stcb->asoc.hb_ect_randombit > 7))) { + uint32_t rndval; + +warp_drive_sa: + rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); + memcpy(stcb->asoc.hb_random_values, &rndval, + sizeof(stcb->asoc.hb_random_values)); + this_random = stcb->asoc.hb_random_values[0]; + stcb->asoc.hb_random_idx = 0; + stcb->asoc.hb_ect_randombit = 0; + } else { + if (stcb->asoc.hb_ect_randombit > 7) { + stcb->asoc.hb_ect_randombit = 0; + stcb->asoc.hb_random_idx++; + if (stcb->asoc.hb_random_idx > 3) { + goto warp_drive_sa; + } + } + this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx]; + } + if ((this_random >> stcb->asoc.hb_ect_randombit) & 0x01) { + if (chk != NULL) + /* ECN Nonce stuff */ + chk->rec.data.ect_nonce = SCTP_ECT1_BIT; + stcb->asoc.hb_ect_randombit++; + return (SCTP_ECT1_BIT); + } else { + stcb->asoc.hb_ect_randombit++; + return (SCTP_ECT0_BIT); + } +} + +static int +sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, /* may be NULL */ + struct sctp_nets *net, + struct sockaddr *to, + struct mbuf *m, + uint32_t auth_offset, + struct sctp_auth_chunk *auth, + uint16_t auth_keyid, + int nofragment_flag, + int ecn_ok, + struct sctp_tmit_chunk *chk, + int out_of_asoc_ok, + uint16_t src_port, + uint16_t dest_port, + uint32_t v_tag, + uint16_t port, + int so_locked, +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif + union sctp_sockstore *over_addr +) +/* nofragment_flag to tell if IP_DF should be set (IPv4 only) */ +{ + /* + * Given a mbuf chain (via SCTP_BUF_NEXT()) that holds a packet + * header WITH an SCTPHDR but no IP header, endpoint inp and sa + * structure: - fill in the HMAC digest of any AUTH chunk in the + * packet. - calculate and fill in the SCTP checksum. - prepend an + * IP address header. - if boundall use INADDR_ANY. - if + * boundspecific do source address selection. - set fragmentation + * option for ipV4. - On return from IP output, check/adjust mtu + * size of output interface and smallest_mtu size as well. 
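For reference, sctp_get_ect() above amortizes randomness: one 32-bit draw supplies 32 single-bit ECT0/ECT1 nonce decisions, tracked by a byte index and a bit index. A userland sketch of the same bit-budget bookkeeping (random() stands in for sctp_select_initial_TSN(); state starts zeroed, so the first real draw happens after the initial 32 bits are consumed):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct ect_state {
	uint8_t vals[4];	/* hb_random_values */
	int idx;		/* hb_random_idx: which byte */
	int bit;		/* hb_ect_randombit: which bit */
};

/* Return the next nonce bit: 1 selects ECT1, 0 selects ECT0. */
static int
next_ect_bit(struct ect_state *s)
{
	if (s->bit > 7) {		/* current byte exhausted */
		s->bit = 0;
		s->idx++;
	}
	if (s->idx > 3) {		/* all 32 bits used: refresh */
		uint32_t r = (uint32_t)random();

		memcpy(s->vals, &r, sizeof(s->vals));
		s->idx = 0;
		s->bit = 0;
	}
	return ((s->vals[s->idx] >> s->bit++) & 0x01);
}
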
+ */ + /* Will need ifdefs around this */ + struct mbuf *o_pak; + struct mbuf *newm; + struct sctphdr *sctphdr; + int packet_length; + int ret; + uint32_t vrf_id; + sctp_route_t *ro = NULL; + struct udphdr *udp = NULL; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so = NULL; + +#endif + + if ((net) && (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); + sctp_m_freem(m); + return (EFAULT); + } + if (stcb) { + vrf_id = stcb->asoc.vrf_id; + } else { + vrf_id = inp->def_vrf_id; + } + + /* fill in the HMAC digest for any AUTH chunk in the packet */ + if ((auth != NULL) && (stcb != NULL)) { + sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid); + } + if (to->sa_family == AF_INET) { + struct ip *ip = NULL; + sctp_route_t iproute; + uint8_t tos_value; + int len; + + len = sizeof(struct ip) + sizeof(struct sctphdr); + if (port) { + len += sizeof(struct udphdr); + } + newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA); + if (newm == NULL) { + sctp_m_freem(m); + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_ALIGN_TO_END(newm, len); + SCTP_BUF_LEN(newm) = len; + SCTP_BUF_NEXT(newm) = m; + m = newm; + packet_length = sctp_calculate_len(m); + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = (sizeof(struct ip) >> 2); + if (net) { + tos_value = net->tos_flowlabel & 0x000000ff; + } else { + tos_value = inp->ip_inp.inp.inp_ip_tos; + } + if ((nofragment_flag) && (port == 0)) { + ip->ip_off = IP_DF; + } else + ip->ip_off = 0; + + /* FreeBSD has a function for ip_id's */ + ip->ip_id = ip_newid(); + + ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl; + ip->ip_len = packet_length; + if (stcb) { + if ((stcb->asoc.ecn_allowed) && ecn_ok) { + /* Enable ECN */ + ip->ip_tos = ((u_char)(tos_value & 0xfc) | sctp_get_ect(stcb, chk)); + } else { + /* No ECN */ + ip->ip_tos = (u_char)(tos_value & 0xfc); + } + } else { + /* no association at all */ + ip->ip_tos = (tos_value & 0xfc); + } + if (port) { + ip->ip_p = IPPROTO_UDP; + } else { + ip->ip_p = IPPROTO_SCTP; + } + ip->ip_sum = 0; + if (net == NULL) { + ro = &iproute; + memset(&iproute, 0, sizeof(iproute)); + memcpy(&ro->ro_dst, to, to->sa_len); + } else { + ro = (sctp_route_t *) & net->ro; + } + /* Now the address selection part */ + ip->ip_dst.s_addr = ((struct sockaddr_in *)to)->sin_addr.s_addr; + + /* call the routine to select the src address */ + if (net && out_of_asoc_ok == 0) { + if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } + if (net->src_addr_selected == 0) { + /* Cache the source address */ + net->ro._s_addr = sctp_source_address_selection(inp, stcb, + ro, net, 0, + vrf_id); + net->src_addr_selected = 1; + } + if (net->ro._s_addr == NULL) { + /* No route to host */ + net->src_addr_selected = 0; + goto no_route; + } + ip->ip_src = net->ro._s_addr->address.sin.sin_addr; + } else { + if (over_addr == NULL) { + struct sctp_ifa *_lsrc; + + _lsrc = sctp_source_address_selection(inp, stcb, ro, + net, + out_of_asoc_ok, + vrf_id); + if (_lsrc == NULL) { + goto no_route; + } + ip->ip_src = _lsrc->address.sin.sin_addr; + sctp_free_ifa(_lsrc); + } else { + ip->ip_src = over_addr->sin.sin_addr; + SCTP_RTALLOC(ro, vrf_id); + } + } + if (port) { + udp = (struct udphdr 
*)((caddr_t)ip + sizeof(struct ip)); + udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + udp->uh_dport = port; + udp->uh_ulen = htons(packet_length - sizeof(struct ip)); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); + sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); + } else { + sctphdr = (struct sctphdr *)((caddr_t)ip + sizeof(struct ip)); + } + + sctphdr->src_port = src_port; + sctphdr->dest_port = dest_port; + sctphdr->v_tag = v_tag; + sctphdr->checksum = 0; + + /* + * If source address selection fails and we find no route + * then the ip_output should fail as well with a + * NO_ROUTE_TO_HOST type error. We probably should catch + * that somewhere and abort the association right away + * (assuming this is an INIT being sent). + */ + if ((ro->ro_rt == NULL)) { + /* + * src addr selection failed to find a route (or + * valid source addr), so we can't get there from + * here (yet)! + */ + no_route: + SCTPDBG(SCTP_DEBUG_OUTPUT1, + "%s: dropped packet - no valid source addr\n", + __FUNCTION__); + if (net) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, + "Destination was "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT1, + &net->ro._l_addr.sa); + if (net->dest_state & SCTP_ADDR_CONFIRMED) { + if ((net->dest_state & SCTP_ADDR_REACHABLE) && stcb) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "no route takes interface %p down\n", net); + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, + stcb, + SCTP_FAILED_THRESHOLD, + (void *)net, + so_locked); + net->dest_state &= ~SCTP_ADDR_REACHABLE; + net->dest_state |= SCTP_ADDR_NOT_REACHABLE; + /* + * JRS 5/14/07 - If a + * destination is + * unreachable, the PF bit + * is turned off. This + * allows an unambiguous use + * of the PF bit for + * destinations that are + * reachable but potentially + * failed. If the + * destination is set to the + * unreachable state, also + * set the destination to + * the PF state. + */ + /* + * Add debug message here if + * destination is not in PF + * state. + */ + /* + * Stop any running T3 + * timers here? 
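The no_route block that follows packs three recovery steps around the EHOSTUNREACH return: mark the path unreachable, notify the ULP, and re-home the primary destination if the failed path was primary. Reduced to its skeleton (every type and helper below is a hypothetical stand-in, not this source's API):

struct path { int reachable; };
struct assoc { struct path *primary; };

/* stand-ins for sctp_ulp_notify() and sctp_find_alternate_net() */
extern void notify_interface_down(struct assoc *, struct path *);
extern struct path *find_alternate(struct assoc *, struct path *);

static void
no_route_recovery(struct assoc *a, struct path *net)
{
	net->reachable = 0;
	notify_interface_down(a, net);
	if (a->primary == net)
		a->primary = find_alternate(a, net);
}
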
+ */ + if ((stcb->asoc.sctp_cmt_on_off == 1) && + (stcb->asoc.sctp_cmt_pf > 0)) { + net->dest_state &= ~SCTP_ADDR_PF; + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Destination %p moved from PF to unreachable.\n", + net); + } + } + } + if (stcb) { + if (net == stcb->asoc.primary_destination) { + /* need a new primary */ + struct sctp_nets *alt; + + alt = sctp_find_alternate_net(stcb, net, 0); + if (alt != net) { + if (sctp_set_primary_addr(stcb, + (struct sockaddr *)NULL, + alt) == 0) { + net->dest_state |= SCTP_ADDR_WAS_PRIMARY; + if (net->ro._s_addr) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + } + net->src_addr_selected = 0; + } + } + } + } + } + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); + sctp_m_freem(m); + return (EHOSTUNREACH); + } + if (ro != &iproute) { + memcpy(&iproute, ro, sizeof(*ro)); + } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv4 output routine from low level src addr:%x\n", + (uint32_t) (ntohl(ip->ip_src.s_addr))); + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n", + (uint32_t) (ntohl(ip->ip_dst.s_addr))); + SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n", + ro->ro_rt); + + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { + /* failed to prepend data, give up */ + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + sctp_m_freem(m); + return (ENOMEM); + } +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(m, packet_length); +#endif + SCTP_ATTACH_CHAIN(o_pak, m, packet_length); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + if (!(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) && + (stcb) && + (stcb->asoc.loopback_scope))) { + sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip) + sizeof(struct udphdr)); + SCTP_STAT_INCR(sctps_sendswcrc); + } else { + SCTP_STAT_INCR(sctps_sendnocrc); + } +#endif + SCTP_ENABLE_UDP_CSUM(o_pak); + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + m->m_pkthdr.csum_flags = CSUM_SCTP; + m->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + /* send it out. 
table id is taken from stcb */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { + so = SCTP_INP_SO(inp); + SCTP_SOCKET_UNLOCK(so, 0); + } +#endif + SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 0); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + } +#endif + SCTP_STAT_INCR(sctps_sendpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outpackets); + if (ret) + SCTP_STAT_INCR(sctps_senderrors); + + SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret); + if (net == NULL) { + /* free tempy routes */ + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } else { + /* PMTU check versus smallest asoc MTU goes here */ + if ((ro->ro_rt != NULL) && + (net->ro._s_addr)) { + uint32_t mtu; + + mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); + if (net->port) { + mtu -= sizeof(struct udphdr); + } + if (mtu && (stcb->asoc.smallest_mtu > mtu)) { + sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + net->mtu = mtu; + } + } else if (ro->ro_rt == NULL) { + /* route was freed */ + if (net->ro._s_addr && + net->src_addr_selected) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + } + net->src_addr_selected = 0; + } + } + return (ret); + } +#ifdef INET6 + else if (to->sa_family == AF_INET6) { + uint32_t flowlabel; + struct ip6_hdr *ip6h; + struct route_in6 ip6route; + struct ifnet *ifp; + u_char flowTop; + uint16_t flowBottom; + u_char tosBottom, tosTop; + struct sockaddr_in6 *sin6, tmp, *lsa6, lsa6_tmp; + int prev_scope = 0; + struct sockaddr_in6 lsa6_storage; + int error; + u_short prev_port = 0; + int len; + + if (net != NULL) { + flowlabel = net->tos_flowlabel; + } else { + flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; + } + + len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr); + if (port) { + len += sizeof(struct udphdr); + } + newm = sctp_get_mbuf_for_msg(len, 1, M_DONTWAIT, 1, MT_DATA); + if (newm == NULL) { + sctp_m_freem(m); + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_ALIGN_TO_END(newm, len); + SCTP_BUF_LEN(newm) = len; + SCTP_BUF_NEXT(newm) = m; + m = newm; + packet_length = sctp_calculate_len(m); + + ip6h = mtod(m, struct ip6_hdr *); + /* + * We assume here that inp_flow is in host byte order within + * the TCB! 
+ */ + flowBottom = flowlabel & 0x0000ffff; + flowTop = ((flowlabel & 0x000f0000) >> 16); + tosTop = (((flowlabel & 0xf0) >> 4) | IPV6_VERSION); + /* protect *sin6 from overwrite */ + sin6 = (struct sockaddr_in6 *)to; + tmp = *sin6; + sin6 = &tmp; + + /* KAME hack: embed scopeid */ + if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + if (net == NULL) { + memset(&ip6route, 0, sizeof(ip6route)); + ro = (sctp_route_t *) & ip6route; + memcpy(&ro->ro_dst, sin6, sin6->sin6_len); + } else { + ro = (sctp_route_t *) & net->ro; + } + if (stcb != NULL) { + if ((stcb->asoc.ecn_allowed) && ecn_ok) { + /* Enable ECN */ + tosBottom = (((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) | sctp_get_ect(stcb, chk)) << 4); + } else { + /* No ECN */ + tosBottom = ((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) << 4); + } + } else { + /* we could get no asoc if it is a O-O-T-B packet */ + tosBottom = ((((struct in6pcb *)inp)->in6p_flowinfo & 0x0c) << 4); + } + ip6h->ip6_flow = htonl(((tosTop << 24) | ((tosBottom | flowTop) << 16) | flowBottom)); + if (port) { + ip6h->ip6_nxt = IPPROTO_UDP; + } else { + ip6h->ip6_nxt = IPPROTO_SCTP; + } + ip6h->ip6_plen = (packet_length - sizeof(struct ip6_hdr)); + ip6h->ip6_dst = sin6->sin6_addr; + + /* + * Add SRC address selection here: we can only reuse to a + * limited degree the kame src-addr-sel, since we can try + * their selection but it may not be bound. + */ + bzero(&lsa6_tmp, sizeof(lsa6_tmp)); + lsa6_tmp.sin6_family = AF_INET6; + lsa6_tmp.sin6_len = sizeof(lsa6_tmp); + lsa6 = &lsa6_tmp; + if (net && out_of_asoc_ok == 0) { + if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + ro->ro_rt = NULL; + } + } + if (net->src_addr_selected == 0) { + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + /* KAME hack: embed scopeid */ + if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + /* Cache the source address */ + net->ro._s_addr = sctp_source_address_selection(inp, + stcb, + ro, + net, + 0, + vrf_id); + (void)sa6_recoverscope(sin6); + net->src_addr_selected = 1; + } + if (net->ro._s_addr == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "V6:No route to host\n"); + net->src_addr_selected = 0; + goto no_route; + } + lsa6->sin6_addr = net->ro._s_addr->address.sin6.sin6_addr; + } else { + sin6 = (struct sockaddr_in6 *)&ro->ro_dst; + /* KAME hack: embed scopeid */ + if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + if (over_addr == NULL) { + struct sctp_ifa *_lsrc; + + _lsrc = sctp_source_address_selection(inp, stcb, ro, + net, + out_of_asoc_ok, + vrf_id); + if (_lsrc == NULL) { + goto no_route; + } + lsa6->sin6_addr = _lsrc->address.sin6.sin6_addr; + sctp_free_ifa(_lsrc); + } else { + lsa6->sin6_addr = over_addr->sin6.sin6_addr; + SCTP_RTALLOC(ro, vrf_id); + } + (void)sa6_recoverscope(sin6); + } + lsa6->sin6_port = inp->sctp_lport; + + if (ro->ro_rt == NULL) { + /* + * src addr selection failed to find a route (or + * valid source addr), so we can't get there from + * here! 
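The flow-word assembly above is targeting the standard IPv6 header layout: a 4-bit version, 8-bit traffic class, and 20-bit flow label packed into one 32-bit word. For comparison, a direct construction of that word under the RFC 2460 layout (make_ip6_flow is a hypothetical helper; the kernel code splits the fields differently because it carries them pre-packed in inp_flow):

#include <stdint.h>
#include <arpa/inet.h>

/*
 * bits 31..28: version (6)
 * bits 27..20: traffic class
 * bits 19..0 : flow label
 */
static uint32_t
make_ip6_flow(uint8_t tclass, uint32_t flowlabel)
{
	return (htonl((6U << 28) | ((uint32_t)tclass << 20) |
	    (flowlabel & 0xfffffU)));
}
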
+ */ + goto no_route; + } + /* + * XXX: sa6 may not have a valid sin6_scope_id in the + * non-SCOPEDROUTING case. + */ + bzero(&lsa6_storage, sizeof(lsa6_storage)); + lsa6_storage.sin6_family = AF_INET6; + lsa6_storage.sin6_len = sizeof(lsa6_storage); + lsa6_storage.sin6_addr = lsa6->sin6_addr; + if ((error = sa6_recoverscope(&lsa6_storage)) != 0) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "recover scope fails error %d\n", error); + sctp_m_freem(m); + return (error); + } + /* XXX */ + lsa6_storage.sin6_addr = lsa6->sin6_addr; + lsa6_storage.sin6_port = inp->sctp_lport; + lsa6 = &lsa6_storage; + ip6h->ip6_src = lsa6->sin6_addr; + + if (port) { + udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + udp->uh_dport = port; + udp->uh_ulen = htons(packet_length - sizeof(struct ip6_hdr)); + udp->uh_sum = 0; + sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); + } else { + sctphdr = (struct sctphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); + } + + sctphdr->src_port = src_port; + sctphdr->dest_port = dest_port; + sctphdr->v_tag = v_tag; + sctphdr->checksum = 0; + + /* + * We set the hop limit now since there is a good chance + * that our ro pointer is now filled + */ + ip6h->ip6_hlim = SCTP_GET_HLIM(inp, ro); + ifp = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); + +#ifdef SCTP_DEBUG + /* Copy to be sure something bad is not happening */ + sin6->sin6_addr = ip6h->ip6_dst; + lsa6->sin6_addr = ip6h->ip6_src; +#endif + + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv6 output routine from low level\n"); + SCTPDBG(SCTP_DEBUG_OUTPUT3, "src: "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)lsa6); + SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst: "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)sin6); + if (net) { + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + /* preserve the port and scope for link local send */ + prev_scope = sin6->sin6_scope_id; + prev_port = sin6->sin6_port; + } + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { + /* failed to prepend data, give up */ + sctp_m_freem(m); + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(m, packet_length); +#endif + SCTP_ATTACH_CHAIN(o_pak, m, packet_length); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + if (!(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) && + (stcb) && + (stcb->asoc.loopback_scope))) { + sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); + SCTP_STAT_INCR(sctps_sendswcrc); + } else { + SCTP_STAT_INCR(sctps_sendnocrc); + } +#endif + if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), packet_length - sizeof(struct ip6_hdr))) == 0) { + udp->uh_sum = 0xffff; + } + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + m->m_pkthdr.csum_flags = CSUM_SCTP; + m->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + /* send it out. 
table id is taken from stcb */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { + so = SCTP_INP_SO(inp); + SCTP_SOCKET_UNLOCK(so, 0); + } +#endif + SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if ((SCTP_BASE_SYSCTL(sctp_output_unlocked)) && (so_locked)) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 0); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + } +#endif + if (net) { + /* for link local this must be done */ + sin6->sin6_scope_id = prev_scope; + sin6->sin6_port = prev_port; + } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); + SCTP_STAT_INCR(sctps_sendpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outpackets); + if (ret) { + SCTP_STAT_INCR(sctps_senderrors); + } + if (net == NULL) { + /* Now if we had a temp route free it */ + if (ro->ro_rt) { + RTFREE(ro->ro_rt); + } + } else { + /* PMTU check versus smallest asoc MTU goes here */ + if (ro->ro_rt == NULL) { + /* Route was freed */ + if (net->ro._s_addr && + net->src_addr_selected) { + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + } + net->src_addr_selected = 0; + } + if ((ro->ro_rt != NULL) && + (net->ro._s_addr)) { + uint32_t mtu; + + mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_rt); + if (mtu && + (stcb->asoc.smallest_mtu > mtu)) { + sctp_mtu_size_reset(inp, &stcb->asoc, mtu); + net->mtu = mtu; + if (net->port) { + net->mtu -= sizeof(struct udphdr); + } + } + } else if (ifp) { + if (ND_IFINFO(ifp)->linkmtu && + (stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) { + sctp_mtu_size_reset(inp, + &stcb->asoc, + ND_IFINFO(ifp)->linkmtu); + } + } + } + return (ret); + } +#endif + else { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n", + ((struct sockaddr *)to)->sa_family); + sctp_m_freem(m); + SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); + return (EFAULT); + } +} + + +void +sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m, *m_at, *mp_last; + struct sctp_nets *net; + struct sctp_init_chunk *init; + struct sctp_supported_addr_param *sup_addr; + struct sctp_adaptation_layer_indication *ali; + struct sctp_ecn_supported_param *ecn; + struct sctp_prsctp_supported_param *prsctp; + struct sctp_ecn_nonce_supported_param *ecn_nonce; + struct sctp_supported_chunk_types_param *pr_supported; + int cnt_inits_to = 0; + int padval, ret; + int num_ext; + int p_len; + + /* INIT's always go to the primary (and usually ONLY address) */ + mp_last = NULL; + net = stcb->asoc.primary_destination; + if (net == NULL) { + net = TAILQ_FIRST(&stcb->asoc.nets); + if (net == NULL) { + /* TSNH */ + return; + } + /* we confirm any address we send an INIT to */ + net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; + (void)sctp_set_primary_addr(stcb, NULL, net); + } else { + /* we confirm any address we send an INIT to */ + net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; + } + SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT\n"); +#ifdef INET6 + if (((struct sockaddr *)&(net->ro._l_addr))->sa_family == AF_INET6) { + /* + * special hook, if we are sending to link local it will not + * show up in our private address count. 
+ */ + struct sockaddr_in6 *sin6l; + + sin6l = &net->ro._l_addr.sin6; + if (IN6_IS_ADDR_LINKLOCAL(&sin6l->sin6_addr)) + cnt_inits_to = 1; + } +#endif + if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + /* This case should not happen */ + SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - failed timer?\n"); + return; + } + /* start the INIT timer */ + sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net); + + m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_DONTWAIT, 1, MT_DATA); + if (m == NULL) { + /* No memory, INIT timer will re-attempt. */ + SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n"); + return; + } + SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk); + /* + * assume peer supports asconf in order to be able to queue local + * address changes while an INIT is in flight and before the assoc + * is established. + */ + stcb->asoc.peer_supports_asconf = 1; + /* Now lets put the SCTP header in place */ + init = mtod(m, struct sctp_init_chunk *); + /* now the chunk header */ + init->ch.chunk_type = SCTP_INITIATION; + init->ch.chunk_flags = 0; + /* fill in later from mbuf we build */ + init->ch.chunk_length = 0; + /* place in my tag */ + init->init.initiate_tag = htonl(stcb->asoc.my_vtag); + /* set up some of the credits. */ + init->init.a_rwnd = htonl(max(inp->sctp_socket ? SCTP_SB_LIMIT_RCV(inp->sctp_socket) : 0, + SCTP_MINIMAL_RWND)); + + init->init.num_outbound_streams = htons(stcb->asoc.pre_open_streams); + init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams); + init->init.initial_tsn = htonl(stcb->asoc.init_seq_number); + /* now the address restriction */ + sup_addr = (struct sctp_supported_addr_param *)((caddr_t)init + + sizeof(*init)); + sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE); +#ifdef INET6 + /* we support 2 types: IPv6/IPv4 */ + sup_addr->ph.param_length = htons(sizeof(*sup_addr) + sizeof(uint16_t)); + sup_addr->addr_type[0] = htons(SCTP_IPV4_ADDRESS); + sup_addr->addr_type[1] = htons(SCTP_IPV6_ADDRESS); +#else + /* we support 1 type: IPv4 */ + sup_addr->ph.param_length = htons(sizeof(*sup_addr) + sizeof(uint8_t)); + sup_addr->addr_type[0] = htons(SCTP_IPV4_ADDRESS); + sup_addr->addr_type[1] = htons(0); /* this is the padding */ +#endif + SCTP_BUF_LEN(m) += sizeof(*sup_addr) + sizeof(uint16_t); + /* adaptation layer indication parameter */ + ali = (struct sctp_adaptation_layer_indication *)((caddr_t)sup_addr + sizeof(*sup_addr) + sizeof(uint16_t)); + ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); + ali->ph.param_length = htons(sizeof(*ali)); + ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator); + SCTP_BUF_LEN(m) += sizeof(*ali); + ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali)); + + if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { + /* Add NAT friendly parameter */ + struct sctp_paramhdr *ph; + + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); + ph->param_length = htons(sizeof(struct sctp_paramhdr)); + SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr); + ecn = (struct sctp_ecn_supported_param *)((caddr_t)ph + sizeof(*ph)); + } + /* now any cookie time extensions */ + if (stcb->asoc.cookie_preserve_req) { + struct sctp_cookie_perserve_param *cookie_preserve; + + cookie_preserve = (struct sctp_cookie_perserve_param *)(ecn); + cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE); + cookie_preserve->ph.param_length = htons( + sizeof(*cookie_preserve)); + cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req); + SCTP_BUF_LEN(m) += 
sizeof(*cookie_preserve); + ecn = (struct sctp_ecn_supported_param *)( + (caddr_t)cookie_preserve + sizeof(*cookie_preserve)); + stcb->asoc.cookie_preserve_req = 0; + } + /* ECN parameter */ + if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 1) { + ecn->ph.param_type = htons(SCTP_ECN_CAPABLE); + ecn->ph.param_length = htons(sizeof(*ecn)); + SCTP_BUF_LEN(m) += sizeof(*ecn); + prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn + + sizeof(*ecn)); + } else { + prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn); + } + /* And now tell the peer we do pr-sctp */ + prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED); + prsctp->ph.param_length = htons(sizeof(*prsctp)); + SCTP_BUF_LEN(m) += sizeof(*prsctp); + + /* And now tell the peer we do all the extensions */ + pr_supported = (struct sctp_supported_chunk_types_param *) + ((caddr_t)prsctp + sizeof(*prsctp)); + pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + num_ext = 0; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; + pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; + pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; + pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; + if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { + pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; + } + if (stcb->asoc.sctp_nr_sack_on_off == 1) { + pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; + } + p_len = sizeof(*pr_supported) + num_ext; + pr_supported->ph.param_length = htons(p_len); + bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + + + /* ECN nonce: And now tell the peer we support ECN nonce */ + if (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) { + ecn_nonce = (struct sctp_ecn_nonce_supported_param *) + ((caddr_t)pr_supported + SCTP_SIZE32(p_len)); + ecn_nonce->ph.param_type = htons(SCTP_ECN_NONCE_SUPPORTED); + ecn_nonce->ph.param_length = htons(sizeof(*ecn_nonce)); + SCTP_BUF_LEN(m) += sizeof(*ecn_nonce); + } + /* add authentication parameters */ + if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { + struct sctp_auth_random *randp; + struct sctp_auth_hmac_algo *hmacs; + struct sctp_auth_chunk_list *chunks; + + /* attach RANDOM parameter, if available */ + if (stcb->asoc.authinfo.random != NULL) { + randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + p_len = sizeof(*randp) + stcb->asoc.authinfo.random_len; + /* random key already contains the header */ + bcopy(stcb->asoc.authinfo.random->key, randp, p_len); + /* zero out any padding required */ + bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + } + /* add HMAC_ALGO parameter */ + hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + p_len = sctp_serialize_hmaclist(stcb->asoc.local_hmacs, + (uint8_t *) hmacs->hmac_ids); + if (p_len > 0) { + p_len += sizeof(*hmacs); + hmacs->ph.param_type = htons(SCTP_HMAC_LIST); + hmacs->ph.param_length = htons(p_len); + /* zero out any padding required */ + bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + } + /* add CHUNKS parameter */ + chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + p_len = sctp_serialize_auth_chunks(stcb->asoc.local_auth_chunks, + chunks->chunk_types); + if (p_len > 0) { + p_len += sizeof(*chunks); + chunks->ph.param_type = htons(SCTP_CHUNK_LIST); + chunks->ph.param_length = htons(p_len); + /* zero out any 
padding required */ + bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + } + } + m_at = m; + /* now the addresses */ + { + struct sctp_scoping scp; + + /* + * To optimize this we could put the scoping stuff into a + * structure and remove the individual uint8's from the + * assoc structure. Then we could just sifa in the address + * within the stcb.. but for now this is a quick hack to get + * the address stuff teased apart. + */ + scp.ipv4_addr_legal = stcb->asoc.ipv4_addr_legal; + scp.ipv6_addr_legal = stcb->asoc.ipv6_addr_legal; + scp.loopback_scope = stcb->asoc.loopback_scope; + scp.ipv4_local_scope = stcb->asoc.ipv4_local_scope; + scp.local_scope = stcb->asoc.local_scope; + scp.site_scope = stcb->asoc.site_scope; + + m_at = sctp_add_addresses_to_i_ia(inp, &scp, m_at, cnt_inits_to); + } + + /* calulate the size and update pkt header and chunk header */ + p_len = 0; + for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) { + if (SCTP_BUF_NEXT(m_at) == NULL) + mp_last = m_at; + p_len += SCTP_BUF_LEN(m_at); + } + init->ch.chunk_length = htons(p_len); + /* + * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return + * here since the timer will drive a retranmission. + */ + + /* I don't expect this to execute but we will be safe here */ + padval = p_len % 4; + if ((padval) && (mp_last)) { + /* + * The compiler worries that mp_last may not be set even + * though I think it is impossible :-> however we add + * mp_last here just in case. + */ + ret = sctp_add_pad_tombuf(mp_last, (4 - padval)); + if (ret) { + /* Houston we have a problem, no space */ + sctp_m_freem(m); + return; + } + p_len += padval; + } + SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - calls lowlevel_output\n"); + ret = sctp_lowlevel_chunk_output(inp, stcb, net, + (struct sockaddr *)&net->ro._l_addr, + m, 0, NULL, 0, 0, 0, NULL, 0, + inp->sctp_lport, stcb->rport, htonl(0), + net->port, so_locked, NULL); + SCTPDBG(SCTP_DEBUG_OUTPUT4, "lowlevel_output - %d\n", ret); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); +} + +struct mbuf * +sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, + int param_offset, int *abort_processing, struct sctp_chunkhdr *cp, int *nat_friendly) +{ + /* + * Given a mbuf containing an INIT or INIT-ACK with the param_offset + * being equal to the beginning of the params i.e. (iphlen + + * sizeof(struct sctp_init_msg) parse through the parameters to the + * end of the mbuf verifying that all parameters are known. + * + * For unknown parameters build and return a mbuf with + * UNRECOGNIZED_PARAMETER errors. If the flags indicate to stop + * processing this chunk stop, and set *abort_processing to 1. + * + * By having param_offset be pre-set to where parameters begin it is + * hoped that this routine may be reused in the future by new + * features. 
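The walk this routine performs advances through TLV parameters by their 32-bit padded size, rejecting any declared length shorter than a parameter header or running past the chunk limit; the long switch below only adds per-type size bounds on top of that. A compact userland skeleton of the walk itself (walk_params and PAD32 are hypothetical; PAD32 mirrors SCTP_SIZE32()):

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>

#define PAD32(x)	(((x) + 3u) & ~3u)	/* round up to 4 bytes */

struct param_hdr {		/* like struct sctp_paramhdr, network order */
	uint16_t type;
	uint16_t length;	/* header + value, excluding padding */
};

/* Return 0 if every parameter is well-formed, -1 on a "wacked" one. */
static int
walk_params(const uint8_t *buf, size_t limit)
{
	size_t at = 0;

	while (limit - at >= sizeof(struct param_hdr)) {
		const struct param_hdr *ph = (const void *)(buf + at);
		uint16_t plen = ntohs(ph->length);

		if (plen < sizeof(struct param_hdr) ||
		    PAD32(plen) > limit - at)
			return (-1);
		at += PAD32(plen);
	}
	return (0);
}
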
+ */ + struct sctp_paramhdr *phdr, params; + + struct mbuf *mat, *op_err; + char tempbuf[SCTP_PARAM_BUFFER_SIZE]; + int at, limit, pad_needed; + uint16_t ptype, plen, padded_size; + int err_at; + + *abort_processing = 0; + mat = in_initpkt; + err_at = 0; + limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk); + at = param_offset; + op_err = NULL; + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n"); + phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); + while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) { + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + if ((plen > limit) || (plen < sizeof(struct sctp_paramhdr))) { + /* wacked parameter */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error %d\n", plen); + goto invalid_size; + } + limit -= SCTP_SIZE32(plen); + /*- + * All parameters for all chunks that we know/understand are + * listed here. We process them other places and make + * appropriate stop actions per the upper bits. However this + * is the generic routine processor's can call to get back + * an operr.. to either incorporate (init-ack) or send. + */ + padded_size = SCTP_SIZE32(plen); + switch (ptype) { + /* Param's with variable size */ + case SCTP_HEARTBEAT_INFO: + case SCTP_STATE_COOKIE: + case SCTP_UNRECOG_PARAM: + case SCTP_ERROR_CAUSE_IND: + /* ok skip fwd */ + at += padded_size; + break; + /* Param's with variable size within a range */ + case SCTP_CHUNK_LIST: + case SCTP_SUPPORTED_CHUNK_EXT: + if (padded_size > (sizeof(struct sctp_supported_chunk_types_param) + (sizeof(uint8_t) * SCTP_MAX_SUPPORTED_EXT))) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error chklist %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_SUPPORTED_ADDRTYPE: + if (padded_size > SCTP_MAX_ADDR_PARAMS_SIZE) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error supaddrtype %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_RANDOM: + if (padded_size > (sizeof(struct sctp_auth_random) + SCTP_RANDOM_MAX_SIZE)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error random %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_SET_PRIM_ADDR: + case SCTP_DEL_IP_ADDRESS: + case SCTP_ADD_IP_ADDRESS: + if ((padded_size != sizeof(struct sctp_asconf_addrv4_param)) && + (padded_size != sizeof(struct sctp_asconf_addr_param))) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error setprim %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + /* Param's with a fixed size */ + case SCTP_IPV4_ADDRESS: + if (padded_size != sizeof(struct sctp_ipv4addr_param)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv4 addr %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_IPV6_ADDRESS: + if (padded_size != sizeof(struct sctp_ipv6addr_param)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv6 addr %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_COOKIE_PRESERVE: + if (padded_size != sizeof(struct sctp_cookie_perserve_param)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error cookie-preserve %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_HAS_NAT_SUPPORT: + *nat_friendly = 1; + /* fall through */ + case SCTP_ECN_NONCE_SUPPORTED: + case SCTP_PRSCTP_SUPPORTED: + + if (padded_size != sizeof(struct sctp_paramhdr)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecnnonce/prsctp/nat support %d\n", plen); + goto invalid_size; + } + at += 
padded_size; + break; + case SCTP_ECN_CAPABLE: + if (padded_size != sizeof(struct sctp_ecn_supported_param)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_ULP_ADAPTATION: + if (padded_size != sizeof(struct sctp_adaptation_layer_indication)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error adapatation %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_SUCCESS_REPORT: + if (padded_size != sizeof(struct sctp_asconf_paramhdr)) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error success %d\n", plen); + goto invalid_size; + } + at += padded_size; + break; + case SCTP_HOSTNAME_ADDRESS: + { + /* We can NOT handle HOST NAME addresses!! */ + int l_len; + + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n"); + *abort_processing = 1; + if (op_err == NULL) { + /* Ok need to try to get a mbuf */ +#ifdef INET6 + l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#else + l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#endif + l_len += plen; + l_len += sizeof(struct sctp_paramhdr); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + SCTP_BUF_LEN(op_err) = 0; + /* + * pre-reserve space for ip + * and sctp header and + * chunk hdr + */ +#ifdef INET6 + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); +#else + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); +#endif + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + } + } + if (op_err) { + /* If we have space */ + struct sctp_paramhdr s; + + if (err_at % 4) { + uint32_t cpthis = 0; + + pad_needed = 4 - (err_at % 4); + m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); + err_at += pad_needed; + } + s.param_type = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR); + s.param_length = htons(sizeof(s) + plen); + m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); + err_at += sizeof(s); + phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen)); + if (phdr == NULL) { + sctp_m_freem(op_err); + /* + * we are out of memory but + * we still need to have a + * look at what to do (the + * system is in trouble + * though). + */ + return (NULL); + } + m_copyback(op_err, err_at, plen, (caddr_t)phdr); + err_at += plen; + } + return (op_err); + break; + } + default: + /* + * we do not recognize the parameter figure out what + * we do. + */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Hit default param %x\n", ptype); + if ((ptype & 0x4000) == 0x4000) { + /* Report bit is set?? 
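The bit tests in the default: arm implement RFC 4960, section 3.2.1: the two high-order bits of a parameter type tell the receiver what to do with an unrecognized parameter. Spelled out as predicates (hypothetical helpers, matching the masks used above):

#include <stdint.h>

/* 0x8000 set: skip this parameter and keep processing the chunk;
 * 0x8000 clear: stop processing the chunk.
 * 0x4000 set: report the parameter in an operational error. */
static int
param_must_report(uint16_t ptype)
{
	return ((ptype & 0x4000) != 0);
}

static int
param_skip_and_continue(uint16_t ptype)
{
	return ((ptype & 0x8000) != 0);
}
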
*/ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "report op err\n"); + if (op_err == NULL) { + int l_len; + + /* Ok need to try to get an mbuf */ +#ifdef INET6 + l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#else + l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#endif + l_len += plen; + l_len += sizeof(struct sctp_paramhdr); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + SCTP_BUF_LEN(op_err) = 0; +#ifdef INET6 + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); +#else + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); +#endif + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + } + } + if (op_err) { + /* If we have space */ + struct sctp_paramhdr s; + + if (err_at % 4) { + uint32_t cpthis = 0; + + pad_needed = 4 - (err_at % 4); + m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); + err_at += pad_needed; + } + s.param_type = htons(SCTP_UNRECOG_PARAM); + s.param_length = htons(sizeof(s) + plen); + m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); + err_at += sizeof(s); + if (plen > sizeof(tempbuf)) { + plen = sizeof(tempbuf); + } + phdr = sctp_get_next_param(mat, at, (struct sctp_paramhdr *)tempbuf, min(sizeof(tempbuf), plen)); + if (phdr == NULL) { + sctp_m_freem(op_err); + /* + * we are out of memory but + * we still need to have a + * look at what to do (the + * system is in trouble + * though). + */ + op_err = NULL; + goto more_processing; + } + m_copyback(op_err, err_at, plen, (caddr_t)phdr); + err_at += plen; + } + } + more_processing: + if ((ptype & 0x8000) == 0x0000) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "stop proc\n"); + return (op_err); + } else { + /* skip this chunk and continue processing */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "move on\n"); + at += SCTP_SIZE32(plen); + } + break; + + } + phdr = sctp_get_next_param(mat, at, &params, sizeof(params)); + } + return (op_err); +invalid_size: + SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n"); + *abort_processing = 1; + if ((op_err == NULL) && phdr) { + int l_len; + +#ifdef INET6 + l_len = sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#else + l_len = sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); +#endif + l_len += (2 * sizeof(struct sctp_paramhdr)); + op_err = sctp_get_mbuf_for_msg(l_len, 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + SCTP_BUF_LEN(op_err) = 0; +#ifdef INET6 + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); +#else + SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); +#endif + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); + SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); + } + } + if ((op_err) && phdr) { + struct sctp_paramhdr s; + + if (err_at % 4) { + uint32_t cpthis = 0; + + pad_needed = 4 - (err_at % 4); + m_copyback(op_err, err_at, pad_needed, (caddr_t)&cpthis); + err_at += pad_needed; + } + s.param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); + s.param_length = htons(sizeof(s) + sizeof(struct sctp_paramhdr)); + m_copyback(op_err, err_at, sizeof(s), (caddr_t)&s); + err_at += sizeof(s); + /* Only copy back the p-hdr that caused the issue */ + m_copyback(op_err, err_at, sizeof(struct sctp_paramhdr), (caddr_t)phdr); + } + return (op_err); +} + +static int +sctp_are_there_new_addresses(struct sctp_association *asoc, + struct mbuf *in_initpkt, int iphlen, int offset) +{ + /* + * Given an INIT packet, look through the packet to verify that there + * are NO new addresses. As we go through the parameters we add reports + * of any un-understood parameters that require an error. Also we + * must return (1) to drop the packet if we see an un-understood + * parameter that tells us to drop the chunk. + */ + struct sockaddr_in sin4, *sa4; + +#ifdef INET6 + struct sockaddr_in6 sin6, *sa6; + +#endif + struct sockaddr *sa_touse; + struct sockaddr *sa; + struct sctp_paramhdr *phdr, params; + struct ip *iph; + +#ifdef INET6 + struct ip6_hdr *ip6h; + +#endif + struct mbuf *mat; + uint16_t ptype, plen; + int err_at; + uint8_t fnd; + struct sctp_nets *net; + + memset(&sin4, 0, sizeof(sin4)); +#ifdef INET6 + memset(&sin6, 0, sizeof(sin6)); +#endif + sin4.sin_family = AF_INET; + sin4.sin_len = sizeof(sin4); +#ifdef INET6 + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); +#endif + sa_touse = NULL; + /* First what about the src address of the pkt ? */ + iph = mtod(in_initpkt, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + /* source addr is IPv4 */ + sin4.sin_addr = iph->ip_src; + sa_touse = (struct sockaddr *)&sin4; + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + /* source addr is IPv6 */ + ip6h = mtod(in_initpkt, struct ip6_hdr *); + sin6.sin6_addr = ip6h->ip6_src; + sa_touse = (struct sockaddr *)&sin6; + break; +#endif + default: + return (1); + } + + fnd = 0; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + sa = (struct sockaddr *)&net->ro._l_addr; + if (sa->sa_family == sa_touse->sa_family) { + if (sa->sa_family == AF_INET) { + sa4 = (struct sockaddr_in *)sa; + if (sa4->sin_addr.s_addr == + sin4.sin_addr.s_addr) { + fnd = 1; + break; + } + } +#ifdef INET6 + if (sa->sa_family == AF_INET6) { + sa6 = (struct sockaddr_in6 *)sa; + if (SCTP6_ARE_ADDR_EQUAL(sa6, + &sin6)) { + fnd = 1; + break; + } + } +#endif + } + } + if (fnd == 0) { + /* New address added! No need to look further. + */ + return (1); + } + /* OK so far; let's munge through the rest of the packet */ + mat = in_initpkt; + err_at = 0; + sa_touse = NULL; + offset += sizeof(struct sctp_init_chunk); + phdr = sctp_get_next_param(mat, offset, &params, sizeof(params)); + while (phdr) { + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + if (ptype == SCTP_IPV4_ADDRESS) { + struct sctp_ipv4addr_param *p4, p4_buf; + + phdr = sctp_get_next_param(mat, offset, + (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); + if (plen != sizeof(struct sctp_ipv4addr_param) || + phdr == NULL) { + return (1); + } + p4 = (struct sctp_ipv4addr_param *)phdr; + sin4.sin_addr.s_addr = p4->addr; + sa_touse = (struct sockaddr *)&sin4; + } else if (ptype == SCTP_IPV6_ADDRESS) { + struct sctp_ipv6addr_param *p6, p6_buf; + + phdr = sctp_get_next_param(mat, offset, + (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); + if (plen != sizeof(struct sctp_ipv6addr_param) || + phdr == NULL) { + return (1); + } + p6 = (struct sctp_ipv6addr_param *)phdr; +#ifdef INET6 + memcpy((caddr_t)&sin6.sin6_addr, p6->addr, + sizeof(p6->addr)); + sa_touse = (struct sockaddr *)&sin6; +#else + sa_touse = NULL; +#endif + } + if (sa_touse) { + /* ok, sa_touse points to one to check */ + fnd = 0; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + sa = (struct sockaddr *)&net->ro._l_addr; + if (sa->sa_family != sa_touse->sa_family) { + continue; + } + if (sa->sa_family == AF_INET) { + sa4 = (struct sockaddr_in *)sa; + if (sa4->sin_addr.s_addr == + sin4.sin_addr.s_addr) { + fnd = 1; + break; + } + } +#ifdef INET6 + if (sa->sa_family == AF_INET6) { + sa6 = (struct sockaddr_in6 *)sa; + if (SCTP6_ARE_ADDR_EQUAL( + sa6, &sin6)) { + fnd = 1; + break; + } + } +#endif + } + if (!fnd) { + /* New addr added! No need to look further */ + return (1); + } + } + offset += SCTP_SIZE32(plen); + phdr = sctp_get_next_param(mat, offset, &params, sizeof(params)); + } + return (0); +} + +/* + * Given an MBUF chain that was sent into us containing an INIT. Build an + * INIT-ACK with COOKIE and send back. We assume that the in_initpkt has done + * a pullup to include the IPv6/4 header, SCTP header and initial part of the INIT + * message (i.e. the struct sctp_init_msg). 
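+ * Note that in the common listener case no TCB exists yet when this + * runs: everything needed to re-create the association is carried + * back to the peer inside the signed state cookie, and nothing is + * allocated here until a valid COOKIE-ECHO returns.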
+ */ +void +sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct mbuf *init_pkt, int iphlen, int offset, struct sctphdr *sh, + struct sctp_init_chunk *init_chk, uint32_t vrf_id, uint16_t port, int hold_inp_lock) +{ + struct sctp_association *asoc; + struct mbuf *m, *m_at, *m_tmp, *m_cookie, *op_err, *mp_last; + struct sctp_init_ack_chunk *initack; + struct sctp_adaptation_layer_indication *ali; + struct sctp_ecn_supported_param *ecn; + struct sctp_prsctp_supported_param *prsctp; + struct sctp_ecn_nonce_supported_param *ecn_nonce; + struct sctp_supported_chunk_types_param *pr_supported; + union sctp_sockstore store, store1, *over_addr; + struct sockaddr_in *sin, *to_sin; + +#ifdef INET6 + struct sockaddr_in6 *sin6, *to_sin6; + +#endif + struct ip *iph; + +#ifdef INET6 + struct ip6_hdr *ip6; + +#endif + struct sockaddr *to; + struct sctp_state_cookie stc; + struct sctp_nets *net = NULL; + uint8_t *signature = NULL; + int cnt_inits_to = 0; + uint16_t his_limit, i_want; + int abort_flag, padval; + int num_ext; + int p_len; + int nat_friendly = 0; + struct socket *so; + + if (stcb) + asoc = &stcb->asoc; + else + asoc = NULL; + mp_last = NULL; + if ((asoc != NULL) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && + (sctp_are_there_new_addresses(asoc, init_pkt, iphlen, offset))) { + /* new addresses, out of here in non-cookie-wait states */ + /* + * Send a ABORT, we don't add the new address error clause + * though we even set the T bit and copy in the 0 tag.. this + * looks no different than if no listener was present. + */ + sctp_send_abort(init_pkt, iphlen, sh, 0, NULL, vrf_id, port); + return; + } + abort_flag = 0; + op_err = sctp_arethere_unrecognized_parameters(init_pkt, + (offset + sizeof(struct sctp_init_chunk)), + &abort_flag, (struct sctp_chunkhdr *)init_chk, &nat_friendly); + if (abort_flag) { +do_a_abort: + sctp_send_abort(init_pkt, iphlen, sh, + init_chk->init.initiate_tag, op_err, vrf_id, port); + return; + } + m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (m == NULL) { + /* No memory, INIT timer will re-attempt. */ + if (op_err) + sctp_m_freem(op_err); + return; + } + SCTP_BUF_LEN(m) = sizeof(struct sctp_init_chunk); + + /* the time I built cookie */ + (void)SCTP_GETTIME_TIMEVAL(&stc.time_entered); + + /* populate any tie tags */ + if (asoc != NULL) { + /* unlock before tag selections */ + stc.tie_tag_my_vtag = asoc->my_vtag_nonce; + stc.tie_tag_peer_vtag = asoc->peer_vtag_nonce; + stc.cookie_life = asoc->cookie_life; + net = asoc->primary_destination; + } else { + stc.tie_tag_my_vtag = 0; + stc.tie_tag_peer_vtag = 0; + /* life I will award this cookie */ + stc.cookie_life = inp->sctp_ep.def_cookie_life; + } + + /* copy in the ports for later check */ + stc.myport = sh->dest_port; + stc.peerport = sh->src_port; + + /* + * If we wanted to honor cookie life extentions, we would add to + * stc.cookie_life. 
For now we should NOT honor any extension + */ + stc.site_scope = stc.local_scope = stc.loopback_scope = 0; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + struct inpcb *in_inp; + + /* Its a V6 socket */ + in_inp = (struct inpcb *)inp; + stc.ipv6_addr_legal = 1; + /* Now look at the binding flag to see if V4 will be legal */ + if (SCTP_IPV6_V6ONLY(in_inp) == 0) { + stc.ipv4_addr_legal = 1; + } else { + /* V4 addresses are NOT legal on the association */ + stc.ipv4_addr_legal = 0; + } + } else { + /* Its a V4 socket, no - V6 */ + stc.ipv4_addr_legal = 1; + stc.ipv6_addr_legal = 0; + } + +#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE + stc.ipv4_scope = 1; +#else + stc.ipv4_scope = 0; +#endif + /* now for scope setup */ + memset((caddr_t)&store, 0, sizeof(store)); + memset((caddr_t)&store1, 0, sizeof(store1)); + sin = &store.sin; + to_sin = &store1.sin; +#ifdef INET6 + sin6 = &store.sin6; + to_sin6 = &store1.sin6; +#endif + iph = mtod(init_pkt, struct ip *); + /* establish the to_addr's */ + switch (iph->ip_v) { + case IPVERSION: + to_sin->sin_port = sh->dest_port; + to_sin->sin_family = AF_INET; + to_sin->sin_len = sizeof(struct sockaddr_in); + to_sin->sin_addr = iph->ip_dst; + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + ip6 = mtod(init_pkt, struct ip6_hdr *); + to_sin6->sin6_addr = ip6->ip6_dst; + to_sin6->sin6_scope_id = 0; + to_sin6->sin6_port = sh->dest_port; + to_sin6->sin6_family = AF_INET6; + to_sin6->sin6_len = sizeof(struct sockaddr_in6); + break; +#endif + default: + goto do_a_abort; + break; + }; + + if (net == NULL) { + to = (struct sockaddr *)&store; + switch (iph->ip_v) { + case IPVERSION: + { + sin->sin_family = AF_INET; + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_port = sh->src_port; + sin->sin_addr = iph->ip_src; + /* lookup address */ + stc.address[0] = sin->sin_addr.s_addr; + stc.address[1] = 0; + stc.address[2] = 0; + stc.address[3] = 0; + stc.addr_type = SCTP_IPV4_ADDRESS; + /* local from address */ + stc.laddress[0] = to_sin->sin_addr.s_addr; + stc.laddress[1] = 0; + stc.laddress[2] = 0; + stc.laddress[3] = 0; + stc.laddr_type = SCTP_IPV4_ADDRESS; + /* scope_id is only for v6 */ + stc.scope_id = 0; +#ifndef SCTP_DONT_DO_PRIVADDR_SCOPE + if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { + stc.ipv4_scope = 1; + } +#else + stc.ipv4_scope = 1; +#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */ + /* Must use the address in this case */ + if (sctp_is_address_on_local_host((struct sockaddr *)sin, vrf_id)) { + stc.loopback_scope = 1; + stc.ipv4_scope = 1; + stc.site_scope = 1; + stc.local_scope = 0; + } + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + ip6 = mtod(init_pkt, struct ip6_hdr *); + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_port = sh->src_port; + sin6->sin6_addr = ip6->ip6_src; + /* lookup address */ + memcpy(&stc.address, &sin6->sin6_addr, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + stc.addr_type = SCTP_IPV6_ADDRESS; + stc.scope_id = 0; + if (sctp_is_address_on_local_host((struct sockaddr *)sin6, vrf_id)) { + /* + * FIX ME: does this have scope from + * rcvif? + */ + (void)sa6_recoverscope(sin6); + stc.scope_id = sin6->sin6_scope_id; + sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); + stc.loopback_scope = 1; + stc.local_scope = 0; + stc.site_scope = 1; + stc.ipv4_scope = 1; + } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + /* + * If the new destination is a + * LINK_LOCAL we must have common + * both site and local scope. 
Don't + * set local scope though since we + * must depend on the source to be + * added implicitly. We cannot + * assure just because we share one + * link that all links are common. + */ + stc.local_scope = 0; + stc.site_scope = 1; + stc.ipv4_scope = 1; + /* + * we start counting for the private + * address stuff at 1. since the + * link local we source from won't + * show up in our scoped count. + */ + cnt_inits_to = 1; + /* + * pull out the scope_id from + * incoming pkt + */ + /* + * FIX ME: does this have scope from + * rcvif? + */ + (void)sa6_recoverscope(sin6); + stc.scope_id = sin6->sin6_scope_id; + sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); + } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { + /* + * If the new destination is + * SITE_LOCAL then we must have site + * scope in common. + */ + stc.site_scope = 1; + } + memcpy(&stc.laddress, &to_sin6->sin6_addr, sizeof(struct in6_addr)); + stc.laddr_type = SCTP_IPV6_ADDRESS; + break; + } +#endif + default: + /* TSNH */ + goto do_a_abort; + break; + } + } else { + /* set the scope per the existing tcb */ + +#ifdef INET6 + struct sctp_nets *lnet; + +#endif + + stc.loopback_scope = asoc->loopback_scope; + stc.ipv4_scope = asoc->ipv4_local_scope; + stc.site_scope = asoc->site_scope; + stc.local_scope = asoc->local_scope; +#ifdef INET6 + /* Why do we not consider IPv4 LL addresses? */ + TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { + if (lnet->ro._l_addr.sin6.sin6_family == AF_INET6) { + if (IN6_IS_ADDR_LINKLOCAL(&lnet->ro._l_addr.sin6.sin6_addr)) { + /* + * if we have a LL address, start + * counting at 1. + */ + cnt_inits_to = 1; + } + } + } +#endif + /* use the net pointer */ + to = (struct sockaddr *)&net->ro._l_addr; + switch (to->sa_family) { + case AF_INET: + sin = (struct sockaddr_in *)to; + stc.address[0] = sin->sin_addr.s_addr; + stc.address[1] = 0; + stc.address[2] = 0; + stc.address[3] = 0; + stc.addr_type = SCTP_IPV4_ADDRESS; + if (net->src_addr_selected == 0) { + /* + * strange case here, the INIT should have + * did the selection. + */ + net->ro._s_addr = sctp_source_address_selection(inp, + stcb, (sctp_route_t *) & net->ro, + net, 0, vrf_id); + if (net->ro._s_addr == NULL) + return; + + net->src_addr_selected = 1; + + } + stc.laddress[0] = net->ro._s_addr->address.sin.sin_addr.s_addr; + stc.laddress[1] = 0; + stc.laddress[2] = 0; + stc.laddress[3] = 0; + stc.laddr_type = SCTP_IPV4_ADDRESS; + break; +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)to; + memcpy(&stc.address, &sin6->sin6_addr, + sizeof(struct in6_addr)); + stc.addr_type = SCTP_IPV6_ADDRESS; + if (net->src_addr_selected == 0) { + /* + * strange case here, the INIT should have + * did the selection. 
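+ * Fall back to running source address selection now; if it fails we + * simply bail out and let the peer retransmit its INIT.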
+ */ + net->ro._s_addr = sctp_source_address_selection(inp, + stcb, (sctp_route_t *) & net->ro, + net, 0, vrf_id); + if (net->ro._s_addr == NULL) + return; + + net->src_addr_selected = 1; + } + memcpy(&stc.laddress, &net->ro._s_addr->address.sin6.sin6_addr, + sizeof(struct in6_addr)); + stc.laddr_type = SCTP_IPV6_ADDRESS; + break; +#endif + } + } + /* Now lets put the SCTP header in place */ + initack = mtod(m, struct sctp_init_ack_chunk *); + /* Save it off for quick ref */ + stc.peers_vtag = init_chk->init.initiate_tag; + /* who are we */ + memcpy(stc.identification, SCTP_VERSION_STRING, + min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification))); + /* now the chunk header */ + initack->ch.chunk_type = SCTP_INITIATION_ACK; + initack->ch.chunk_flags = 0; + /* fill in later from mbuf we build */ + initack->ch.chunk_length = 0; + /* place in my tag */ + if ((asoc != NULL) && + ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_INUSE) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED))) { + /* re-use the v-tags and init-seq here */ + initack->init.initiate_tag = htonl(asoc->my_vtag); + initack->init.initial_tsn = htonl(asoc->init_seq_number); + } else { + uint32_t vtag, itsn; + + if (hold_inp_lock) { + SCTP_INP_INCR_REF(inp); + SCTP_INP_RUNLOCK(inp); + } + if (asoc) { + atomic_add_int(&asoc->refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + new_tag: + vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); + if ((asoc->peer_supports_nat) && (vtag == asoc->my_vtag)) { + /* + * Got a duplicate vtag on some guy behind a + * nat make sure we don't use it. + */ + goto new_tag; + } + initack->init.initiate_tag = htonl(vtag); + /* get a TSN to use too */ + itsn = sctp_select_initial_TSN(&inp->sctp_ep); + initack->init.initial_tsn = htonl(itsn); + SCTP_TCB_LOCK(stcb); + atomic_add_int(&asoc->refcnt, -1); + } else { + vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); + initack->init.initiate_tag = htonl(vtag); + /* get a TSN to use too */ + initack->init.initial_tsn = htonl(sctp_select_initial_TSN(&inp->sctp_ep)); + } + if (hold_inp_lock) { + SCTP_INP_RLOCK(inp); + SCTP_INP_DECR_REF(inp); + } + } + /* save away my tag to */ + stc.my_vtag = initack->init.initiate_tag; + + /* set up some of the credits. */ + so = inp->sctp_socket; + if (so == NULL) { + /* memory problem */ + sctp_m_freem(m); + return; + } else { + initack->init.a_rwnd = htonl(max(SCTP_SB_LIMIT_RCV(so), SCTP_MINIMAL_RWND)); + } + /* set what I want */ + his_limit = ntohs(init_chk->init.num_inbound_streams); + /* choose what I want */ + if (asoc != NULL) { + if (asoc->streamoutcnt > inp->sctp_ep.pre_open_stream_count) { + i_want = asoc->streamoutcnt; + } else { + i_want = inp->sctp_ep.pre_open_stream_count; + } + } else { + i_want = inp->sctp_ep.pre_open_stream_count; + } + if (his_limit < i_want) { + /* I Want more :< */ + initack->init.num_outbound_streams = init_chk->init.num_inbound_streams; + } else { + /* I can have what I want :> */ + initack->init.num_outbound_streams = htons(i_want); + } + /* tell him his limt. 
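The advertised inbound limit is always our endpoint-wide maximum (max_open_streams_intome); the negotiation above only clamped the outbound stream count. 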
*/ + initack->init.num_inbound_streams = + htons(inp->sctp_ep.max_open_streams_intome); + + /* adaptation layer indication parameter */ + ali = (struct sctp_adaptation_layer_indication *)((caddr_t)initack + sizeof(*initack)); + ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); + ali->ph.param_length = htons(sizeof(*ali)); + ali->indication = ntohl(inp->sctp_ep.adaptation_layer_indicator); + SCTP_BUF_LEN(m) += sizeof(*ali); + ecn = (struct sctp_ecn_supported_param *)((caddr_t)ali + sizeof(*ali)); + + /* ECN parameter */ + if (SCTP_BASE_SYSCTL(sctp_ecn_enable) == 1) { + ecn->ph.param_type = htons(SCTP_ECN_CAPABLE); + ecn->ph.param_length = htons(sizeof(*ecn)); + SCTP_BUF_LEN(m) += sizeof(*ecn); + + prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn + + sizeof(*ecn)); + } else { + prsctp = (struct sctp_prsctp_supported_param *)((caddr_t)ecn); + } + /* And now tell the peer we do pr-sctp */ + prsctp->ph.param_type = htons(SCTP_PRSCTP_SUPPORTED); + prsctp->ph.param_length = htons(sizeof(*prsctp)); + SCTP_BUF_LEN(m) += sizeof(*prsctp); + if (nat_friendly) { + /* Add NAT friendly parameter */ + struct sctp_paramhdr *ph; + + ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); + ph->param_length = htons(sizeof(struct sctp_paramhdr)); + SCTP_BUF_LEN(m) += sizeof(struct sctp_paramhdr); + } + /* And now tell the peer we do all the extensions */ + pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); + num_ext = 0; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; + pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; + pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; + pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; + pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; + if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) + pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; + if (SCTP_BASE_SYSCTL(sctp_nr_sack_on_off)) + pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; + p_len = sizeof(*pr_supported) + num_ext; + pr_supported->ph.param_length = htons(p_len); + bzero((caddr_t)pr_supported + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + + /* ECN nonce: And now tell the peer we support ECN nonce */ + if (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) { + ecn_nonce = (struct sctp_ecn_nonce_supported_param *) + ((caddr_t)pr_supported + SCTP_SIZE32(p_len)); + ecn_nonce->ph.param_type = htons(SCTP_ECN_NONCE_SUPPORTED); + ecn_nonce->ph.param_length = htons(sizeof(*ecn_nonce)); + SCTP_BUF_LEN(m) += sizeof(*ecn_nonce); + } + /* add authentication parameters */ + if (!SCTP_BASE_SYSCTL(sctp_auth_disable)) { + struct sctp_auth_random *randp; + struct sctp_auth_hmac_algo *hmacs; + struct sctp_auth_chunk_list *chunks; + uint16_t random_len; + + /* generate and add RANDOM parameter */ + random_len = SCTP_AUTH_RANDOM_SIZE_DEFAULT; + randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + randp->ph.param_type = htons(SCTP_RANDOM); + p_len = sizeof(*randp) + random_len; + randp->ph.param_length = htons(p_len); + SCTP_READ_RANDOM(randp->random_data, random_len); + /* zero out any padding required */ + bzero((caddr_t)randp + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + + /* add HMAC_ALGO parameter */ + hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + p_len = sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs, 
+ (uint8_t *) hmacs->hmac_ids); + if (p_len > 0) { + p_len += sizeof(*hmacs); + hmacs->ph.param_type = htons(SCTP_HMAC_LIST); + hmacs->ph.param_length = htons(p_len); + /* zero out any padding required */ + bzero((caddr_t)hmacs + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + } + /* add CHUNKS parameter */ + chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + p_len = sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks, + chunks->chunk_types); + if (p_len > 0) { + p_len += sizeof(*chunks); + chunks->ph.param_type = htons(SCTP_CHUNK_LIST); + chunks->ph.param_length = htons(p_len); + /* zero out any padding required */ + bzero((caddr_t)chunks + p_len, SCTP_SIZE32(p_len) - p_len); + SCTP_BUF_LEN(m) += SCTP_SIZE32(p_len); + } + } + m_at = m; + /* now the addresses */ + { + struct sctp_scoping scp; + + /* + * To optimize this we could put the scoping stuff into a + * structure and remove the individual uint8's from the stc + * structure. Then we could just sifa in the address within + * the stc.. but for now this is a quick hack to get the + * address stuff teased apart. + */ + scp.ipv4_addr_legal = stc.ipv4_addr_legal; + scp.ipv6_addr_legal = stc.ipv6_addr_legal; + scp.loopback_scope = stc.loopback_scope; + scp.ipv4_local_scope = stc.ipv4_scope; + scp.local_scope = stc.local_scope; + scp.site_scope = stc.site_scope; + m_at = sctp_add_addresses_to_i_ia(inp, &scp, m_at, cnt_inits_to); + } + + /* tack on the operational error if present */ + if (op_err) { + struct mbuf *ol; + int llen; + + llen = 0; + ol = op_err; + while (ol) { + llen += SCTP_BUF_LEN(ol); + ol = SCTP_BUF_NEXT(ol); + } + if (llen % 4) { + /* must add a pad to the param */ + uint32_t cpthis = 0; + int padlen; + + padlen = 4 - (llen % 4); + m_copyback(op_err, llen, padlen, (caddr_t)&cpthis); + } + while (SCTP_BUF_NEXT(m_at) != NULL) { + m_at = SCTP_BUF_NEXT(m_at); + } + SCTP_BUF_NEXT(m_at) = op_err; + while (SCTP_BUF_NEXT(m_at) != NULL) { + m_at = SCTP_BUF_NEXT(m_at); + } + } + /* pre-calulate the size and update pkt header and chunk header */ + p_len = 0; + for (m_tmp = m; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) { + p_len += SCTP_BUF_LEN(m_tmp); + if (SCTP_BUF_NEXT(m_tmp) == NULL) { + /* m_tmp should now point to last one */ + break; + } + } + + /* Now we must build a cookie */ + m_cookie = sctp_add_cookie(inp, init_pkt, offset, m, 0, &stc, &signature); + if (m_cookie == NULL) { + /* memory problem */ + sctp_m_freem(m); + return; + } + /* Now append the cookie to the end and update the space/size */ + SCTP_BUF_NEXT(m_tmp) = m_cookie; + + for (m_tmp = m_cookie; m_tmp; m_tmp = SCTP_BUF_NEXT(m_tmp)) { + p_len += SCTP_BUF_LEN(m_tmp); + if (SCTP_BUF_NEXT(m_tmp) == NULL) { + /* m_tmp should now point to last one */ + mp_last = m_tmp; + break; + } + } + /* + * Place in the size, but we don't include the last pad (if any) in + * the INIT-ACK. + */ + initack->ch.chunk_length = htons(p_len); + + /* + * Time to sign the cookie, we don't sign over the cookie signature + * though thus we set trailer. + */ + (void)sctp_hmac_m(SCTP_HMAC, + (uint8_t *) inp->sctp_ep.secret_key[(int)(inp->sctp_ep.current_secret_number)], + SCTP_SECRET_SIZE, m_cookie, sizeof(struct sctp_paramhdr), + (uint8_t *) signature, SCTP_SIGNATURE_SIZE); + /* + * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return + * here since the timer will drive a retranmission. 
+ */ + padval = p_len % 4; + if ((padval) && (mp_last)) { + /* see my previous comments on mp_last */ + int ret; + + ret = sctp_add_pad_tombuf(mp_last, (4 - padval)); + if (ret) { + /* Houston we have a problem, no space */ + sctp_m_freem(m); + return; + } + p_len += padval; + } + if (stc.loopback_scope) { + over_addr = &store1; + } else { + over_addr = NULL; + } + + (void)sctp_lowlevel_chunk_output(inp, NULL, NULL, to, m, 0, NULL, 0, 0, + 0, NULL, 0, + inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag, + port, SCTP_SO_NOT_LOCKED, over_addr); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); +} + + +void +sctp_insert_on_wheel(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_stream_out *strq, int holds_lock) +{ + if (holds_lock == 0) { + SCTP_TCB_SEND_LOCK(stcb); + } + if ((strq->next_spoke.tqe_next == NULL) && + (strq->next_spoke.tqe_prev == NULL)) { + TAILQ_INSERT_TAIL(&asoc->out_wheel, strq, next_spoke); + } + if (holds_lock == 0) { + SCTP_TCB_SEND_UNLOCK(stcb); + } +} + +void +sctp_remove_from_wheel(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_stream_out *strq, + int holds_lock) +{ + /* take off and then setup so we know it is not on the wheel */ + if (holds_lock == 0) { + SCTP_TCB_SEND_LOCK(stcb); + } + if (TAILQ_EMPTY(&strq->outqueue)) { + if (asoc->last_out_stream == strq) { + asoc->last_out_stream = TAILQ_PREV(asoc->last_out_stream, sctpwheel_listhead, next_spoke); + if (asoc->last_out_stream == NULL) { + asoc->last_out_stream = TAILQ_LAST(&asoc->out_wheel, sctpwheel_listhead); + } + if (asoc->last_out_stream == strq) { + asoc->last_out_stream = NULL; + } + } + TAILQ_REMOVE(&asoc->out_wheel, strq, next_spoke); + strq->next_spoke.tqe_next = NULL; + strq->next_spoke.tqe_prev = NULL; + } + if (holds_lock == 0) { + SCTP_TCB_SEND_UNLOCK(stcb); + } +} + +static void +sctp_prune_prsctp(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_sndrcvinfo *srcv, + int dataout) +{ + int freed_spc = 0; + struct sctp_tmit_chunk *chk, *nchk; + + SCTP_TCB_LOCK_ASSERT(stcb); + if ((asoc->peer_supports_prsctp) && + (asoc->sent_queue_cnt_removeable > 0)) { + TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { + /* + * Look for chunks marked with the PR_SCTP flag AND + * the buffer space flag. If the one being sent is + * equal or greater priority then purge the old one + * and free some space. + */ + if (PR_SCTP_BUF_ENABLED(chk->flags)) { + /* + * This one is PR-SCTP AND buffer space + * limited type + */ + if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) { + /* + * Lower numbers equates to higher + * priority so if the one we are + * looking at has a larger or equal + * priority we want to drop the data + * and NOT retransmit it. 
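+ * Note that timetodrop.tv_sec carries a priority number here, not a + * timestamp, so the comparison below is a straight priority check.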
+ */ + if (chk->data) { + /* + * We release the book_size + * if the mbuf is here + */ + int ret_spc; + int cause; + + if (chk->sent > SCTP_DATAGRAM_UNSENT) + cause = SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT; + else + cause = SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_UNSENT; + ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, + cause, + SCTP_SO_LOCKED); + freed_spc += ret_spc; + if (freed_spc >= dataout) { + return; + } + } /* if chunk was present */ + } /* if of sufficient priority */ + } /* if chunk has PR-SCTP enabled */ + } /* tailqforeach */ + + chk = TAILQ_FIRST(&asoc->send_queue); + while (chk) { + nchk = TAILQ_NEXT(chk, sctp_next); + /* Here we must move to the sent queue and mark */ + if (PR_SCTP_BUF_ENABLED(chk->flags)) { + if (chk->rec.data.timetodrop.tv_sec >= (long)srcv->sinfo_timetolive) { + if (chk->data) { + /* + * We release the book_size + * if the mbuf is here + */ + int ret_spc; + + ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, + SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_UNSENT, + SCTP_SO_LOCKED); + + freed_spc += ret_spc; + if (freed_spc >= dataout) { + return; + } + } /* end if chk->data */ + } /* end if right class */ + } /* end if chk pr-sctp */ + chk = nchk; + } /* end while (chk) */ + } /* if enabled in asoc */ +} + +int +sctp_get_frag_point(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + int siz, ovh; + + /* + * For endpoints that have both v6 and v4 addresses we must reserve + * room for the ipv6 header, for those that are only dealing with V4 + * we use a larger frag point. + */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + + if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu) + siz = asoc->smallest_mtu - ovh; + else + siz = (stcb->asoc.sctp_frag_point - ovh); + /* + * if (siz > (MCLBYTES-sizeof(struct sctp_data_chunk))) { + */ + /* A data chunk MUST fit in a cluster */ + /* siz = (MCLBYTES - sizeof(struct sctp_data_chunk)); */ + /* } */ + + /* adjust for an AUTH chunk if DATA requires auth */ + if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) + siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + + if (siz % 4) { + /* make it an even word boundary please */ + siz -= (siz % 4); + } + return (siz); +} + +static void +sctp_set_prsctp_policy(struct sctp_stream_queue_pending *sp) +{ + sp->pr_sctp_on = 0; + /* + * We assume that the user wants PR_SCTP_TTL if the user provides a + * positive lifetime but does not specify any PR_SCTP policy. This + * is a BAD assumption and causes problems at least with the + * U-Vancovers MPI folks. I will change this to be no policy means + * NO PR-SCTP. + */ + if (PR_SCTP_ENABLED(sp->sinfo_flags)) { + sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags); + sp->pr_sctp_on = 1; + } else { + return; + } + switch (PR_SCTP_POLICY(sp->sinfo_flags)) { + case CHUNK_FLAGS_PR_SCTP_BUF: + /* + * Time to live is a priority stored in tv_sec when doing + * the buffer drop thing. + */ + sp->ts.tv_sec = sp->timetolive; + sp->ts.tv_usec = 0; + break; + case CHUNK_FLAGS_PR_SCTP_TTL: + { + struct timeval tv; + + (void)SCTP_GETTIME_TIMEVAL(&sp->ts); + tv.tv_sec = sp->timetolive / 1000; + tv.tv_usec = (sp->timetolive * 1000) % 1000000; + /* + * TODO sctp_constants.h needs alternative time + * macros when _KERNEL is undefined. + */ + timevaladd(&sp->ts, &tv); + } + break; + case CHUNK_FLAGS_PR_SCTP_RTX: + /* + * Time to live is the number of retransmissions stored in + * tv_sec. 
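+ * (For example, a timetolive of 3 would abandon the chunk after its + * third retransmission.)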
+ */ + sp->ts.tv_sec = sp->timetolive; + sp->ts.tv_usec = 0; + break; + default: + SCTPDBG(SCTP_DEBUG_USRREQ1, + "Unknown PR_SCTP policy %u.\n", + PR_SCTP_POLICY(sp->sinfo_flags)); + break; + } +} + +static int +sctp_msg_append(struct sctp_tcb *stcb, + struct sctp_nets *net, + struct mbuf *m, + struct sctp_sndrcvinfo *srcv, int hold_stcb_lock) +{ + int error = 0, holds_lock; + struct mbuf *at; + struct sctp_stream_queue_pending *sp = NULL; + struct sctp_stream_out *strm; + + /* + * Given an mbuf chain, put it into the association send queue and + * place it on the wheel + */ + holds_lock = hold_stcb_lock; + if (srcv->sinfo_stream >= stcb->asoc.streamoutcnt) { + /* Invalid stream number */ + SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_now; + } + if ((stcb->asoc.stream_locked) && + (stcb->asoc.stream_locked_on != srcv->sinfo_stream)) { + SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_now; + } + strm = &stcb->asoc.strmout[srcv->sinfo_stream]; + /* Now can we send this? */ + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || + (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) { + /* got data while shutting down */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); + error = ECONNRESET; + goto out_now; + } + sctp_alloc_a_strmoq(stcb, sp); + if (sp == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + error = ENOMEM; + goto out_now; + } + sp->sinfo_flags = srcv->sinfo_flags; + sp->timetolive = srcv->sinfo_timetolive; + sp->ppid = srcv->sinfo_ppid; + sp->context = srcv->sinfo_context; + sp->strseq = 0; + if (sp->sinfo_flags & SCTP_ADDR_OVER) { + sp->net = net; + atomic_add_int(&sp->net->ref_count, 1); + } else { + sp->net = NULL; + } + (void)SCTP_GETTIME_TIMEVAL(&sp->ts); + sp->stream = srcv->sinfo_stream; + sp->msg_is_complete = 1; + sp->sender_all_done = 1; + sp->some_taken = 0; + sp->data = m; + sp->tail_mbuf = NULL; + sp->length = 0; + at = m; + sctp_set_prsctp_policy(sp); + /* + * We could in theory (for sendall) sifa the length in, but we would + * still have to hunt through the chain since we need to setup the + * tail_mbuf + */ + while (at) { + if (SCTP_BUF_NEXT(at) == NULL) + sp->tail_mbuf = at; + sp->length += SCTP_BUF_LEN(at); + at = SCTP_BUF_NEXT(at); + } + SCTP_TCB_SEND_LOCK(stcb); + sctp_snd_sb_alloc(stcb, sp->length); + atomic_add_int(&stcb->asoc.stream_queue_cnt, 1); + TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); + if ((srcv->sinfo_flags & SCTP_UNORDERED) == 0) { + sp->strseq = strm->next_sequence_sent; + strm->next_sequence_sent++; + } + if ((strm->next_spoke.tqe_next == NULL) && + (strm->next_spoke.tqe_prev == NULL)) { + /* Not on wheel, insert */ + sctp_insert_on_wheel(stcb, &stcb->asoc, strm, 1); + } + m = NULL; + SCTP_TCB_SEND_UNLOCK(stcb); +out_now: + if (m) { + sctp_m_freem(m); + } + return (error); +} + + +static struct mbuf * +sctp_copy_mbufchain(struct mbuf *clonechain, + struct mbuf *outchain, + struct mbuf **endofchain, + int can_take_mbuf, + int sizeofcpy, + uint8_t copy_by_ref) +{ + struct mbuf *m; + struct mbuf *appendchain; + caddr_t cp; + int len; + + if (endofchain == NULL) { + /* error */ +error_out: + if (outchain) + sctp_m_freem(outchain); + return (NULL); + } + if (can_take_mbuf) { + appendchain = clonechain; + } else { + if (!copy_by_ref && 
+ (sizeofcpy <= (int)((((SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) - 1) * MLEN) + MHLEN))) + ) { + /* Its not in a cluster */ + if (*endofchain == NULL) { + /* lets get a mbuf cluster */ + if (outchain == NULL) { + /* This is the general case */ + new_mbuf: + outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER); + if (outchain == NULL) { + goto error_out; + } + SCTP_BUF_LEN(outchain) = 0; + *endofchain = outchain; + /* get the prepend space */ + SCTP_BUF_RESV_UF(outchain, (SCTP_FIRST_MBUF_RESV + 4)); + } else { + /* + * We really should not get a NULL + * in endofchain + */ + /* find end */ + m = outchain; + while (m) { + if (SCTP_BUF_NEXT(m) == NULL) { + *endofchain = m; + break; + } + m = SCTP_BUF_NEXT(m); + } + /* sanity */ + if (*endofchain == NULL) { + /* + * huh, TSNH XXX maybe we + * should panic + */ + sctp_m_freem(outchain); + goto new_mbuf; + } + } + /* get the new end of length */ + len = M_TRAILINGSPACE(*endofchain); + } else { + /* how much is left at the end? */ + len = M_TRAILINGSPACE(*endofchain); + } + /* Find the end of the data, for appending */ + cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain))); + + /* Now lets copy it out */ + if (len >= sizeofcpy) { + /* It all fits, copy it in */ + m_copydata(clonechain, 0, sizeofcpy, cp); + SCTP_BUF_LEN((*endofchain)) += sizeofcpy; + } else { + /* fill up the end of the chain */ + if (len > 0) { + m_copydata(clonechain, 0, len, cp); + SCTP_BUF_LEN((*endofchain)) += len; + /* now we need another one */ + sizeofcpy -= len; + } + m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_HEADER); + if (m == NULL) { + /* We failed */ + goto error_out; + } + SCTP_BUF_NEXT((*endofchain)) = m; + *endofchain = m; + cp = mtod((*endofchain), caddr_t); + m_copydata(clonechain, len, sizeofcpy, cp); + SCTP_BUF_LEN((*endofchain)) += sizeofcpy; + } + return (outchain); + } else { + /* copy the old fashion way */ + appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_DONTWAIT); +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = appendchain; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + } + } + if (appendchain == NULL) { + /* error */ + if (outchain) + sctp_m_freem(outchain); + return (NULL); + } + if (outchain) { + /* tack on to the end */ + if (*endofchain != NULL) { + SCTP_BUF_NEXT(((*endofchain))) = appendchain; + } else { + m = outchain; + while (m) { + if (SCTP_BUF_NEXT(m) == NULL) { + SCTP_BUF_NEXT(m) = appendchain; + break; + } + m = SCTP_BUF_NEXT(m); + } + } + /* + * save off the end and update the end-chain postion + */ + m = appendchain; + while (m) { + if (SCTP_BUF_NEXT(m) == NULL) { + *endofchain = m; + break; + } + m = SCTP_BUF_NEXT(m); + } + return (outchain); + } else { + /* save off the end and update the end-chain postion */ + m = appendchain; + while (m) { + if (SCTP_BUF_NEXT(m) == NULL) { + *endofchain = m; + break; + } + m = SCTP_BUF_NEXT(m); + } + return (appendchain); + } +} + +int +sctp_med_chunk_output(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_association *asoc, + int *num_out, + int *reason_code, + int control_only, int from_where, + struct timeval *now, int *now_filled, int frag_point, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +static void +sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, + uint32_t val) +{ + 
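/* + * Iterator callback: the PCB iterator started by sctp_sendall() + * invokes this once for each matching association. The sctp_copy_all + * block passed in ptr carries the template mbuf chain and send info; + * each association gets its own copy of the data, so a failure on one + * association cannot disturb another. + */ + 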
struct sctp_copy_all *ca; + struct mbuf *m; + int ret = 0; + int added_control = 0; + int un_sent, do_chunk_output = 1; + struct sctp_association *asoc; + + ca = (struct sctp_copy_all *)ptr; + if (ca->m == NULL) { + return; + } + if (ca->inp != inp) { + /* TSNH */ + return; + } + if ((ca->m) && ca->sndlen) { + m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_DONTWAIT); + if (m == NULL) { + /* can't copy so we are done */ + ca->cnt_failed++; + return; + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = m; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + } else { + m = NULL; + } + SCTP_TCB_LOCK_ASSERT(stcb); + if (ca->sndrcv.sinfo_flags & SCTP_ABORT) { + /* Abort this assoc with m as the user defined reason */ + if (m) { + struct sctp_paramhdr *ph; + + SCTP_BUF_PREPEND(m, sizeof(struct sctp_paramhdr), M_DONTWAIT); + if (m) { + ph = mtod(m, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(ca->sndlen); + } + /* + * We add one here to keep the assoc from + * dis-appearing on us. + */ + atomic_add_int(&stcb->asoc.refcnt, 1); + sctp_abort_an_association(inp, stcb, + SCTP_RESPONSE_TO_USER_REQ, + m, SCTP_SO_NOT_LOCKED); + /* + * sctp_abort_an_association calls sctp_free_asoc() + * free association will NOT free it since we + * incremented the refcnt .. we do this to prevent + * it being freed and things getting tricky since we + * could end up (from free_asoc) calling inpcb_free + * which would get a recursive lock call to the + * iterator lock.. But as a consequence of that the + * stcb will return to us un-locked.. since + * free_asoc returns with either no TCB or the TCB + * unlocked, we must relock.. to unlock in the + * iterator timer :-0 + */ + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + goto no_chunk_output; + } + } else { + if (m) { + ret = sctp_msg_append(stcb, stcb->asoc.primary_destination, m, + &ca->sndrcv, 1); + } + asoc = &stcb->asoc; + if (ca->sndrcv.sinfo_flags & SCTP_EOF) { + /* shutdown this assoc */ + int cnt; + + cnt = sctp_is_there_unsent_data(stcb); + + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (cnt == 0)) { + if (asoc->locked_on_sending) { + goto abort_anyway; + } + /* + * there is nothing queued to send, so I'm + * done... + */ + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* + * only send SHUTDOWN the first time + * through + */ + sctp_send_shutdown(stcb, stcb->asoc.primary_destination); + if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + added_control = 1; + do_chunk_output = 0; + } + } else { + /* + * we still got (or just got) data to send, + * so set SHUTDOWN_PENDING + */ + /* + * XXX sockets draft says that SCTP_EOF + * should be sent with no data. 
currently, + * we will allow user data to be sent first + * and move to SHUTDOWN-PENDING + */ + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if (asoc->locked_on_sending) { + /* + * Locked to send out the + * data + */ + struct sctp_stream_queue_pending *sp; + + sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); + if (sp) { + if ((sp->length == 0) && (sp->msg_is_complete == 0)) + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + } + } + asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + abort_anyway: + atomic_add_int(&stcb->asoc.refcnt, 1); + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_RESPONSE_TO_USER_REQ, + NULL, SCTP_SO_NOT_LOCKED); + atomic_add_int(&stcb->asoc.refcnt, -1); + goto no_chunk_output; + } + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + } + } + + } + } + un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); + + if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && + (stcb->asoc.total_flight > 0) && + (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) + ) { + do_chunk_output = 0; + } + if (do_chunk_output) + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED); + else if (added_control) { + int num_out = 0, reason = 0, now_filled = 0; + struct timeval now; + int frag_point; + + frag_point = sctp_get_frag_point(stcb, &stcb->asoc); + (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, + &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED); + } +no_chunk_output: + if (ret) { + ca->cnt_failed++; + } else { + ca->cnt_sent++; + } +} + +static void +sctp_sendall_completes(void *ptr, uint32_t val) +{ + struct sctp_copy_all *ca; + + ca = (struct sctp_copy_all *)ptr; + /* + * Do a notify here? Kacheong suggests that the notify be done at + * the send time.. so you would push up a notification if any send + * failed. 
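+ * (By the time this completion runs the iterator has visited every + * association, and ca->cnt_sent / ca->cnt_failed hold the final + * tallies.) + *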
Don't know if this is feasible since the only failures we + have are "memory" related and if you cannot get an mbuf to send + the data you surely can't get an mbuf to send up to notify the + user you can't send the data :-> + */ + + /* now free everything */ + sctp_m_freem(ca->m); + SCTP_FREE(ca, SCTP_M_COPYAL); +} + + +#define MC_ALIGN(m, len) do { \ + SCTP_BUF_RESV_UF(m, ((MCLBYTES - (len)) & ~(sizeof(long) - 1))); \ +} while (0) + + + +static struct mbuf * +sctp_copy_out_all(struct uio *uio, int len) +{ + struct mbuf *ret, *at; + int left, willcpy, cancpy, error; + + ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAIT, 1, MT_DATA); + if (ret == NULL) { + /* TSNH */ + return (NULL); + } + left = len; + SCTP_BUF_LEN(ret) = 0; + /* save space for the data chunk header */ + cancpy = M_TRAILINGSPACE(ret); + willcpy = min(cancpy, left); + at = ret; + while (left > 0) { + /* Align data to the end */ + error = uiomove(mtod(at, caddr_t), willcpy, uio); + if (error) { + err_out_now: + sctp_m_freem(at); + return (NULL); + } + SCTP_BUF_LEN(at) = willcpy; + SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0; + left -= willcpy; + if (left > 0) { + SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg(left, 0, M_WAIT, 1, MT_DATA); + if (SCTP_BUF_NEXT(at) == NULL) { + goto err_out_now; + } + at = SCTP_BUF_NEXT(at); + SCTP_BUF_LEN(at) = 0; + cancpy = M_TRAILINGSPACE(at); + willcpy = min(cancpy, left); + } + } + return (ret); +} + +static int +sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, + struct sctp_sndrcvinfo *srcv) +{ + int ret; + struct sctp_copy_all *ca; + + SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all), + SCTP_M_COPYAL); + if (ca == NULL) { + sctp_m_freem(m); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + memset(ca, 0, sizeof(struct sctp_copy_all)); + + ca->inp = inp; + memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); + /* + * take off the sendall flag, it would be bad if we failed to do + * this :-0 + */ + ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; + /* get length and mbuf chain */ + if (uio) { + ca->sndlen = uio->uio_resid; + ca->m = sctp_copy_out_all(uio, ca->sndlen); + if (ca->m == NULL) { + SCTP_FREE(ca, SCTP_M_COPYAL); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + } else { + /* Gather the length of the send */ + struct mbuf *mat; + + mat = m; + ca->sndlen = 0; + while (m) { + ca->sndlen += SCTP_BUF_LEN(m); + m = SCTP_BUF_NEXT(m); + } + ca->m = mat; + } + ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL, + SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, + SCTP_ASOC_ANY_STATE, + (void *)ca, 0, + sctp_sendall_completes, inp, 1); + if (ret) { + SCTP_PRINTF("Failed to initiate iterator for sendall\n"); + SCTP_FREE(ca, SCTP_M_COPYAL); + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); + return (EFAULT); + } + return (0); +} + + +void +sctp_toss_old_cookies(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk, *nchk; + + chk = TAILQ_FIRST(&asoc->control_send_queue); + while (chk) { + nchk = TAILQ_NEXT(chk, sctp_next); + if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { + TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + asoc->ctrl_queue_cnt--; + sctp_free_a_chunk(stcb, chk); + } + chk = nchk; + } +} + +void +sctp_toss_old_asconf(struct sctp_tcb *stcb) +{ + struct sctp_association *asoc; + struct sctp_tmit_chunk 
*chk, *chk_tmp; + struct sctp_asconf_chunk *acp; + + asoc = &stcb->asoc; + for (chk = TAILQ_FIRST(&asoc->asconf_send_queue); chk != NULL; + chk = chk_tmp) { + /* get next chk */ + chk_tmp = TAILQ_NEXT(chk, sctp_next); + /* find SCTP_ASCONF chunk in queue */ + if (chk->rec.chunk_id.id == SCTP_ASCONF) { + if (chk->data) { + acp = mtod(chk->data, struct sctp_asconf_chunk *); + if (compare_with_wrap(ntohl(acp->serial_number), stcb->asoc.asconf_seq_out_acked, MAX_SEQ)) { + /* Not Acked yet */ + break; + } + } + TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + asoc->ctrl_queue_cnt--; + sctp_free_a_chunk(stcb, chk); + } + } +} + + +static void +sctp_clean_up_datalist(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_tmit_chunk **data_list, + int bundle_at, + struct sctp_nets *net) +{ + int i; + struct sctp_tmit_chunk *tp1; + + for (i = 0; i < bundle_at; i++) { + /* off of the send queue */ + TAILQ_REMOVE(&asoc->send_queue, data_list[i], sctp_next); + asoc->send_queue_cnt--; + if (i > 0) { + /* + * Any chunk NOT 0 you zap the time chunk 0 gets + * zapped or set based on if a RTO measurment is + * needed. + */ + data_list[i]->do_rtt = 0; + } + /* record time */ + data_list[i]->sent_rcv_time = net->last_sent_time; + data_list[i]->rec.data.fast_retran_tsn = data_list[i]->rec.data.TSN_seq; + if (data_list[i]->whoTo == NULL) { + data_list[i]->whoTo = net; + atomic_add_int(&net->ref_count, 1); + } + /* on to the sent queue */ + tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead); + if ((tp1) && (compare_with_wrap(tp1->rec.data.TSN_seq, + data_list[i]->rec.data.TSN_seq, MAX_TSN))) { + struct sctp_tmit_chunk *tpp; + + /* need to move back */ + back_up_more: + tpp = TAILQ_PREV(tp1, sctpchunk_listhead, sctp_next); + if (tpp == NULL) { + TAILQ_INSERT_BEFORE(tp1, data_list[i], sctp_next); + goto all_done; + } + tp1 = tpp; + if (compare_with_wrap(tp1->rec.data.TSN_seq, + data_list[i]->rec.data.TSN_seq, MAX_TSN)) { + goto back_up_more; + } + TAILQ_INSERT_AFTER(&asoc->sent_queue, tp1, data_list[i], sctp_next); + } else { + TAILQ_INSERT_TAIL(&asoc->sent_queue, + data_list[i], + sctp_next); + } +all_done: + /* This does not lower until the cum-ack passes it */ + asoc->sent_queue_cnt++; + if ((asoc->peers_rwnd <= 0) && + (asoc->total_flight == 0) && + (bundle_at == 1)) { + /* Mark the chunk as being a window probe */ + SCTP_STAT_INCR(sctps_windowprobed); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xC2, 3); +#endif + data_list[i]->sent = SCTP_DATAGRAM_SENT; + data_list[i]->snd_count = 1; + data_list[i]->rec.data.chunk_was_revoked = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_UP, + data_list[i]->whoTo->flight_size, + data_list[i]->book_size, + (uintptr_t) data_list[i]->whoTo, + data_list[i]->rec.data.TSN_seq); + } + sctp_flight_size_increase(data_list[i]); + sctp_total_flight_increase(stcb, data_list[i]); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, + asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); + } + asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, + (uint32_t) (data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + } +} + +static void +sctp_clean_up_ctl(struct sctp_tcb *stcb, struct 
sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk, *nchk; + + for (chk = TAILQ_FIRST(&asoc->control_send_queue); + chk; chk = nchk) { + nchk = TAILQ_NEXT(chk, sctp_next); + if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || + (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ + (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || + (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || + (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) || + (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || + (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || + (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || + (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || + (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || + (chk->rec.chunk_id.id == SCTP_ECN_CWR) || + (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { + /* Stray chunks must be cleaned up */ + clean_up_anyway: + TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + asoc->ctrl_queue_cnt--; + if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) + asoc->fwd_tsn_cnt--; + sctp_free_a_chunk(stcb, chk); + } else if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { + /* special handling, we must look into the param */ + if (chk != asoc->str_reset) { + goto clean_up_anyway; + } + } + } +} + + +static int +sctp_can_we_split_this(struct sctp_tcb *stcb, + uint32_t length, + uint32_t goal_mtu, uint32_t frag_point, int eeor_on) +{ + /* + * Make a decision on if I should split a msg into multiple parts. + * This is only asked of incomplete messages. + */ + if (eeor_on) { + /* + * If we are doing EEOR we need to always send it if its the + * entire thing, since it might be all the guy is putting in + * the hopper. + */ + if (goal_mtu >= length) { + /*- + * If we have data outstanding, + * we get another chance when the sack + * arrives to transmit - wait for more data + */ + if (stcb->asoc.total_flight == 0) { + /* + * If nothing is in flight, we zero the + * packet counter. + */ + return (length); + } + return (0); + + } else { + /* You can fill the rest */ + return (goal_mtu); + } + } + /*- + * For those strange folk that make the send buffer + * smaller than our fragmentation point, we can't + * get a full msg in so we have to allow splitting. + */ + if (SCTP_SB_LIMIT_SND(stcb->sctp_socket) < frag_point) { + return (length); + } + if ((length <= goal_mtu) || + ((length - goal_mtu) < SCTP_BASE_SYSCTL(sctp_min_residual))) { + /* Sub-optimial residual don't split in non-eeor mode. */ + return (0); + } + /* + * If we reach here length is larger than the goal_mtu. Do we wish + * to split it for the sake of packet putting together? 
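+ * We split only when at least the smaller of the minimum split point + * and the fragmentation point fits into the packet being built, and + * we then hand out at most frag_point bytes.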
+ */ + if (goal_mtu >= min(SCTP_BASE_SYSCTL(sctp_min_split_point), frag_point)) { + /* Its ok to split it */ + return (min(goal_mtu, frag_point)); + } + /* Nope, can't split */ + return (0); + +} + +static uint32_t +sctp_move_to_outqueue(struct sctp_tcb *stcb, + struct sctp_stream_out *strq, + uint32_t goal_mtu, + uint32_t frag_point, + int *locked, + int *giveup, + int eeor_mode, + int *bail) +{ + /* Move from the stream to the send_queue keeping track of the total */ + struct sctp_association *asoc; + struct sctp_stream_queue_pending *sp; + struct sctp_tmit_chunk *chk; + struct sctp_data_chunk *dchkh; + uint32_t to_move, length; + uint8_t rcv_flags = 0; + uint8_t some_taken; + uint8_t send_lock_up = 0; + + SCTP_TCB_LOCK_ASSERT(stcb); + asoc = &stcb->asoc; +one_more_time: + /* sa_ignore FREED_MEMORY */ + sp = TAILQ_FIRST(&strq->outqueue); + if (sp == NULL) { + *locked = 0; + if (send_lock_up == 0) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + sp = TAILQ_FIRST(&strq->outqueue); + if (sp) { + goto one_more_time; + } + if (strq->last_msg_incomplete) { + SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n", + strq->stream_no, + strq->last_msg_incomplete); + strq->last_msg_incomplete = 0; + } + to_move = 0; + if (send_lock_up) { + SCTP_TCB_SEND_UNLOCK(stcb); + send_lock_up = 0; + } + goto out_of; + } + if ((sp->msg_is_complete) && (sp->length == 0)) { + if (sp->sender_all_done) { + /* + * We are doing differed cleanup. Last time through + * when we took all the data the sender_all_done was + * not set. + */ + if ((sp->put_last_out == 0) && (sp->discard_rest == 0)) { + SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); + SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", + sp->sender_all_done, + sp->length, + sp->msg_is_complete, + sp->put_last_out, + send_lock_up); + } + if ((TAILQ_NEXT(sp, next) == NULL) && (send_lock_up == 0)) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + atomic_subtract_int(&asoc->stream_queue_cnt, 1); + TAILQ_REMOVE(&strq->outqueue, sp, next); + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + } + sctp_free_a_strmoq(stcb, sp); + /* we can't be locked to it */ + *locked = 0; + stcb->asoc.locked_on_sending = NULL; + if (send_lock_up) { + SCTP_TCB_SEND_UNLOCK(stcb); + send_lock_up = 0; + } + /* back to get the next msg */ + goto one_more_time; + } else { + /* + * sender just finished this but still holds a + * reference + */ + *locked = 1; + *giveup = 1; + to_move = 0; + goto out_of; + } + } else { + /* is there some to get */ + if (sp->length == 0) { + /* no */ + *locked = 1; + *giveup = 1; + to_move = 0; + goto out_of; + } else if (sp->discard_rest) { + if (send_lock_up == 0) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + /* Whack down the size */ + atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length); + if ((stcb->sctp_socket != NULL) && \ + ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { + atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length); + } + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + sp->tail_mbuf = NULL; + } + sp->length = 0; + sp->some_taken = 1; + *locked = 1; + *giveup = 1; + to_move = 0; + goto out_of; + } + } + some_taken = sp->some_taken; + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + sp->msg_is_complete = 1; + } +re_look: + length = sp->length; + if (sp->msg_is_complete) { + 
/* The message is complete */ + to_move = min(length, frag_point); + if (to_move == length) { + /* All of it fits in the MTU */ + if (sp->some_taken) { + rcv_flags |= SCTP_DATA_LAST_FRAG; + sp->put_last_out = 1; + } else { + rcv_flags |= SCTP_DATA_NOT_FRAG; + sp->put_last_out = 1; + } + } else { + /* Not all of it fits, we fragment */ + if (sp->some_taken == 0) { + rcv_flags |= SCTP_DATA_FIRST_FRAG; + } + sp->some_taken = 1; + } + } else { + to_move = sctp_can_we_split_this(stcb, length, goal_mtu, frag_point, eeor_mode); + if (to_move) { + /*- + * We use a snapshot of length in case it + * is expanding during the compare. + */ + uint32_t llen; + + llen = length; + if (to_move >= llen) { + to_move = llen; + if (send_lock_up == 0) { + /*- + * We are taking all of an incomplete msg + * thus we need a send lock. + */ + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + if (sp->msg_is_complete) { + /* + * the sender finished the + * msg + */ + goto re_look; + } + } + } + if (sp->some_taken == 0) { + rcv_flags |= SCTP_DATA_FIRST_FRAG; + sp->some_taken = 1; + } + } else { + /* Nothing to take. */ + if (sp->some_taken) { + *locked = 1; + } + *giveup = 1; + to_move = 0; + goto out_of; + } + } + + /* If we reach here, we can copy out a chunk */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* No chunk memory */ + *giveup = 1; + to_move = 0; + goto out_of; + } + /* + * Setup for unordered if needed by looking at the user sent info + * flags. + */ + if (sp->sinfo_flags & SCTP_UNORDERED) { + rcv_flags |= SCTP_DATA_UNORDERED; + } + if ((SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && ((sp->sinfo_flags & SCTP_EOF) == SCTP_EOF)) || + ((sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) == SCTP_SACK_IMMEDIATELY)) { + rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; + } + /* clear out the chunk before setting up */ + memset(chk, 0, sizeof(*chk)); + chk->rec.data.rcv_flags = rcv_flags; + + if (to_move >= length) { + /* we think we can steal the whole thing */ + if ((sp->sender_all_done == 0) && (send_lock_up == 0)) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + if (to_move < sp->length) { + /* bail, it changed */ + goto dont_do_it; + } + chk->data = sp->data; + chk->last_mbuf = sp->tail_mbuf; + /* register the stealing */ + sp->data = sp->tail_mbuf = NULL; + } else { + struct mbuf *m; + +dont_do_it: + chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_DONTWAIT); + chk->last_mbuf = NULL; + if (chk->data == NULL) { + sp->some_taken = some_taken; + sctp_free_a_chunk(stcb, chk); + *bail = 1; + to_move = 0; + goto out_of; + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = chk->data; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + /* Pull off the data */ + m_adj(sp->data, to_move); + /* Now lets work our way down and compact it */ + m = sp->data; + while (m && (SCTP_BUF_LEN(m) == 0)) { + sp->data = SCTP_BUF_NEXT(m); + SCTP_BUF_NEXT(m) = NULL; + if (sp->tail_mbuf == m) { + /*- + * Freeing tail? TSNH since + * we supposedly were taking less + * than the sp->length. + */ +#ifdef INVARIANTS + panic("Huh, freing tail? - TSNH"); +#else + SCTP_PRINTF("Huh, freeing tail? 
- TSNH\n"); + sp->tail_mbuf = sp->data = NULL; + sp->length = 0; +#endif + + } + sctp_m_free(m); + m = sp->data; + } + } + if (SCTP_BUF_IS_EXTENDED(chk->data)) { + chk->copy_by_ref = 1; + } else { + chk->copy_by_ref = 0; + } + /* + * get last_mbuf and counts of mb useage This is ugly but hopefully + * its only one mbuf. + */ + if (chk->last_mbuf == NULL) { + chk->last_mbuf = chk->data; + while (SCTP_BUF_NEXT(chk->last_mbuf) != NULL) { + chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf); + } + } + if (to_move > length) { + /*- This should not happen either + * since we always lower to_move to the size + * of sp->length if its larger. + */ +#ifdef INVARIANTS + panic("Huh, how can to_move be larger?"); +#else + SCTP_PRINTF("Huh, how can to_move be larger?\n"); + sp->length = 0; +#endif + } else { + atomic_subtract_int(&sp->length, to_move); + } + if (M_LEADINGSPACE(chk->data) < (int)sizeof(struct sctp_data_chunk)) { + /* Not enough room for a chunk header, get some */ + struct mbuf *m; + + m = sctp_get_mbuf_for_msg(1, 0, M_DONTWAIT, 0, MT_DATA); + if (m == NULL) { + /* + * we're in trouble here. _PREPEND below will free + * all the data if there is no leading space, so we + * must put the data back and restore. + */ + if (send_lock_up == 0) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + if (chk->data == NULL) { + /* unsteal the data */ + sp->data = chk->data; + sp->tail_mbuf = chk->last_mbuf; + } else { + struct mbuf *m_tmp; + + /* reassemble the data */ + m_tmp = sp->data; + sp->data = chk->data; + SCTP_BUF_NEXT(chk->last_mbuf) = m_tmp; + } + sp->some_taken = some_taken; + atomic_add_int(&sp->length, to_move); + chk->data = NULL; + *bail = 1; + sctp_free_a_chunk(stcb, chk); + to_move = 0; + goto out_of; + } else { + SCTP_BUF_LEN(m) = 0; + SCTP_BUF_NEXT(m) = chk->data; + chk->data = m; + M_ALIGN(chk->data, 4); + } + } + SCTP_BUF_PREPEND(chk->data, sizeof(struct sctp_data_chunk), M_DONTWAIT); + if (chk->data == NULL) { + /* HELP, TSNH since we assured it would not above? */ +#ifdef INVARIANTS + panic("prepend failes HELP?"); +#else + SCTP_PRINTF("prepend fails HELP?\n"); + sctp_free_a_chunk(stcb, chk); +#endif + *bail = 1; + to_move = 0; + goto out_of; + } + sctp_snd_sb_alloc(stcb, sizeof(struct sctp_data_chunk)); + chk->book_size = chk->send_size = (to_move + sizeof(struct sctp_data_chunk)); + chk->book_size_scale = 0; + chk->sent = SCTP_DATAGRAM_UNSENT; + + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->pad_inplace = 0; + chk->no_fr_allowed = 0; + chk->rec.data.stream_seq = sp->strseq; + chk->rec.data.stream_number = sp->stream; + chk->rec.data.payloadtype = sp->ppid; + chk->rec.data.context = sp->context; + chk->rec.data.doing_fast_retransmit = 0; + chk->rec.data.ect_nonce = 0; /* ECN Nonce */ + + chk->rec.data.timetodrop = sp->ts; + chk->flags = sp->act_flags; + + if (sp->net) { + chk->whoTo = sp->net; + atomic_add_int(&chk->whoTo->ref_count, 1); + } else + chk->whoTo = NULL; + + if (sp->holds_key_ref) { + chk->auth_keyid = sp->auth_keyid; + sctp_auth_key_acquire(stcb, chk->auth_keyid); + chk->holds_key_ref = 1; + } + chk->rec.data.TSN_seq = atomic_fetchadd_int(&asoc->sending_seq, 1); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) { + sctp_misc_ints(SCTP_STRMOUT_LOG_SEND, + (uintptr_t) stcb, sp->length, + (uint32_t) ((chk->rec.data.stream_number << 16) | chk->rec.data.stream_seq), + chk->rec.data.TSN_seq); + } + dchkh = mtod(chk->data, struct sctp_data_chunk *); + /* + * Put the rest of the things in place now. 
Size was done earlier in + * previous loop prior to padding. + */ + +#ifdef SCTP_ASOCLOG_OF_TSNS + SCTP_TCB_LOCK_ASSERT(stcb); + if (asoc->tsn_out_at >= SCTP_TSN_LOG_SIZE) { + asoc->tsn_out_at = 0; + asoc->tsn_out_wrapped = 1; + } + asoc->out_tsnlog[asoc->tsn_out_at].tsn = chk->rec.data.TSN_seq; + asoc->out_tsnlog[asoc->tsn_out_at].strm = chk->rec.data.stream_number; + asoc->out_tsnlog[asoc->tsn_out_at].seq = chk->rec.data.stream_seq; + asoc->out_tsnlog[asoc->tsn_out_at].sz = chk->send_size; + asoc->out_tsnlog[asoc->tsn_out_at].flgs = chk->rec.data.rcv_flags; + asoc->out_tsnlog[asoc->tsn_out_at].stcb = (void *)stcb; + asoc->out_tsnlog[asoc->tsn_out_at].in_pos = asoc->tsn_out_at; + asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2; + asoc->tsn_out_at++; +#endif + + dchkh->ch.chunk_type = SCTP_DATA; + dchkh->ch.chunk_flags = chk->rec.data.rcv_flags; + dchkh->dp.tsn = htonl(chk->rec.data.TSN_seq); + dchkh->dp.stream_id = htons(strq->stream_no); + dchkh->dp.stream_sequence = htons(chk->rec.data.stream_seq); + dchkh->dp.protocol_id = chk->rec.data.payloadtype; + dchkh->ch.chunk_length = htons(chk->send_size); + /* Now advance the chk->send_size by the actual pad needed. */ + if (chk->send_size < SCTP_SIZE32(chk->book_size)) { + /* need a pad */ + struct mbuf *lm; + int pads; + + pads = SCTP_SIZE32(chk->book_size) - chk->send_size; + if (sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf) == 0) { + chk->pad_inplace = 1; + } + if ((lm = SCTP_BUF_NEXT(chk->last_mbuf)) != NULL) { + /* pad added an mbuf */ + chk->last_mbuf = lm; + } + chk->send_size += pads; + } + /* We only re-set the policy if it is on */ + if (sp->pr_sctp_on) { + sctp_set_prsctp_policy(sp); + asoc->pr_sctp_cnt++; + chk->pr_sctp_on = 1; + } else { + chk->pr_sctp_on = 0; + } + if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) { + /* All done pull and kill the message */ + atomic_subtract_int(&asoc->stream_queue_cnt, 1); + if (sp->put_last_out == 0) { + SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n"); + SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", + sp->sender_all_done, + sp->length, + sp->msg_is_complete, + sp->put_last_out, + send_lock_up); + } + if ((send_lock_up == 0) && (TAILQ_NEXT(sp, next) == NULL)) { + SCTP_TCB_SEND_LOCK(stcb); + send_lock_up = 1; + } + TAILQ_REMOVE(&strq->outqueue, sp, next); + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + } + sctp_free_a_strmoq(stcb, sp); + + /* we can't be locked to it */ + *locked = 0; + stcb->asoc.locked_on_sending = NULL; + } else { + /* more to go, we are locked */ + *locked = 1; + } + asoc->chunks_on_out_queue++; + TAILQ_INSERT_TAIL(&asoc->send_queue, chk, sctp_next); + asoc->send_queue_cnt++; +out_of: + if (send_lock_up) { + SCTP_TCB_SEND_UNLOCK(stcb); + send_lock_up = 0; + } + return (to_move); +} + + +static struct sctp_stream_out * +sctp_select_a_stream(struct sctp_tcb *stcb, struct sctp_association *asoc) +{ + struct sctp_stream_out *strq; + + /* Find the next stream to use */ + if (asoc->last_out_stream == NULL) { + strq = TAILQ_FIRST(&asoc->out_wheel); + } else { + strq = TAILQ_NEXT(asoc->last_out_stream, next_spoke); + if (strq == NULL) { + strq = TAILQ_FIRST(&asoc->out_wheel); + } + } + return (strq); +} + + +static void +sctp_fill_outqueue(struct sctp_tcb *stcb, + struct sctp_nets *net, int frag_point, int eeor_mode, int *quit_now) +{ + struct sctp_association *asoc; + struct sctp_stream_out *strq, *strqn; + int goal_mtu, 
moved_how_much, total_moved = 0, bail = 0; + int locked, giveup; + struct sctp_stream_queue_pending *sp; + + SCTP_TCB_LOCK_ASSERT(stcb); + asoc = &stcb->asoc; +#ifdef INET6 + if (net->ro._l_addr.sin6.sin6_family == AF_INET6) { + goal_mtu = net->mtu - SCTP_MIN_OVERHEAD; + } else { + /* ?? not sure what else to do */ + goal_mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; + } +#else + goal_mtu = net->mtu - SCTP_MIN_OVERHEAD; +#endif + /* Need an allowance for the data chunk header too */ + goal_mtu -= sizeof(struct sctp_data_chunk); + + /* must make even word boundary */ + goal_mtu &= 0xfffffffc; + if (asoc->locked_on_sending) { + /* We are stuck on one stream until the message completes. */ + strq = asoc->locked_on_sending; + locked = 1; + } else { + strq = sctp_select_a_stream(stcb, asoc); + locked = 0; + } + strqn = strq; + while ((goal_mtu > 0) && strq) { + sp = TAILQ_FIRST(&strq->outqueue); + if (sp == NULL) { + break; + } + /** + * Honor the users' choice if given. If not given, + * pull it only to the primary path in case of not using + * CMT. + */ + if (((sp->net != NULL) && + (sp->net != net)) || + ((sp->net == NULL) && + (asoc->sctp_cmt_on_off == 0) && + (asoc->primary_destination != net))) { + /* Do not pull to this network */ + if (locked) { + break; + } else { + strq = sctp_select_a_stream(stcb, asoc); + if (strq == NULL) + /* none left */ + break; + if (strqn == strq) { + /* I have circled */ + break; + } + continue; + } + } + giveup = 0; + bail = 0; + moved_how_much = sctp_move_to_outqueue(stcb, strq, goal_mtu, frag_point, &locked, + &giveup, eeor_mode, &bail); + if (moved_how_much) + asoc->last_out_stream = strq; + + if (locked) { + asoc->locked_on_sending = strq; + if ((moved_how_much == 0) || (giveup) || bail) + /* no more to move for now */ + break; + } else { + asoc->locked_on_sending = NULL; + if (TAILQ_EMPTY(&strq->outqueue)) { + if (strq == strqn) { + /* Must move start to next one */ + strqn = TAILQ_NEXT(strq, next_spoke); + if (strqn == NULL) { + strqn = TAILQ_FIRST(&asoc->out_wheel); + if (strqn == NULL) { + break; + } + } + } + sctp_remove_from_wheel(stcb, asoc, strq, 0); + } + if ((giveup) || bail) { + break; + } + strq = sctp_select_a_stream(stcb, asoc); + if (strq == NULL) { + break; + } + } + total_moved += moved_how_much; + goal_mtu -= (moved_how_much + sizeof(struct sctp_data_chunk)); + goal_mtu &= 0xfffffffc; + } + if (bail) + *quit_now = 1; + + if (total_moved == 0) { + if ((stcb->asoc.sctp_cmt_on_off == 0) && + (net == stcb->asoc.primary_destination)) { + /* ran dry for primary network net */ + SCTP_STAT_INCR(sctps_primary_randry); + } else if (stcb->asoc.sctp_cmt_on_off == 1) { + /* ran dry with CMT on */ + SCTP_STAT_INCR(sctps_cmt_randry); + } + } +} + +void +sctp_fix_ecn_echo(struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { + chk->sent = SCTP_DATAGRAM_UNSENT; + } + } +} + +void +sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sctp_association *asoc; + struct sctp_stream_out *outs; + struct sctp_tmit_chunk *chk; + struct sctp_stream_queue_pending *sp; + + if (net == NULL) { + return; + } + asoc = &stcb->asoc; + TAILQ_FOREACH(outs, &asoc->out_wheel, next_spoke) { + TAILQ_FOREACH(sp, &outs->outqueue, next) { + if (sp->net == net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + } + } + TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { + if (chk->whoTo == net) { + 
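For reference, the goal_mtu computation at the top of sctp_fill_outqueue works out as follows for a plain IPv4 path; the overhead constants below are illustrative of SCTP_MIN_V4_OVERHEAD (20-byte IP header plus 12-byte SCTP common header) and the 16-byte DATA chunk header:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t net_mtu = 1500;
        uint32_t v4_overhead = 20 + 12;    /* IP header + SCTP common header */
        uint32_t data_chunk_hdr = 16;      /* sizeof(struct sctp_data_chunk) */
        uint32_t goal_mtu;

        goal_mtu = net_mtu - v4_overhead - data_chunk_hdr;
        goal_mtu &= 0xfffffffc;            /* force a 4-byte word boundary */
        printf("goal_mtu=%u\n", goal_mtu); /* 1452 */
        return (0);
    }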
sctp_free_remote_addr(chk->whoTo); + chk->whoTo = NULL; + } + } +} + +int +sctp_med_chunk_output(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_association *asoc, + int *num_out, + int *reason_code, + int control_only, int from_where, + struct timeval *now, int *now_filled, int frag_point, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + /* + * Ok, this is the generic chunk service queue. We must do the + * following: - Service the stream queue that is next, moving any + * message (note I must get a complete message i.e. FIRST/MIDDLE and + * LAST to the out queue in one pass) and assigning TSN's - Check to + * see if the cwnd/rwnd allows any output, if so we go ahead and + * formulate and send the low level chunks. Making sure to combine + * any control in the control chunk queue also. + */ + struct sctp_nets *net, *start_at, *old_start_at = NULL; + struct mbuf *outchain, *endoutchain; + struct sctp_tmit_chunk *chk, *nchk; + + /* temp arrays for unlinking */ + struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; + int no_fragmentflg, error; + unsigned int max_rwnd_per_dest, max_send_per_dest; + int one_chunk, hbflag, skip_data_for_this_net; + int asconf, cookie, no_out_cnt; + int bundle_at, ctl_cnt, no_data_chunks, eeor_mode; + unsigned int mtu, r_mtu, omtu, mx_mtu, to_out; + int tsns_sent = 0; + uint32_t auth_offset = 0; + struct sctp_auth_chunk *auth = NULL; + uint16_t auth_keyid; + int override_ok = 1; + int data_auth_reqd = 0; + + /* + * JRS 5/14/07 - Add flag for whether a heartbeat is sent to the + * destination. + */ + int pf_hbflag = 0; + int quit_now = 0; + + *num_out = 0; + auth_keyid = stcb->asoc.authinfo.active_keyid; + + if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || + (asoc->state & SCTP_STATE_SHUTDOWN_RECEIVED) || + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { + eeor_mode = 1; + } else { + eeor_mode = 0; + } + ctl_cnt = no_out_cnt = asconf = cookie = 0; + /* + * First let's prime the pump. For each destination, if there is room + * in the flight size, attempt to pull an MTU's worth out of the + * stream queues into the general send_queue + */ +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xC2, 2); +#endif + SCTP_TCB_LOCK_ASSERT(stcb); + hbflag = 0; + if ((control_only) || (asoc->stream_reset_outstanding)) + no_data_chunks = 1; + else + no_data_chunks = 0; + + /* Nothing possible to send? */ + if (TAILQ_EMPTY(&asoc->control_send_queue) && + TAILQ_EMPTY(&asoc->asconf_send_queue) && + TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->out_wheel)) { + *reason_code = 9; + return (0); + } + if (asoc->peers_rwnd == 0) { + /* No room in peers rwnd */ + *reason_code = 1; + if (asoc->total_flight > 0) { + /* we are allowed one chunk in flight */ + no_data_chunks = 1; + } + } + max_rwnd_per_dest = ((asoc->peers_rwnd + asoc->total_flight) / asoc->numnets); + if (stcb->sctp_socket) + max_send_per_dest = SCTP_SB_LIMIT_SND(stcb->sctp_socket) / asoc->numnets; + else + max_send_per_dest = 0; + if ((no_data_chunks == 0) && (!TAILQ_EMPTY(&asoc->out_wheel))) { + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + /* + * This for loop takes in each net; if it's got + * space in cwnd and has data sent to it + * (when CMT is off) then it calls + * sctp_fill_outqueue for the net. This gets data on + * the send queue for that network. + * + * In sctp_fill_outqueue TSN's are assigned and data is + * copied out of the stream buffers. Note mostly + * copy by reference (we hope). 
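The two fair-share caps computed above split the peer's window and the local send buffer evenly across destinations; with illustrative numbers:

    #include <stdio.h>

    int
    main(void)
    {
        unsigned int peers_rwnd = 60000, total_flight = 12000;
        unsigned int sndbuf_limit = 262144, numnets = 3;

        printf("max_rwnd_per_dest=%u\n",
            (peers_rwnd + total_flight) / numnets);    /* 24000 */
        printf("max_send_per_dest=%u\n",
            sndbuf_limit / numnets);                   /* 87381 */
        return (0);
    }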
+ */ + net->window_probe = 0; + if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) || + (net->dest_state & SCTP_ADDR_UNCONFIRMED)) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 1, + SCTP_CWND_LOG_FILL_OUTQ_CALLED); + } + continue; + } + if ((asoc->sctp_cmt_on_off == 0) && + (asoc->primary_destination != net) && + (net->ref_count < 2)) { + /* nothing can be in queue for this guy */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 2, + SCTP_CWND_LOG_FILL_OUTQ_CALLED); + } + continue; + } + if (net->flight_size >= net->cwnd) { + /* skip this network, no room - can't fill */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 3, + SCTP_CWND_LOG_FILL_OUTQ_CALLED); + } + continue; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, 4, SCTP_CWND_LOG_FILL_OUTQ_CALLED); + } + sctp_fill_outqueue(stcb, net, frag_point, eeor_mode, &quit_now); + if (quit_now) { + /* memory alloc failure */ + no_data_chunks = 1; + break; + } + } + } + /* now service each destination and send out what we can for it */ + /* Nothing to send? */ + if (TAILQ_EMPTY(&asoc->control_send_queue) && + TAILQ_EMPTY(&asoc->asconf_send_queue) && + TAILQ_EMPTY(&asoc->send_queue)) { + *reason_code = 8; + return (0); + } + if (asoc->sctp_cmt_on_off == 1) { + /* get the last start point */ + start_at = asoc->last_net_cmt_send_started; + if (start_at == NULL) { + /* null so to beginning */ + start_at = TAILQ_FIRST(&asoc->nets); + } else { + start_at = TAILQ_NEXT(asoc->last_net_cmt_send_started, sctp_next); + if (start_at == NULL) { + start_at = TAILQ_FIRST(&asoc->nets); + } + } + asoc->last_net_cmt_send_started = start_at; + } else { + start_at = TAILQ_FIRST(&asoc->nets); + } + old_start_at = NULL; +again_one_more_time: + for (net = start_at; net != NULL; net = TAILQ_NEXT(net, sctp_next)) { + /* how much can we send? */ + /* SCTPDBG("Examine for sending net:%x\n", (uint32_t)net); */ + if (old_start_at && (old_start_at == net)) { + /* through list ocmpletely. */ + break; + } + tsns_sent = 0xa; + if ((asoc->sctp_cmt_on_off == 0) && + (asoc->primary_destination != net) && + (net->ref_count < 2)) { + /* + * Ref-count of 1 so we cannot have data or control + * queued to this address. Skip it (non-CMT). + */ + continue; + } + if (TAILQ_EMPTY(&asoc->control_send_queue) && + TAILQ_EMPTY(&asoc->asconf_send_queue) && + (net->flight_size >= net->cwnd)) { + /* + * Nothing on control or asconf and flight is full, + * we can skip even in the CMT case. 
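The three skip conditions in the priming loop above condense to the following sketch (field names illustrative, not the kernel structures):

    struct netinfo {
        unsigned int flight_size, cwnd, ref_count;
        int unreachable, unconfirmed, is_primary;
    };

    int
    skip_this_net(const struct netinfo *n, int cmt_on)
    {
        if (n->unreachable || n->unconfirmed)
            return (1);    /* no queueing to a dead or unproven path */
        if (!cmt_on && !n->is_primary && n->ref_count < 2)
            return (1);    /* nothing can be queued here (non-CMT) */
        if (n->flight_size >= n->cwnd)
            return (1);    /* congestion window is full */
        return (0);
    }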
+ */ + continue; + } + ctl_cnt = bundle_at = 0; + endoutchain = outchain = NULL; + no_fragmentflg = 1; + one_chunk = 0; + if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { + skip_data_for_this_net = 1; + } else { + skip_data_for_this_net = 0; + } + if ((net->ro.ro_rt) && (net->ro.ro_rt->rt_ifp)) { + /* + * if we have a route and an ifp check to see if we + * have room to send to this guy + */ + struct ifnet *ifp; + + ifp = net->ro.ro_rt->rt_ifp; + if ((ifp->if_snd.ifq_len + 2) >= ifp->if_snd.ifq_maxlen) { + SCTP_STAT_INCR(sctps_ifnomemqueued); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { + sctp_log_maxburst(stcb, net, ifp->if_snd.ifq_len, ifp->if_snd.ifq_maxlen, SCTP_MAX_IFP_APPLIED); + } + continue; + } + } + switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { + case AF_INET: + mtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr)); + break; +#ifdef INET6 + case AF_INET6: + mtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)); + break; +#endif + default: + /* TSNH */ + mtu = net->mtu; + break; + } + mx_mtu = mtu; + to_out = 0; + if (mtu > asoc->peers_rwnd) { + if (asoc->total_flight > 0) { + /* We have a packet in flight somewhere */ + r_mtu = asoc->peers_rwnd; + } else { + /* We are always allowed to send one MTU out */ + one_chunk = 1; + r_mtu = mtu; + } + } else { + r_mtu = mtu; + } + /************************/ + /* ASCONF transmission */ + /************************/ + /* Now first lets go through the asconf queue */ + for (chk = TAILQ_FIRST(&asoc->asconf_send_queue); + chk; chk = nchk) { + nchk = TAILQ_NEXT(chk, sctp_next); + if (chk->rec.chunk_id.id != SCTP_ASCONF) { + continue; + } + if (chk->whoTo != net) { + /* + * No, not sent to the network we are + * looking at + */ + break; + } + if (chk->data == NULL) { + break; + } + if (chk->sent != SCTP_DATAGRAM_UNSENT && + chk->sent != SCTP_DATAGRAM_RESEND) { + break; + } + /* + * if no AUTH is yet included and this chunk + * requires it, make sure to account for it. We + * don't apply the size until the AUTH chunk is + * actually added below in case there is no room for + * this chunk. NOTE: we overload the use of "omtu" + * here + */ + if ((auth == NULL) && + sctp_auth_is_required_chunk(chk->rec.chunk_id.id, + stcb->asoc.peer_auth_chunks)) { + omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } else + omtu = 0; + /* Here we do NOT factor the r_mtu */ + if ((chk->send_size < (int)(mtu - omtu)) || + (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { + /* + * We probably should glom the mbuf chain + * from the chk->data for control but the + * problem is it becomes yet one more level + * of tracking to do if for some reason + * output fails. Then I have got to + * reconstruct the merged control chain.. el + * yucko.. 
for now we take the easy way and + * do the copy + */ + /* + * Add an AUTH chunk, if chunk requires it + * save the offset into the chain for AUTH + */ + if ((auth == NULL) && + (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, + stcb->asoc.peer_auth_chunks))) { + outchain = sctp_add_auth_chunk(outchain, + &endoutchain, + &auth, + &auth_offset, + stcb, + chk->rec.chunk_id.id); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } + outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, + (int)chk->rec.chunk_id.can_take_data, + chk->send_size, chk->copy_by_ref); + if (outchain == NULL) { + *reason_code = 8; + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + /* update our MTU size */ + if (mtu > (chk->send_size + omtu)) + mtu -= (chk->send_size + omtu); + else + mtu = 0; + to_out += (chk->send_size + omtu); + /* Do clear IP_DF ? */ + if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { + no_fragmentflg = 0; + } + if (chk->rec.chunk_id.can_take_data) + chk->data = NULL; + /* + * set hb flag since we can use these for + * RTO + */ + hbflag = 1; + asconf = 1; + /* + * should sysctl this: don't bundle data + * with ASCONF since it requires AUTH + */ + no_data_chunks = 1; + chk->sent = SCTP_DATAGRAM_SENT; + chk->snd_count++; + if (mtu == 0) { + /* + * Ok we are out of room but we can + * output without effecting the + * flight size since this little guy + * is a control only packet. + */ + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); + /* + * do NOT clear the asconf flag as + * it is used to do appropriate + * source address selection. + */ + if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, + (struct sockaddr *)&net->ro._l_addr, + outchain, auth_offset, auth, + stcb->asoc.authinfo.active_keyid, + no_fragmentflg, 0, NULL, asconf, + inp->sctp_lport, stcb->rport, + htonl(stcb->asoc.peer_vtag), + net->port, so_locked, NULL))) { + if (error == ENOBUFS) { + asoc->ifp_had_enobuf = 1; + SCTP_STAT_INCR(sctps_lowlevelerr); + } + if (from_where == 0) { + SCTP_STAT_INCR(sctps_lowlevelerrusr); + } + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + hbflag = 0; + /* error, could not output */ + if (error == EHOSTUNREACH) { + /* + * Destination went + * unreachable + * during this send + */ + sctp_move_chunks_from_net(stcb, net); + } + *reason_code = 7; + continue; + } else + asoc->ifp_had_enobuf = 0; + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + hbflag = 0; + /* + * increase the number we sent, if a + * cookie is sent we don't tell them + * any was sent out. 
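The "omtu" accounting above charges the AUTH chunk's length against the MTU only for the first chunk that requires authentication. A self-contained sketch of that bundling arithmetic, with an illustrative 28-byte AUTH length (8-byte header plus a 20-byte HMAC-SHA1 digest):

    #include <stdio.h>

    int
    main(void)
    {
        unsigned int mtu = 1452;
        unsigned int auth_len = 28;    /* 8-byte AUTH header + 20-byte SHA-1 */
        unsigned int sizes[] = { 120, 900, 600 };
        unsigned int i, omtu, to_out = 0;
        int have_auth = 0;

        for (i = 0; i < 3; i++) {
            omtu = have_auth ? 0 : auth_len;    /* charge AUTH only once */
            if (sizes[i] + omtu > mtu)
                break;                          /* packet full: flush it */
            have_auth = 1;
            mtu -= sizes[i] + omtu;
            to_out += sizes[i] + omtu;
        }
        printf("bundled=%u bytes, %u left\n", to_out, mtu);    /* 1048, 404 */
        return (0);
    }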
+ */ + outchain = endoutchain = NULL; + auth = NULL; + auth_offset = 0; + if (!no_out_cnt) + *num_out += ctl_cnt; + /* recalc a clean slate and setup */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + mtu = (net->mtu - SCTP_MIN_OVERHEAD); + } else { + mtu = (net->mtu - SCTP_MIN_V4_OVERHEAD); + } + to_out = 0; + no_fragmentflg = 1; + } + } + } + /************************/ + /* Control transmission */ + /************************/ + /* Now first lets go through the control queue */ + for (chk = TAILQ_FIRST(&asoc->control_send_queue); + chk; chk = nchk) { + nchk = TAILQ_NEXT(chk, sctp_next); + if (chk->whoTo != net) { + /* + * No, not sent to the network we are + * looking at + */ + continue; + } + if (chk->data == NULL) { + continue; + } + if (chk->sent != SCTP_DATAGRAM_UNSENT) { + /* + * It must be unsent. Cookies and ASCONF's + * hang around but there timers will force + * when marked for resend. + */ + continue; + } + /* + * if no AUTH is yet included and this chunk + * requires it, make sure to account for it. We + * don't apply the size until the AUTH chunk is + * actually added below in case there is no room for + * this chunk. NOTE: we overload the use of "omtu" + * here + */ + if ((auth == NULL) && + sctp_auth_is_required_chunk(chk->rec.chunk_id.id, + stcb->asoc.peer_auth_chunks)) { + omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } else + omtu = 0; + /* Here we do NOT factor the r_mtu */ + if ((chk->send_size <= (int)(mtu - omtu)) || + (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { + /* + * We probably should glom the mbuf chain + * from the chk->data for control but the + * problem is it becomes yet one more level + * of tracking to do if for some reason + * output fails. Then I have got to + * reconstruct the merged control chain.. el + * yucko.. for now we take the easy way and + * do the copy + */ + /* + * Add an AUTH chunk, if chunk requires it + * save the offset into the chain for AUTH + */ + if ((auth == NULL) && + (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, + stcb->asoc.peer_auth_chunks))) { + outchain = sctp_add_auth_chunk(outchain, + &endoutchain, + &auth, + &auth_offset, + stcb, + chk->rec.chunk_id.id); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } + outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, + (int)chk->rec.chunk_id.can_take_data, + chk->send_size, chk->copy_by_ref); + if (outchain == NULL) { + *reason_code = 8; + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + /* update our MTU size */ + if (mtu > (chk->send_size + omtu)) + mtu -= (chk->send_size + omtu); + else + mtu = 0; + to_out += (chk->send_size + omtu); + /* Do clear IP_DF ? 
*/ + if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { + no_fragmentflg = 0; + } + if (chk->rec.chunk_id.can_take_data) + chk->data = NULL; + /* Mark things to be removed, if needed */ + if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || + (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ + (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || + (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || + (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || + (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || + (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || + (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || + (chk->rec.chunk_id.id == SCTP_ECN_CWR) || + (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || + (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { + + if (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) { + hbflag = 1; + /* + * JRS 5/14/07 - Set the + * flag to say a heartbeat + * is being sent. + */ + pf_hbflag = 1; + } + /* remove these chunks at the end */ + if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || + (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) { + /* turn off the timer */ + if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, + inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1); + } + } + ctl_cnt++; + } else { + /* + * Other chunks, since they have + * timers running (i.e. COOKIE) we + * just "trust" that it gets sent or + * retransmitted. + */ + ctl_cnt++; + if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { + cookie = 1; + no_out_cnt = 1; + } + chk->sent = SCTP_DATAGRAM_SENT; + chk->snd_count++; + } + if (mtu == 0) { + /* + * Ok we are out of room but we can + * output without effecting the + * flight size since this little guy + * is a control only packet. + */ + if (asconf) { + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); + /* + * do NOT clear the asconf + * flag as it is used to do + * appropriate source + * address selection. + */ + } + if (cookie) { + sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); + cookie = 0; + } + if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, + (struct sockaddr *)&net->ro._l_addr, + outchain, + auth_offset, auth, + stcb->asoc.authinfo.active_keyid, + no_fragmentflg, 0, NULL, asconf, + inp->sctp_lport, stcb->rport, + htonl(stcb->asoc.peer_vtag), + net->port, so_locked, NULL))) { + if (error == ENOBUFS) { + asoc->ifp_had_enobuf = 1; + SCTP_STAT_INCR(sctps_lowlevelerr); + } + if (from_where == 0) { + SCTP_STAT_INCR(sctps_lowlevelerrusr); + } + /* error, could not output */ + if (hbflag) { + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + hbflag = 0; + } + if (error == EHOSTUNREACH) { + /* + * Destination went + * unreachable + * during this send + */ + sctp_move_chunks_from_net(stcb, net); + } + *reason_code = 7; + continue; + } else + asoc->ifp_had_enobuf = 0; + /* Only HB or ASCONF advances time */ + if (hbflag) { + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + hbflag = 0; + } + /* + * increase the number we sent, if a + * cookie is sent we don't tell them + * any was sent out. 
+ */ + outchain = endoutchain = NULL; + auth = NULL; + auth_offset = 0; + if (!no_out_cnt) + *num_out += ctl_cnt; + /* recalc a clean slate and setup */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + mtu = (net->mtu - SCTP_MIN_OVERHEAD); + } else { + mtu = (net->mtu - SCTP_MIN_V4_OVERHEAD); + } + to_out = 0; + no_fragmentflg = 1; + } + } + } + /* JRI: if dest is in PF state, do not send data to it */ + if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + (net->dest_state & SCTP_ADDR_PF)) { + goto no_data_fill; + } + if (net->flight_size >= net->cwnd) { + goto no_data_fill; + } + if ((asoc->sctp_cmt_on_off == 1) && + (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_RECV_BUFFER_SPLITTING) && + (net->flight_size > max_rwnd_per_dest)) { + goto no_data_fill; + } + /* + * We need a specific accounting for the usage of the send + * buffer. We also need to check the number of messages per + * net. For now, this is better than nothing and it disabled + * by default... + */ + if ((asoc->sctp_cmt_on_off == 1) && + (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_SEND_BUFFER_SPLITTING) && + (max_send_per_dest > 0) && + (net->flight_size > max_send_per_dest)) { + goto no_data_fill; + } + /*********************/ + /* Data transmission */ + /*********************/ + /* + * if AUTH for DATA is required and no AUTH has been added + * yet, account for this in the mtu now... if no data can be + * bundled, this adjustment won't matter anyways since the + * packet will be going out... + */ + data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, + stcb->asoc.peer_auth_chunks); + if (data_auth_reqd && (auth == NULL)) { + mtu -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } + /* now lets add any data within the MTU constraints */ + switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { + case AF_INET: + if (net->mtu > (sizeof(struct ip) + sizeof(struct sctphdr))) + omtu = net->mtu - (sizeof(struct ip) + sizeof(struct sctphdr)); + else + omtu = 0; + break; +#ifdef INET6 + case AF_INET6: + if (net->mtu > (sizeof(struct ip6_hdr) + sizeof(struct sctphdr))) + omtu = net->mtu - (sizeof(struct ip6_hdr) + sizeof(struct sctphdr)); + else + omtu = 0; + break; +#endif + default: + /* TSNH */ + omtu = 0; + break; + } + if ((((asoc->state & SCTP_STATE_OPEN) == SCTP_STATE_OPEN) && + (skip_data_for_this_net == 0)) || + (cookie)) { + for (chk = TAILQ_FIRST(&asoc->send_queue); chk; chk = nchk) { + if (no_data_chunks) { + /* let only control go out */ + *reason_code = 1; + break; + } + if (net->flight_size >= net->cwnd) { + /* skip this net, no room for data */ + *reason_code = 2; + break; + } + nchk = TAILQ_NEXT(chk, sctp_next); + if ((chk->whoTo != NULL) && + (chk->whoTo != net)) { + /* Don't send the chunk on this net */ + continue; + } + if ((chk->send_size > omtu) && ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) == 0)) { + /*- + * strange, we have a chunk that is + * to big for its destination and + * yet no fragment ok flag. + * Something went wrong when the + * PMTU changed...we did not mark + * this chunk for some reason?? I + * will fix it here by letting IP + * fragment it for now and printing + * a warning. This really should not + * happen ... 
+ */ + SCTP_PRINTF("Warning chunk of %d bytes > mtu:%d and yet PMTU disc missed\n", + chk->send_size, mtu); + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; + } + if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && + ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) == SCTP_STATE_SHUTDOWN_PENDING)) { + struct sctp_data_chunk *dchkh; + + dchkh = mtod(chk->data, struct sctp_data_chunk *); + dchkh->ch.chunk_flags |= SCTP_DATA_SACK_IMMEDIATELY; + } + if (((chk->send_size <= mtu) && (chk->send_size <= r_mtu)) || + ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) && (chk->send_size <= asoc->peers_rwnd))) { + /* ok we will add this one */ + + /* + * Add an AUTH chunk, if chunk + * requires it, save the offset into + * the chain for AUTH + */ + if (data_auth_reqd) { + if (auth == NULL) { + outchain = sctp_add_auth_chunk(outchain, + &endoutchain, + &auth, + &auth_offset, + stcb, + SCTP_DATA); + auth_keyid = chk->auth_keyid; + override_ok = 0; + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } else if (override_ok) { + /* + * use this data's + * keyid + */ + auth_keyid = chk->auth_keyid; + override_ok = 0; + } else if (auth_keyid != chk->auth_keyid) { + /* + * different keyid, + * so done bundling + */ + break; + } + } + outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, 0, + chk->send_size, chk->copy_by_ref); + if (outchain == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT3, "No memory?\n"); + if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + } + *reason_code = 3; + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + /* upate our MTU size */ + /* Do clear IP_DF ? */ + if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { + no_fragmentflg = 0; + } + /* unsigned subtraction of mtu */ + if (mtu > chk->send_size) + mtu -= chk->send_size; + else + mtu = 0; + /* unsigned subtraction of r_mtu */ + if (r_mtu > chk->send_size) + r_mtu -= chk->send_size; + else + r_mtu = 0; + + to_out += chk->send_size; + if ((to_out > mx_mtu) && no_fragmentflg) { +#ifdef INVARIANTS + panic("Exceeding mtu of %d out size is %d", mx_mtu, to_out); +#else + SCTP_PRINTF("Exceeding mtu of %d out size is %d\n", + mx_mtu, to_out); +#endif + } + chk->window_probe = 0; + data_list[bundle_at++] = chk; + if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { + mtu = 0; + break; + } + if (chk->sent == SCTP_DATAGRAM_UNSENT) { + if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { + SCTP_STAT_INCR_COUNTER64(sctps_outorderchunks); + } else { + SCTP_STAT_INCR_COUNTER64(sctps_outunorderchunks); + } + if (((chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) == SCTP_DATA_LAST_FRAG) && + ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0)) + /* + * Count number of + * user msg's that + * were fragmented + * we do this by + * counting when we + * see a LAST + * fragment only. + */ + SCTP_STAT_INCR_COUNTER64(sctps_fragusrmsgs); + } + if ((mtu == 0) || (r_mtu == 0) || (one_chunk)) { + if ((one_chunk) && (stcb->asoc.total_flight == 0)) { + data_list[0]->window_probe = 1; + net->window_probe = 1; + } + break; + } + } else { + /* + * Must be sent in order of the + * TSN's (on a network) + */ + break; + } + } /* for (chunk gather loop for this net) */ + } /* if asoc.state OPEN */ +no_data_fill: + /* Is there something to send for this destination? 
*/ + if (outchain) { + /* We may need to start a control timer or two */ + if (asconf) { + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, + stcb, net); + /* + * do NOT clear the asconf flag as it is + * used to do appropriate source address + * selection. + */ + } + if (cookie) { + sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); + cookie = 0; + } + /* must start a send timer if data is being sent */ + if (bundle_at && (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) { + /* + * no timer running on this destination + * restart it. + */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + } else if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + pf_hbflag && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF) && + (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) { + /* + * JRS 5/14/07 - If a HB has been sent to a + * PF destination and no T3 timer is + * currently running, start the T3 timer to + * track the HBs that were sent. + */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + } + /* Now send it, if there is anything to send :> */ + if ((error = sctp_lowlevel_chunk_output(inp, + stcb, + net, + (struct sockaddr *)&net->ro._l_addr, + outchain, + auth_offset, + auth, + auth_keyid, + no_fragmentflg, + bundle_at, + data_list[0], + asconf, + inp->sctp_lport, stcb->rport, + htonl(stcb->asoc.peer_vtag), + net->port, so_locked, NULL))) { + /* error, we could not output */ + if (error == ENOBUFS) { + SCTP_STAT_INCR(sctps_lowlevelerr); + asoc->ifp_had_enobuf = 1; + } + if (from_where == 0) { + SCTP_STAT_INCR(sctps_lowlevelerrusr); + } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); + if (hbflag) { + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + hbflag = 0; + } + if (error == EHOSTUNREACH) { + /* + * Destination went unreachable + * during this send + */ + sctp_move_chunks_from_net(stcb, net); + } + *reason_code = 6; + /*- + * I add this line to be paranoid. As far as + * I can tell the continue, takes us back to + * the top of the for, but just to make sure + * I will reset these again here. + */ + ctl_cnt = bundle_at = 0; + continue; /* This takes us back to the + * for() for the nets. 
*/ + } else { + asoc->ifp_had_enobuf = 0; + } + outchain = endoutchain = NULL; + auth = NULL; + auth_offset = 0; + if (bundle_at || hbflag) { + /* For data/asconf and hb set time */ + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); + *now_filled = 1; + *now = net->last_sent_time; + } else { + net->last_sent_time = *now; + } + } + if (!no_out_cnt) { + *num_out += (ctl_cnt + bundle_at); + } + if (bundle_at) { + /* setup for a RTO measurement */ + tsns_sent = data_list[0]->rec.data.TSN_seq; + /* fill time if not already filled */ + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); + *now_filled = 1; + *now = asoc->time_last_sent; + } else { + asoc->time_last_sent = *now; + } + data_list[0]->do_rtt = 1; + SCTP_STAT_INCR_BY(sctps_senddata, bundle_at); + sctp_clean_up_datalist(stcb, asoc, data_list, bundle_at, net); + if (SCTP_BASE_SYSCTL(sctp_early_fr)) { + if (net->flight_size < net->cwnd) { + /* start or restart it */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2); + } + SCTP_STAT_INCR(sctps_earlyfrstrout); + sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net); + } else { + /* stop it if its running */ + if (SCTP_OS_TIMER_PENDING(&net->fr_timer.timer)) { + SCTP_STAT_INCR(sctps_earlyfrstpout); + sctp_timer_stop(SCTP_TIMER_TYPE_EARLYFR, inp, stcb, net, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3); + } + } + } + } + if (one_chunk) { + break; + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_SEND); + } + } + if (old_start_at == NULL) { + old_start_at = start_at; + start_at = TAILQ_FIRST(&asoc->nets); + if (old_start_at) + goto again_one_more_time; + } + /* + * At the end there should be no NON timed chunks hanging on this + * queue. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, *num_out, SCTP_CWND_LOG_FROM_SEND); + } + if ((*num_out == 0) && (*reason_code == 0)) { + *reason_code = 4; + } else { + *reason_code = 5; + } + sctp_clean_up_ctl(stcb, asoc); + return (0); +} + +void +sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) +{ + /*- + * Prepend a OPERATIONAL_ERROR chunk header and put on the end of + * the control chunk queue. 
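For reference, the reason codes set by sctp_med_chunk_output above are: 9, nothing queued anywhere at entry; 1, the peer's rwnd is closed or only control may go out; 2, this net's cwnd is full; 3 and 8, out of mbufs (8 also covers queues that drained after the fill pass); 7, a control or ASCONF send failed; 6, a data send failed; 4, the loop finished with nothing sent; 5, the loop finished with something sent.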
+ */ + struct sctp_chunkhdr *hdr; + struct sctp_tmit_chunk *chk; + struct mbuf *mat; + + SCTP_TCB_LOCK_ASSERT(stcb); + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(op_err); + return; + } + chk->copy_by_ref = 0; + SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_DONTWAIT); + if (op_err == NULL) { + sctp_free_a_chunk(stcb, chk); + return; + } + chk->send_size = 0; + mat = op_err; + while (mat != NULL) { + chk->send_size += SCTP_BUF_LEN(mat); + mat = SCTP_BUF_NEXT(mat); + } + chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->data = op_err; + chk->whoTo = chk->asoc->primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + hdr = mtod(op_err, struct sctp_chunkhdr *); + hdr->chunk_type = SCTP_OPERATION_ERROR; + hdr->chunk_flags = 0; + hdr->chunk_length = htons(chk->send_size); + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, + chk, + sctp_next); + chk->asoc->ctrl_queue_cnt++; +} + +int +sctp_send_cookie_echo(struct mbuf *m, + int offset, + struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + /*- + * pull out the cookie and put it at the front of the control chunk + * queue. + */ + int at; + struct mbuf *cookie; + struct sctp_paramhdr parm, *phdr; + struct sctp_chunkhdr *hdr; + struct sctp_tmit_chunk *chk; + uint16_t ptype, plen; + + /* First find the cookie in the param area */ + cookie = NULL; + at = offset + sizeof(struct sctp_init_chunk); + + SCTP_TCB_LOCK_ASSERT(stcb); + do { + phdr = sctp_get_next_param(m, at, &parm, sizeof(parm)); + if (phdr == NULL) { + return (-3); + } + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + if (ptype == SCTP_STATE_COOKIE) { + int pad; + + /* found the cookie */ + if ((pad = (plen % 4))) { + plen += 4 - pad; + } + cookie = SCTP_M_COPYM(m, at, plen, M_DONTWAIT); + if (cookie == NULL) { + /* No memory */ + return (-2); + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = cookie; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + break; + } + at += SCTP_SIZE32(plen); + } while (phdr); + if (cookie == NULL) { + /* Did not find the cookie */ + return (-3); + } + /* ok, we got the cookie lets change it into a cookie echo chunk */ + + /* first the change from param to cookie */ + hdr = mtod(cookie, struct sctp_chunkhdr *); + hdr->chunk_type = SCTP_COOKIE_ECHO; + hdr->chunk_flags = 0; + /* get the chunk stuff now and place it in the FRONT of the queue */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(cookie); + return (-5); + } + chk->copy_by_ref = 0; + chk->send_size = plen; + chk->rec.chunk_id.id = SCTP_COOKIE_ECHO; + chk->rec.chunk_id.can_take_data = 0; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = CHUNK_FLAGS_FRAGMENT_OK; + chk->asoc = &stcb->asoc; + chk->data = cookie; + chk->whoTo = chk->asoc->primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + TAILQ_INSERT_HEAD(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + return (0); +} + +void +sctp_send_heartbeat_ack(struct sctp_tcb *stcb, + struct mbuf *m, + int offset, + int chk_length, + struct sctp_nets *net) +{ + /* + * take a HB request and make it into a HB ack and send it. 
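The parameter walk above advances by the TLV length rounded up to a 4-byte boundary (SCTP_SIZE32), stopping at the STATE-COOKIE parameter (type 7). A self-contained sketch with an illustrative parameter buffer:

    #include <stdint.h>
    #include <stdio.h>

    #define SIZE32(x) (((x) + 3u) & ~3u)    /* mirrors SCTP_SIZE32 */

    int
    main(void)
    {
        /* two TLVs: type 9 len 5 (3 pad bytes), type 7 (STATE-COOKIE) len 6 */
        uint8_t buf[] = {
            0, 9, 0, 5, 0xaa, 0, 0, 0,
            0, 7, 0, 6, 0xbb, 0xbb, 0, 0
        };
        unsigned int at = 0;

        while (at + 4 <= sizeof(buf)) {
            uint16_t type = (buf[at] << 8) | buf[at + 1];
            uint16_t plen = (buf[at + 2] << 8) | buf[at + 3];

            if (plen < 4)
                break;        /* malformed parameter */
            printf("param type %u len %u at %u\n", type, plen, at);
            if (type == 7)    /* SCTP_STATE_COOKIE */
                break;
            at += SIZE32(plen);
        }
        return (0);
    }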
+ */ + struct mbuf *outchain; + struct sctp_chunkhdr *chdr; + struct sctp_tmit_chunk *chk; + + + if (net == NULL) + /* must have a net pointer */ + return; + + outchain = SCTP_M_COPYM(m, offset, chk_length, M_DONTWAIT); + if (outchain == NULL) { + /* gak out of memory */ + return; + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = outchain; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + chdr = mtod(outchain, struct sctp_chunkhdr *); + chdr->chunk_type = SCTP_HEARTBEAT_ACK; + chdr->chunk_flags = 0; + if (chk_length % 4) { + /* need pad */ + uint32_t cpthis = 0; + int padlen; + + padlen = 4 - (chk_length % 4); + m_copyback(outchain, chk_length, padlen, (caddr_t)&cpthis); + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(outchain); + return; + } + chk->copy_by_ref = 0; + chk->send_size = chk_length; + chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->data = outchain; + chk->whoTo = net; + atomic_add_int(&chk->whoTo->ref_count, 1); + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; +} + +void +sctp_send_cookie_ack(struct sctp_tcb *stcb) +{ + /* formulate and queue a cookie-ack back to sender */ + struct mbuf *cookie_ack; + struct sctp_chunkhdr *hdr; + struct sctp_tmit_chunk *chk; + + cookie_ack = NULL; + SCTP_TCB_LOCK_ASSERT(stcb); + + cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER); + if (cookie_ack == NULL) { + /* no mbuf's */ + return; + } + SCTP_BUF_RESV_UF(cookie_ack, SCTP_MIN_OVERHEAD); + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(cookie_ack); + return; + } + chk->copy_by_ref = 0; + chk->send_size = sizeof(struct sctp_chunkhdr); + chk->rec.chunk_id.id = SCTP_COOKIE_ACK; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->data = cookie_ack; + if (chk->asoc->last_control_chunk_from != NULL) { + chk->whoTo = chk->asoc->last_control_chunk_from; + } else { + chk->whoTo = chk->asoc->primary_destination; + } + atomic_add_int(&chk->whoTo->ref_count, 1); + hdr = mtod(cookie_ack, struct sctp_chunkhdr *); + hdr->chunk_type = SCTP_COOKIE_ACK; + hdr->chunk_flags = 0; + hdr->chunk_length = htons(chk->send_size); + SCTP_BUF_LEN(cookie_ack) = chk->send_size; + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + return; +} + + +void +sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /* formulate and queue a SHUTDOWN-ACK back to the sender */ + struct mbuf *m_shutdown_ack; + struct sctp_shutdown_ack_chunk *ack_cp; + struct sctp_tmit_chunk *chk; + + m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + if (m_shutdown_ack == NULL) { + /* no mbuf's */ + return; + } + SCTP_BUF_RESV_UF(m_shutdown_ack, SCTP_MIN_OVERHEAD); + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(m_shutdown_ack); + return; + } + chk->copy_by_ref = 0; + chk->send_size = sizeof(struct sctp_chunkhdr); + chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = 
SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->data = m_shutdown_ack; + chk->whoTo = net; + atomic_add_int(&net->ref_count, 1); + + ack_cp = mtod(m_shutdown_ack, struct sctp_shutdown_ack_chunk *); + ack_cp->ch.chunk_type = SCTP_SHUTDOWN_ACK; + ack_cp->ch.chunk_flags = 0; + ack_cp->ch.chunk_length = htons(chk->send_size); + SCTP_BUF_LEN(m_shutdown_ack) = chk->send_size; + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + return; +} + +void +sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + /* formulate and queue a SHUTDOWN to the sender */ + struct mbuf *m_shutdown; + struct sctp_shutdown_chunk *shutdown_cp; + struct sctp_tmit_chunk *chk; + + m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + if (m_shutdown == NULL) { + /* no mbuf's */ + return; + } + SCTP_BUF_RESV_UF(m_shutdown, SCTP_MIN_OVERHEAD); + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(m_shutdown); + return; + } + chk->copy_by_ref = 0; + chk->send_size = sizeof(struct sctp_shutdown_chunk); + chk->rec.chunk_id.id = SCTP_SHUTDOWN; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = 0; + chk->asoc = &stcb->asoc; + chk->data = m_shutdown; + chk->whoTo = net; + atomic_add_int(&net->ref_count, 1); + + shutdown_cp = mtod(m_shutdown, struct sctp_shutdown_chunk *); + shutdown_cp->ch.chunk_type = SCTP_SHUTDOWN; + shutdown_cp->ch.chunk_flags = 0; + shutdown_cp->ch.chunk_length = htons(chk->send_size); + shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn); + SCTP_BUF_LEN(m_shutdown) = chk->send_size; + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + return; +} + +void +sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked) +{ + /* + * formulate and queue an ASCONF to the peer. ASCONF parameters + * should be queued on the assoc queue. + */ + struct sctp_tmit_chunk *chk; + struct mbuf *m_asconf; + int len; + + SCTP_TCB_LOCK_ASSERT(stcb); + + if ((!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) && + (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS))) { + /* can't send a new one if there is one in flight already */ + return; + } + /* compose an ASCONF chunk, maximum length is PMTU */ + m_asconf = sctp_compose_asconf(stcb, &len, addr_locked); + if (m_asconf == NULL) { + return; + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + sctp_m_freem(m_asconf); + return; + } + chk->copy_by_ref = 0; + chk->data = m_asconf; + chk->send_size = len; + chk->rec.chunk_id.id = SCTP_ASCONF; + chk->rec.chunk_id.can_take_data = 0; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags = CHUNK_FLAGS_FRAGMENT_OK; + chk->asoc = &stcb->asoc; + chk->whoTo = net; + atomic_add_int(&chk->whoTo->ref_count, 1); + TAILQ_INSERT_TAIL(&chk->asoc->asconf_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + return; +} + +void +sctp_send_asconf_ack(struct sctp_tcb *stcb) +{ + /* + * formulate and queue a asconf-ack back to sender. the asconf-ack + * must be stored in the tcb. 
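sctp_send_cookie_ack, sctp_send_shutdown_ack and sctp_send_shutdown above all follow one shape: allocate a chunk, fill in id/size/flags, take a reference on the destination, and append to control_send_queue. A condensed sketch of that shared pattern (types and names illustrative):

    struct dest { unsigned int ref_count; };

    struct ctlchunk {
        int id, sent, snd_count, flags;
        unsigned int send_size;
        struct dest *who_to;    /* holds a reference */
    };

    void
    queue_control_chunk(struct ctlchunk *chk, struct dest *net, int id,
        unsigned int size)
    {
        chk->id = id;
        chk->send_size = size;
        chk->sent = 0;        /* SCTP_DATAGRAM_UNSENT */
        chk->snd_count = 0;
        chk->flags = 0;
        chk->who_to = net;
        net->ref_count++;     /* the queued chunk pins its destination */
        /* caller then appends chk to control_send_queue and bumps
         * ctrl_queue_cnt, as each sender above does */
    }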
+ */ + struct sctp_tmit_chunk *chk; + struct sctp_asconf_ack *ack, *latest_ack; + struct mbuf *m_ack, *m; + struct sctp_nets *net = NULL; + + SCTP_TCB_LOCK_ASSERT(stcb); + /* Get the latest ASCONF-ACK */ + latest_ack = TAILQ_LAST(&stcb->asoc.asconf_ack_sent, sctp_asconf_ackhead); + if (latest_ack == NULL) { + return; + } + if (latest_ack->last_sent_to != NULL && + latest_ack->last_sent_to == stcb->asoc.last_control_chunk_from) { + /* we're doing a retransmission */ + net = sctp_find_alternate_net(stcb, stcb->asoc.last_control_chunk_from, 0); + if (net == NULL) { + /* no alternate */ + if (stcb->asoc.last_control_chunk_from == NULL) + net = stcb->asoc.primary_destination; + else + net = stcb->asoc.last_control_chunk_from; + } + } else { + /* normal case */ + if (stcb->asoc.last_control_chunk_from == NULL) + net = stcb->asoc.primary_destination; + else + net = stcb->asoc.last_control_chunk_from; + } + latest_ack->last_sent_to = net; + + TAILQ_FOREACH(ack, &stcb->asoc.asconf_ack_sent, next) { + if (ack->data == NULL) { + continue; + } + /* copy the asconf_ack */ + m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_DONTWAIT); + if (m_ack == NULL) { + /* couldn't copy it */ + return; + } +#ifdef SCTP_MBUF_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + struct mbuf *mat; + + mat = m_ack; + while (mat) { + if (SCTP_BUF_IS_EXTENDED(mat)) { + sctp_log_mb(mat, SCTP_MBUF_ICOPY); + } + mat = SCTP_BUF_NEXT(mat); + } + } +#endif + + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* no memory */ + if (m_ack) + sctp_m_freem(m_ack); + return; + } + chk->copy_by_ref = 0; + + chk->whoTo = net; + chk->data = m_ack; + chk->send_size = 0; + /* Get size */ + m = m_ack; + chk->send_size = ack->len; + chk->rec.chunk_id.id = SCTP_ASCONF_ACK; + chk->rec.chunk_id.can_take_data = 1; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; /* XXX */ + chk->asoc = &stcb->asoc; + atomic_add_int(&chk->whoTo->ref_count, 1); + + TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); + chk->asoc->ctrl_queue_cnt++; + } + return; +} + + +static int +sctp_chunk_retransmission(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_association *asoc, + int *cnt_out, struct timeval *now, int *now_filled, int *fr_done, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + /*- + * send out one MTU of retransmission. If fast_retransmit is + * happening we ignore the cwnd. Otherwise we obey the cwnd and + * rwnd. For a Cookie or Asconf in the control chunk queue we + * retransmit them by themselves. + * + * For data chunks we will pick out the lowest TSN's in the sent_queue + * marked for resend and bundle them all together (up to a MTU of + * destination). The address to send to should have been + * selected/changed where the retransmission was marked (i.e. in FR + * or t3-timeout routines). 
+ */ + struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; + struct sctp_tmit_chunk *chk, *fwd; + struct mbuf *m, *endofchain; + struct sctp_nets *net = NULL; + uint32_t tsns_sent = 0; + int no_fragmentflg, bundle_at, cnt_thru; + unsigned int mtu; + int error, i, one_chunk, fwd_tsn, ctl_cnt, tmr_started; + struct sctp_auth_chunk *auth = NULL; + uint32_t auth_offset = 0; + uint16_t auth_keyid; + int override_ok = 1; + int data_auth_reqd = 0; + uint32_t dmtu = 0; + + SCTP_TCB_LOCK_ASSERT(stcb); + tmr_started = ctl_cnt = bundle_at = error = 0; + no_fragmentflg = 1; + fwd_tsn = 0; + *cnt_out = 0; + fwd = NULL; + endofchain = m = NULL; + auth_keyid = stcb->asoc.authinfo.active_keyid; +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xC3, 1); +#endif + if ((TAILQ_EMPTY(&asoc->sent_queue)) && + (TAILQ_EMPTY(&asoc->control_send_queue))) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "SCTP hits empty queue with cnt set to %d?\n", + asoc->sent_queue_retran_cnt); + asoc->sent_queue_cnt = 0; + asoc->sent_queue_cnt_removeable = 0; + /* send back 0/0 so we enter normal transmission */ + *cnt_out = 0; + return (0); + } + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if ((chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) || + (chk->rec.chunk_id.id == SCTP_STREAM_RESET) || + (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)) { + if (chk->sent != SCTP_DATAGRAM_RESEND) { + continue; + } + if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { + if (chk != asoc->str_reset) { + /* + * not eligible for retran if it's + * not ours + */ + continue; + } + } + ctl_cnt++; + if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { + fwd_tsn = 1; + fwd = chk; + } + /* + * Add an AUTH chunk, if chunk requires it save the + * offset into the chain for AUTH + */ + if ((auth == NULL) && + (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, + stcb->asoc.peer_auth_chunks))) { + m = sctp_add_auth_chunk(m, &endofchain, + &auth, &auth_offset, + stcb, + chk->rec.chunk_id.id); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } + m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); + break; + } + } + one_chunk = 0; + cnt_thru = 0; + /* do we have control chunks to retransmit? */ + if (m != NULL) { + /* Start a timer no matter if we succeed or fail */ + if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { + sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo); + } else if (chk->rec.chunk_id.id == SCTP_ASCONF) + sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, chk->whoTo); + chk->snd_count++; /* update our count */ + if ((error = sctp_lowlevel_chunk_output(inp, stcb, chk->whoTo, + (struct sockaddr *)&chk->whoTo->ro._l_addr, m, + auth_offset, auth, stcb->asoc.authinfo.active_keyid, + no_fragmentflg, 0, NULL, 0, + inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), + chk->whoTo->port, so_locked, NULL))) { + SCTP_STAT_INCR(sctps_lowlevelerr); + return (error); + } + m = endofchain = NULL; + auth = NULL; + auth_offset = 0; + /* + * We don't want to mark the net->sent time here since we + * use this for HB, and retrans cannot measure RTT + */ + /* (void)SCTP_GETTIME_TIMEVAL(&chk->whoTo->last_sent_time); */ + *cnt_out += 1; + chk->sent = SCTP_DATAGRAM_SENT; + sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt); + if (fwd_tsn == 0) { + return (0); + } else { + /* Clean up the fwd-tsn list */ + sctp_clean_up_ctl(stcb, asoc); + return (0); + } + } + /* + * Ok, it is just data retransmission we need to do or that and a + * fwd-tsn with it all. 
+ */ + if (TAILQ_EMPTY(&asoc->sent_queue)) { + return (SCTP_RETRAN_DONE); + } + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT)) { + /* not yet open, resend the cookie and that is it */ + return (1); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(20, inp, stcb, NULL); +#endif + data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks); + TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { + if (chk->sent != SCTP_DATAGRAM_RESEND) { + /* No, not sent to this net or not ready for rtx */ + continue; + } + if (chk->data == NULL) { + printf("TSN:%x chk->snd_count:%d chk->sent:%d can't retran - no data\n", + chk->rec.data.TSN_seq, chk->snd_count, chk->sent); + continue; + } + if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) && + (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) { + /* Gak, we have exceeded max unlucky retran, abort! */ + SCTP_PRINTF("Gak, chk->snd_count:%d >= max:%d - send abort\n", + chk->snd_count, + SCTP_BASE_SYSCTL(sctp_max_retran_chunk)); + atomic_add_int(&stcb->asoc.refcnt, 1); + sctp_abort_an_association(stcb->sctp_ep, stcb, 0, NULL, so_locked); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (SCTP_RETRAN_EXIT); + } + /* pick up the net */ + net = chk->whoTo; + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + mtu = (net->mtu - SCTP_MIN_OVERHEAD); + } else { + mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; + } + + if ((asoc->peers_rwnd < mtu) && (asoc->total_flight > 0)) { + /* No room in peers rwnd */ + uint32_t tsn; + + tsn = asoc->last_acked_seq + 1; + if (tsn == chk->rec.data.TSN_seq) { + /* + * we make a special exception for this + * case. The peer has no rwnd but is missing + * the lowest chunk.. which is probably what + * is holding up the rwnd. + */ + goto one_chunk_around; + } + return (1); + } +one_chunk_around: + if (asoc->peers_rwnd < mtu) { + one_chunk = 1; + if ((asoc->peers_rwnd == 0) && + (asoc->total_flight == 0)) { + chk->window_probe = 1; + chk->whoTo->window_probe = 1; + } + } +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xC3, 2); +#endif + bundle_at = 0; + m = NULL; + net->fast_retran_ip = 0; + if (chk->rec.data.doing_fast_retransmit == 0) { + /* + * if no FR in progress skip destination that have + * flight_size > cwnd. + */ + if (net->flight_size >= net->cwnd) { + continue; + } + } else { + /* + * Mark the destination net to have FR recovery + * limits put on it. + */ + *fr_done = 1; + net->fast_retran_ip = 1; + } + + /* + * if no AUTH is yet included and this chunk requires it, + * make sure to account for it. We don't apply the size + * until the AUTH chunk is actually added below in case + * there is no room for this chunk. 
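+ *
+ * E.g. if DATA must be authenticated with HMAC-SHA1,
+ * sctp_get_auth_chunk_len() comes to roughly 28 bytes (an 8 byte
+ * AUTH chunk header plus a 20 byte digest), so a chunk is only
+ * bundled when send_size <= mtu - 28.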
+ */ + if (data_auth_reqd && (auth == NULL)) { + dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } else + dmtu = 0; + + if ((chk->send_size <= (mtu - dmtu)) || + (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { + /* ok we will add this one */ + if (data_auth_reqd) { + if (auth == NULL) { + m = sctp_add_auth_chunk(m, + &endofchain, + &auth, + &auth_offset, + stcb, + SCTP_DATA); + auth_keyid = chk->auth_keyid; + override_ok = 0; + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } else if (override_ok) { + auth_keyid = chk->auth_keyid; + override_ok = 0; + } else if (chk->auth_keyid != auth_keyid) { + /* different keyid, so done bundling */ + break; + } + } + m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); + if (m == NULL) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + /* Do clear IP_DF ? */ + if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { + no_fragmentflg = 0; + } + /* upate our MTU size */ + if (mtu > (chk->send_size + dmtu)) + mtu -= (chk->send_size + dmtu); + else + mtu = 0; + data_list[bundle_at++] = chk; + if (one_chunk && (asoc->total_flight <= 0)) { + SCTP_STAT_INCR(sctps_windowprobed); + } + } + if (one_chunk == 0) { + /* + * now are there anymore forward from chk to pick + * up? + */ + fwd = TAILQ_NEXT(chk, sctp_next); + while (fwd) { + if (fwd->sent != SCTP_DATAGRAM_RESEND) { + /* Nope, not for retran */ + fwd = TAILQ_NEXT(fwd, sctp_next); + continue; + } + if (fwd->whoTo != net) { + /* Nope, not the net in question */ + fwd = TAILQ_NEXT(fwd, sctp_next); + continue; + } + if (data_auth_reqd && (auth == NULL)) { + dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } else + dmtu = 0; + if (fwd->send_size <= (mtu - dmtu)) { + if (data_auth_reqd) { + if (auth == NULL) { + m = sctp_add_auth_chunk(m, + &endofchain, + &auth, + &auth_offset, + stcb, + SCTP_DATA); + auth_keyid = fwd->auth_keyid; + override_ok = 0; + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } else if (override_ok) { + auth_keyid = fwd->auth_keyid; + override_ok = 0; + } else if (fwd->auth_keyid != auth_keyid) { + /* + * different keyid, + * so done bundling + */ + break; + } + } + m = sctp_copy_mbufchain(fwd->data, m, &endofchain, 0, fwd->send_size, fwd->copy_by_ref); + if (m == NULL) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + /* Do clear IP_DF ? */ + if (fwd->flags & CHUNK_FLAGS_FRAGMENT_OK) { + no_fragmentflg = 0; + } + /* upate our MTU size */ + if (mtu > (fwd->send_size + dmtu)) + mtu -= (fwd->send_size + dmtu); + else + mtu = 0; + data_list[bundle_at++] = fwd; + if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { + break; + } + fwd = TAILQ_NEXT(fwd, sctp_next); + } else { + /* can't fit so we are done */ + break; + } + } + } + /* Is there something to send for this destination? */ + if (m) { + /* + * No matter if we fail/or suceed we should start a + * timer. A failure is like a lost IP packet :-) + */ + if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + /* + * no timer running on this destination + * restart it. 
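+ *
+ * The timer is armed before the send is even attempted, so if
+ * sctp_lowlevel_chunk_output() fails (e.g. ENOBUFS) the chunk is
+ * simply retried on T3 expiry, just like a packet lost in the
+ * network.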
+ */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + tmr_started = 1; + } + /* Now lets send it, if there is anything to send :> */ + if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, + (struct sockaddr *)&net->ro._l_addr, m, + auth_offset, auth, auth_keyid, + no_fragmentflg, 0, NULL, 0, + inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), + net->port, so_locked, NULL))) { + /* error, we could not output */ + SCTP_STAT_INCR(sctps_lowlevelerr); + return (error); + } + m = endofchain = NULL; + auth = NULL; + auth_offset = 0; + /* For HB's */ + /* + * We don't want to mark the net->sent time here + * since this we use this for HB and retrans cannot + * measure RTT + */ + /* (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); */ + + /* For auto-close */ + cnt_thru++; + if (*now_filled == 0) { + (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); + *now = asoc->time_last_sent; + *now_filled = 1; + } else { + asoc->time_last_sent = *now; + } + *cnt_out += bundle_at; +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xC4, bundle_at); +#endif + if (bundle_at) { + tsns_sent = data_list[0]->rec.data.TSN_seq; + } + for (i = 0; i < bundle_at; i++) { + SCTP_STAT_INCR(sctps_sendretransdata); + data_list[i]->sent = SCTP_DATAGRAM_SENT; + /* + * When we have a revoked data, and we + * retransmit it, then we clear the revoked + * flag since this flag dictates if we + * subtracted from the fs + */ + if (data_list[i]->rec.data.chunk_was_revoked) { + /* Deflate the cwnd */ + data_list[i]->whoTo->cwnd -= data_list[i]->book_size; + data_list[i]->rec.data.chunk_was_revoked = 0; + } + data_list[i]->snd_count++; + sctp_ucount_decr(asoc->sent_queue_retran_cnt); + /* record the time */ + data_list[i]->sent_rcv_time = asoc->time_last_sent; + if (data_list[i]->book_size_scale) { + /* + * need to double the book size on + * this one + */ + data_list[i]->book_size_scale = 0; + /* + * Since we double the booksize, we + * must also double the output queue + * size, since this get shrunk when + * we free by this amount. + */ + atomic_add_int(&((asoc)->total_output_queue_size), data_list[i]->book_size); + data_list[i]->book_size *= 2; + + + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { + sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, + asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); + } + asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, + (uint32_t) (data_list[i]->send_size + + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND, + data_list[i]->whoTo->flight_size, + data_list[i]->book_size, + (uintptr_t) data_list[i]->whoTo, + data_list[i]->rec.data.TSN_seq); + } + sctp_flight_size_increase(data_list[i]); + sctp_total_flight_increase(stcb, data_list[i]); + if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { + /* SWS sender side engages */ + asoc->peers_rwnd = 0; + } + if ((i == 0) && + (data_list[i]->rec.data.doing_fast_retransmit)) { + SCTP_STAT_INCR(sctps_sendfastretrans); + if ((data_list[i] == TAILQ_FIRST(&asoc->sent_queue)) && + (tmr_started == 0)) { + /*- + * ok we just fast-retrans'd + * the lowest TSN, i.e the + * first on the list. In + * this case we want to give + * some more time to get a + * SACK back without a + * t3-expiring. 
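+ *
+ * E.g. with an RTO of one second the restart grants the peer a
+ * full second to SACK the fast-retransmitted TSN before T3 can
+ * expire and collapse the cwnd down to a single MTU.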
+ */ + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net, + SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + } + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_RESEND); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(21, inp, stcb, NULL); +#endif + } else { + /* None will fit */ + return (1); + } + if (asoc->sent_queue_retran_cnt <= 0) { + /* all done we have no more to retran */ + asoc->sent_queue_retran_cnt = 0; + break; + } + if (one_chunk) { + /* No more room in rwnd */ + return (1); + } + /* stop the for loop here. we sent out a packet */ + break; + } + return (0); +} + + +static int +sctp_timer_validation(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_association *asoc, + int ret) +{ + struct sctp_nets *net; + + /* Validate that a timer is running somewhere */ + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { + /* Here is a timer */ + return (ret); + } + } + SCTP_TCB_LOCK_ASSERT(stcb); + /* Gak, we did not have a timer somewhere */ + SCTPDBG(SCTP_DEBUG_OUTPUT3, "Deadlock avoided starting timer on a dest at retran\n"); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->primary_destination); + return (ret); +} + +void +sctp_chunk_output(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + int from_where, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + /*- + * Ok this is the generic chunk service queue. we must do the + * following: + * - See if there are retransmits pending, if so we must + * do these first. + * - Service the stream queue that is next, moving any + * message (note I must get a complete message i.e. + * FIRST/MIDDLE and LAST to the out queue in one pass) and assigning + * TSN's + * - Check to see if the cwnd/rwnd allows any output, if so we + * go ahead and fomulate and send the low level chunks. Making sure + * to combine any control in the control chunk queue also. + */ + struct sctp_association *asoc; + struct sctp_nets *net; + int error = 0, num_out = 0, tot_out = 0, ret = 0, reason_code = 0, + burst_cnt = 0, burst_limit = 0; + struct timeval now; + int now_filled = 0; + int nagle_on = 0; + int frag_point = sctp_get_frag_point(stcb, &stcb->asoc); + int un_sent = 0; + int fr_done, tot_frs = 0; + + asoc = &stcb->asoc; + if (from_where == SCTP_OUTPUT_FROM_USR_SEND) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) { + nagle_on = 0; + } else { + nagle_on = 1; + } + } + SCTP_TCB_LOCK_ASSERT(stcb); + + un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); + + if ((un_sent <= 0) && + (TAILQ_EMPTY(&asoc->control_send_queue)) && + (TAILQ_EMPTY(&asoc->asconf_send_queue)) && + (asoc->sent_queue_retran_cnt == 0)) { + /* Nothing to do unless there is something to be sent left */ + return; + } + /* + * Do we have something to send, data or control AND a sack timer + * running, if so piggy-back the sack. + */ + if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { + sctp_send_sack(stcb); + (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); + } + while (asoc->sent_queue_retran_cnt) { + /*- + * Ok, it is retransmission time only, we send out only ONE + * packet with a single call off to the retran code. + */ + if (from_where == SCTP_OUTPUT_FROM_COOKIE_ACK) { + /*- + * Special hook for handling cookiess discarded + * by peer that carried data. 
Send cookie-ack only + * and then the next call will get the retransmissions. + */ + (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, + from_where, + &now, &now_filled, frag_point, so_locked); + return; + } else if (from_where != SCTP_OUTPUT_FROM_HB_TMR) { + /* if it's not from an HB then do it */ + fr_done = 0; + ret = sctp_chunk_retransmission(inp, stcb, asoc, &num_out, &now, &now_filled, &fr_done, so_locked); + if (fr_done) { + tot_frs++; + } + } else { + /* + * it's from any other place, we don't allow retran + * output (only control) + */ + ret = 1; + } + if (ret > 0) { + /* Can't send anymore */ + /*- + * now let's push out control by calling med-level + * output once. This assures that we WILL send HB's + * if queued too. + */ + (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, + from_where, + &now, &now_filled, frag_point, so_locked); +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(8, inp, stcb, NULL); +#endif + (void)sctp_timer_validation(inp, stcb, asoc, ret); + return; + } + if (ret < 0) { + /*- + * The count was off; retran is not happening so do + * the normal retransmission. + */ +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(9, inp, stcb, NULL); +#endif + if (ret == SCTP_RETRAN_EXIT) { + return; + } + break; + } + if (from_where == SCTP_OUTPUT_FROM_T3) { + /* Only one transmission allowed out of a timeout */ +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(10, inp, stcb, NULL); +#endif + /* Push out any control */ + (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, + &now, &now_filled, frag_point, so_locked); + return; + } + if (tot_frs > asoc->max_burst) { + /* Hit FR burst limit */ + return; + } + if ((num_out == 0) && (ret == 0)) { + + /* No more retrans to send */ + break; + } + } +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(12, inp, stcb, NULL); +#endif + /* Check for bad destinations, if they exist move chunks around. */ + burst_limit = asoc->max_burst; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + if ((net->dest_state & SCTP_ADDR_NOT_REACHABLE) == + SCTP_ADDR_NOT_REACHABLE) { + /*- + * if possible move things off of this address we + * still may send below due to the dormant state but + * we try to find an alternate address to send to + * and if we have one we move all queued data on the + * out wheel to this alternate address. + */ + if (net->ref_count > 1) + sctp_move_chunks_from_net(stcb, net); + } else if ((asoc->sctp_cmt_on_off == 1) && + (asoc->sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + /* + * JRS 5/14/07 - If CMT PF is on and the current + * destination is in PF state, move all queued data + * to an alternate destination.
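+ *
+ * That is, while an address is "potentially failed" queued data is
+ * steered to a working alternate and the PF address is probed only
+ * by heartbeats until it answers again.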
+ */ + if (net->ref_count > 1) + sctp_move_chunks_from_net(stcb, net); + } else { + /*- + * if ((asoc->sat_network) || (net->addr_is_local)) + * { burst_limit = asoc->max_burst * + * SCTP_SAT_NETWORK_BURST_INCR; } + */ + if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst)) { + if ((net->flight_size + (burst_limit * net->mtu)) < net->cwnd) { + /* + * JRS - Use the congestion control + * given in the congestion control + * module + */ + asoc->cc_functions.sctp_cwnd_update_after_output(stcb, net, burst_limit); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { + sctp_log_maxburst(stcb, net, 0, burst_limit, SCTP_MAX_BURST_APPLIED); + } + SCTP_STAT_INCR(sctps_maxburstqueued); + } + net->fast_retran_ip = 0; + } else { + if (net->flight_size == 0) { + /* Should be decaying the cwnd here */ + ; + } + } + } + + } + burst_cnt = 0; + do { + error = sctp_med_chunk_output(inp, stcb, asoc, &num_out, + &reason_code, 0, from_where, + &now, &now_filled, frag_point, so_locked); + if (error) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Error %d was returned from med-c-op\n", error); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { + sctp_log_maxburst(stcb, asoc->primary_destination, error, burst_cnt, SCTP_MAX_BURST_ERROR_STOP); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, NULL, error, SCTP_SEND_NOW_COMPLETES); + sctp_log_cwnd(stcb, NULL, 0xdeadbeef, SCTP_SEND_NOW_COMPLETES); + } + break; + } + SCTPDBG(SCTP_DEBUG_OUTPUT3, "m-c-o put out %d\n", num_out); + + tot_out += num_out; + burst_cnt++; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, NULL, num_out, SCTP_SEND_NOW_COMPLETES); + if (num_out == 0) { + sctp_log_cwnd(stcb, NULL, reason_code, SCTP_SEND_NOW_COMPLETES); + } + } + if (nagle_on) { + /*- + * When nagle is on, we look at how much is un_sent, then + * if its smaller than an MTU and we have data in + * flight we stop. + */ + un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); + if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) && + (stcb->asoc.total_flight > 0)) { + break; + } + } + if (TAILQ_EMPTY(&asoc->control_send_queue) && + TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->out_wheel)) { + /* Nothing left to send */ + break; + } + if ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) <= 0) { + /* Nothing left to send */ + break; + } + } while (num_out && (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) || + (burst_cnt < burst_limit))); + + if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) == 0) { + if (burst_cnt >= burst_limit) { + SCTP_STAT_INCR(sctps_maxburstqueued); + asoc->burst_limit_applied = 1; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { + sctp_log_maxburst(stcb, asoc->primary_destination, 0, burst_cnt, SCTP_MAX_BURST_APPLIED); + } + } else { + asoc->burst_limit_applied = 0; + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { + sctp_log_cwnd(stcb, NULL, tot_out, SCTP_SEND_NOW_COMPLETES); + } + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, we have put out %d chunks\n", + tot_out); + + /*- + * Now we need to clean up the control chunk chain if a ECNE is on + * it. It must be marked as UNSENT again so next call will continue + * to send it until such time that we get a CWR, to remove it. 
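+ *
+ * E.g. an ECNE bundled into a packet above is flipped back to
+ * SCTP_DATAGRAM_UNSENT here, so every subsequent call re-sends it
+ * until the peer's CWR finally removes it from the queue.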
+ */ + if (stcb->asoc.ecn_echo_cnt_onq) + sctp_fix_ecn_echo(asoc); + return; +} + + +int +sctp_output(inp, m, addr, control, p, flags) + struct sctp_inpcb *inp; + struct mbuf *m; + struct sockaddr *addr; + struct mbuf *control; + struct thread *p; + int flags; +{ + if (inp == NULL) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + if (inp->sctp_socket == NULL) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + return (sctp_sosend(inp->sctp_socket, + addr, + (struct uio *)NULL, + m, + control, + flags, p + )); +} + +void +send_forward_tsn(struct sctp_tcb *stcb, + struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + struct sctp_forward_tsn_chunk *fwdtsn; + uint32_t advance_peer_ack_point; + + SCTP_TCB_LOCK_ASSERT(stcb); + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { + /* mark it to unsent */ + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + /* Do we correct its output location? */ + if (chk->whoTo != asoc->primary_destination) { + sctp_free_remote_addr(chk->whoTo); + chk->whoTo = asoc->primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + } + goto sctp_fill_in_rest; + } + } + /* Ok if we reach here we must build one */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + return; + } + asoc->fwd_tsn_cnt++; + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN; + chk->rec.chunk_id.can_take_data = 0; + chk->asoc = asoc; + chk->whoTo = NULL; + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk); + return; + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = asoc->primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); + asoc->ctrl_queue_cnt++; +sctp_fill_in_rest: + /*- + * Here we go through and fill out the part that deals with + * stream/seq of the ones we skip. + */ + SCTP_BUF_LEN(chk->data) = 0; + { + struct sctp_tmit_chunk *at, *tp1, *last; + struct sctp_strseq *strseq; + unsigned int cnt_of_space, i, ovh; + unsigned int space_needed; + unsigned int cnt_of_skipped = 0; + + TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { + if (at->sent != SCTP_FORWARD_TSN_SKIP) { + /* no more to look at */ + break; + } + if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { + /* We don't report these */ + continue; + } + cnt_of_skipped++; + } + space_needed = (sizeof(struct sctp_forward_tsn_chunk) + + (cnt_of_skipped * sizeof(struct sctp_strseq))); + + cnt_of_space = M_TRAILINGSPACE(chk->data); + + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MIN_OVERHEAD; + } else { + ovh = SCTP_MIN_V4_OVERHEAD; + } + if (cnt_of_space > (asoc->smallest_mtu - ovh)) { + /* trim to a mtu size */ + cnt_of_space = asoc->smallest_mtu - ovh; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xff, 0, cnt_of_skipped, + asoc->advanced_peer_ack_point); + + } + advance_peer_ack_point = asoc->advanced_peer_ack_point; + if (cnt_of_space < space_needed) { + /*- + * ok we must trim down the chunk by lowering the + * advance peer ack point. 
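+ *
+ * Worked example: with 1432 bytes of space, the 8 byte FORWARD-TSN
+ * header leaves room for (1432 - 8) / sizeof(struct sctp_strseq) =
+ * 356 stream/sequence pairs, and the new cumulative TSN is then
+ * taken from the 356th skipped chunk found below.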
+ */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xff, 0xff, cnt_of_space, + space_needed); + } + cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); + cnt_of_skipped /= sizeof(struct sctp_strseq); + /*- + * Go through and find the TSN that will be the one + * we report. + */ + at = TAILQ_FIRST(&asoc->sent_queue); + for (i = 0; i < cnt_of_skipped; i++) { + tp1 = TAILQ_NEXT(at, sctp_next); + if (tp1 == NULL) { + break; + } + at = tp1; + } + if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { + sctp_misc_ints(SCTP_FWD_TSN_CHECK, + 0xff, cnt_of_skipped, at->rec.data.TSN_seq, + asoc->advanced_peer_ack_point); + } + last = at; + /*- + * last now points to last one I can report, update + * peer ack point + */ + if (last) + advance_peer_ack_point = last->rec.data.TSN_seq; + space_needed = sizeof(struct sctp_forward_tsn_chunk) + + cnt_of_skipped * sizeof(struct sctp_strseq); + } + chk->send_size = space_needed; + /* Setup the chunk */ + fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *); + fwdtsn->ch.chunk_length = htons(chk->send_size); + fwdtsn->ch.chunk_flags = 0; + fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN; + fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point); + SCTP_BUF_LEN(chk->data) = chk->send_size; + fwdtsn++; + /*- + * Move pointer to after the fwdtsn and transfer to the + * strseq pointer. + */ + strseq = (struct sctp_strseq *)fwdtsn; + /*- + * Now populate the strseq list. This is done blindly + * without pulling out duplicate stream info. This is + * inefficent but won't harm the process since the peer will + * look at these in sequence and will thus release anything. + * It could mean we exceed the PMTU and chop off some that + * we could have included.. but this is unlikely (aka 1432/4 + * would mean 300+ stream seq's would have to be reported in + * one FWD-TSN. With a bit of work we can later FIX this to + * optimize and pull out duplcates.. but it does add more + * overhead. So for now... not! + */ + at = TAILQ_FIRST(&asoc->sent_queue); + for (i = 0; i < cnt_of_skipped; i++) { + tp1 = TAILQ_NEXT(at, sctp_next); + if (tp1 == NULL) + break; + if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { + /* We don't report these */ + i--; + at = tp1; + continue; + } + if (at->rec.data.TSN_seq == advance_peer_ack_point) { + at->rec.data.fwd_tsn_cnt = 0; + } + strseq->stream = ntohs(at->rec.data.stream_number); + strseq->sequence = ntohs(at->rec.data.stream_seq); + strseq++; + at = tp1; + } + } + return; + +} + +void +sctp_send_sack(struct sctp_tcb *stcb) +{ + /*- + * Queue up a SACK or NR-SACK in the control queue. + * We must first check to see if a SACK or NR-SACK is + * somehow on the control queue. + * If so, we will take and and remove the old one. 
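+ *
+ * Replacing instead of patching the old chunk is the simple thing
+ * to do: the cum-tsn, a_rwnd and gap blocks are all recomputed
+ * below from the current mapping arrays, so the stale contents are
+ * of no use.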
+ */ + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk, *a_chk; + struct sctp_sack_chunk *sack; + struct sctp_nr_sack_chunk *nr_sack; + struct sctp_gap_ack_block *gap_descriptor; + struct sack_track *selector; + int mergeable = 0; + int offset; + caddr_t limit; + uint32_t *dup; + int limit_reached = 0; + unsigned int i, siz, j; + unsigned int num_gap_blocks = 0, num_nr_gap_blocks = 0, space; + int num_dups = 0; + int space_req; + uint32_t highest_tsn; + uint8_t flags; + uint8_t type; + uint8_t tsn_map; + + if ((stcb->asoc.sctp_nr_sack_on_off == 1) && + (stcb->asoc.peer_supports_nr_sack == 1)) { + type = SCTP_NR_SELECTIVE_ACK; + } else { + type = SCTP_SELECTIVE_ACK; + } + a_chk = NULL; + asoc = &stcb->asoc; + SCTP_TCB_LOCK_ASSERT(stcb); + if (asoc->last_data_chunk_from == NULL) { + /* Hmm we never received anything */ + return; + } + sctp_slide_mapping_arrays(stcb); + sctp_set_rwnd(stcb, asoc); + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id == type) { + /* Hmm, found a sack already on queue, remove it */ + TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); + asoc->ctrl_queue_cnt--; + a_chk = chk; + if (a_chk->data) { + sctp_m_freem(a_chk->data); + a_chk->data = NULL; + } + sctp_free_remote_addr(a_chk->whoTo); + a_chk->whoTo = NULL; + break; + } + } + if (a_chk == NULL) { + sctp_alloc_a_chunk(stcb, a_chk); + if (a_chk == NULL) { + /* No memory so we drop the idea, and set a timer */ + if (stcb->asoc.delayed_ack) { + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5); + sctp_timer_start(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL); + } else { + stcb->asoc.send_sack = 1; + } + return; + } + a_chk->copy_by_ref = 0; + a_chk->rec.chunk_id.id = type; + a_chk->rec.chunk_id.can_take_data = 1; + } + /* Clear our pkt counts */ + asoc->data_pkts_seen = 0; + + a_chk->asoc = asoc; + a_chk->snd_count = 0; + a_chk->send_size = 0; /* fill in later */ + a_chk->sent = SCTP_DATAGRAM_UNSENT; + a_chk->whoTo = NULL; + + if ((asoc->numduptsns) || + (asoc->last_data_chunk_from->dest_state & SCTP_ADDR_NOT_REACHABLE)) { + /*- + * Ok, we have some duplicates or the destination for the + * sack is unreachable, lets see if we can select an + * alternate than asoc->last_data_chunk_from + */ + if ((!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_NOT_REACHABLE)) && + (asoc->used_alt_onsack > asoc->numnets)) { + /* We used an alt last time, don't this time */ + a_chk->whoTo = NULL; + } else { + asoc->used_alt_onsack++; + a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0); + } + if (a_chk->whoTo == NULL) { + /* Nope, no alternate */ + a_chk->whoTo = asoc->last_data_chunk_from; + asoc->used_alt_onsack = 0; + } + } else { + /* + * No duplicates so we use the last place we received data + * from. 
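+ *
+ * Sending the SACK back where the data came from keeps it on a
+ * path the peer is demonstrably using; an alternate is only tried
+ * when duplicates or an unreachable destination suggest that this
+ * path is suspect.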
+ */ + asoc->used_alt_onsack = 0; + a_chk->whoTo = asoc->last_data_chunk_from; + } + if (a_chk->whoTo) { + atomic_add_int(&a_chk->whoTo->ref_count, 1); + } + if (compare_with_wrap(asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map, MAX_TSN)) { + highest_tsn = asoc->highest_tsn_inside_map; + } else { + highest_tsn = asoc->highest_tsn_inside_nr_map; + } + if (highest_tsn == asoc->cumulative_tsn) { + /* no gaps */ + if (type == SCTP_SELECTIVE_ACK) { + space_req = sizeof(struct sctp_sack_chunk); + } else { + space_req = sizeof(struct sctp_nr_sack_chunk); + } + } else { + /* gaps get a cluster */ + space_req = MCLBYTES; + } + /* Ok now lets formulate a MBUF with our sack */ + a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_DONTWAIT, 1, MT_DATA); + if ((a_chk->data == NULL) || + (a_chk->whoTo == NULL)) { + /* rats, no mbuf memory */ + if (a_chk->data) { + /* was a problem with the destination */ + sctp_m_freem(a_chk->data); + a_chk->data = NULL; + } + sctp_free_a_chunk(stcb, a_chk); + /* sa_ignore NO_NULL_CHK */ + if (stcb->asoc.delayed_ack) { + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6); + sctp_timer_start(SCTP_TIMER_TYPE_RECV, + stcb->sctp_ep, stcb, NULL); + } else { + stcb->asoc.send_sack = 1; + } + return; + } + /* ok, lets go through and fill it in */ + SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD); + space = M_TRAILINGSPACE(a_chk->data); + if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) { + space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD); + } + limit = mtod(a_chk->data, caddr_t); + limit += space; + + /* 0x01 is used by nonce for ecn */ + if ((SCTP_BASE_SYSCTL(sctp_ecn_enable)) && + (SCTP_BASE_SYSCTL(sctp_ecn_nonce)) && + (asoc->peer_supports_ecn_nonce)) + flags = (asoc->receiver_nonce_sum & SCTP_SACK_NONCE_SUM); + else + flags = 0; + + if ((asoc->sctp_cmt_on_off == 1) && + SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { + /*- + * CMT DAC algorithm: If 2 (i.e., 0x10) packets have been + * received, then set high bit to 1, else 0. Reset + * pkts_rcvd. 
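+ *
+ * Worked out: with cmt_dac_pkts_rcvd == 2 (binary 10) the shift by
+ * 6 below yields 0x80, i.e. the high bit of the chunk flags; with
+ * one packet received the result is 0x40 and the high bit stays 0.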
+ */ + flags |= (asoc->cmt_dac_pkts_rcvd << 6); + asoc->cmt_dac_pkts_rcvd = 0; + } +#ifdef SCTP_ASOCLOG_OF_TSNS + stcb->asoc.cumack_logsnt[stcb->asoc.cumack_log_atsnt] = asoc->cumulative_tsn; + stcb->asoc.cumack_log_atsnt++; + if (stcb->asoc.cumack_log_atsnt >= SCTP_TSN_LOG_SIZE) { + stcb->asoc.cumack_log_atsnt = 0; + } +#endif + /* reset the readers interpretation */ + stcb->freed_by_sorcv_sincelast = 0; + + if (type == SCTP_SELECTIVE_ACK) { + sack = mtod(a_chk->data, struct sctp_sack_chunk *); + nr_sack = NULL; + gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)sack + sizeof(struct sctp_sack_chunk)); + if (highest_tsn > asoc->mapping_array_base_tsn) { + siz = (((highest_tsn - asoc->mapping_array_base_tsn) + 1) + 7) / 8; + } else { + siz = (((MAX_TSN - highest_tsn) + 1) + highest_tsn + 7) / 8; + } + } else { + sack = NULL; + nr_sack = mtod(a_chk->data, struct sctp_nr_sack_chunk *); + gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)nr_sack + sizeof(struct sctp_nr_sack_chunk)); + if (asoc->highest_tsn_inside_map > asoc->mapping_array_base_tsn) { + siz = (((asoc->highest_tsn_inside_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; + } else { + siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_map + 7) / 8; + } + } + + if (compare_with_wrap(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, MAX_TSN)) { + offset = 1; + } else { + offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; + } + if (((type == SCTP_SELECTIVE_ACK) && + compare_with_wrap(highest_tsn, asoc->cumulative_tsn, MAX_TSN)) || + ((type == SCTP_NR_SELECTIVE_ACK) && + compare_with_wrap(asoc->highest_tsn_inside_map, asoc->cumulative_tsn, MAX_TSN))) { + /* we have a gap .. maybe */ + for (i = 0; i < siz; i++) { + tsn_map = asoc->mapping_array[i]; + if (type == SCTP_SELECTIVE_ACK) { + tsn_map |= asoc->nr_mapping_array[i]; + } + if (i == 0) { + /* + * Clear all bits corresponding to TSNs + * smaller or equal to the cumulative TSN. + */ + tsn_map &= (~0 << (1 - offset)); + } + selector = &sack_array[tsn_map]; + if (mergeable && selector->right_edge) { + /* + * Backup, left and right edges were ok to + * merge. 
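+ *
+ * E.g. for map bytes 0x80 then 0x01, bit 7 of the first byte and
+ * bit 0 of the second form one contiguous run, so the descriptor
+ * just written is backed up and extended rather than starting a
+ * new gap block.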
+ */ + num_gap_blocks--; + gap_descriptor--; + } + if (selector->num_entries == 0) + mergeable = 0; + else { + for (j = 0; j < selector->num_entries; j++) { + if (mergeable && selector->right_edge) { + /* + * do a merge by NOT setting + * the left side + */ + mergeable = 0; + } else { + /* + * no merge, set the left + * side + */ + mergeable = 0; + gap_descriptor->start = htons((selector->gaps[j].start + offset)); + } + gap_descriptor->end = htons((selector->gaps[j].end + offset)); + num_gap_blocks++; + gap_descriptor++; + if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { + /* no more room */ + limit_reached = 1; + break; + } + } + if (selector->left_edge) { + mergeable = 1; + } + } + if (limit_reached) { + /* Reached the limit stop */ + break; + } + offset += 8; + } + } + if ((type == SCTP_NR_SELECTIVE_ACK) && + (limit_reached == 0)) { + + mergeable = 0; + + if (asoc->highest_tsn_inside_nr_map > asoc->mapping_array_base_tsn) { + siz = (((asoc->highest_tsn_inside_nr_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; + } else { + siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_nr_map + 7) / 8; + } + + if (compare_with_wrap(asoc->mapping_array_base_tsn, asoc->cumulative_tsn, MAX_TSN)) { + offset = 1; + } else { + offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; + } + if (compare_with_wrap(asoc->highest_tsn_inside_nr_map, asoc->cumulative_tsn, MAX_TSN)) { + /* we have a gap .. maybe */ + for (i = 0; i < siz; i++) { + tsn_map = asoc->nr_mapping_array[i]; + if (i == 0) { + /* + * Clear all bits corresponding to + * TSNs smaller or equal to the + * cumulative TSN. + */ + tsn_map &= (~0 << (1 - offset)); + } + selector = &sack_array[tsn_map]; + if (mergeable && selector->right_edge) { + /* + * Backup, left and right edges were + * ok to merge. + */ + num_nr_gap_blocks--; + gap_descriptor--; + } + if (selector->num_entries == 0) + mergeable = 0; + else { + for (j = 0; j < selector->num_entries; j++) { + if (mergeable && selector->right_edge) { + /* + * do a merge by NOT + * setting the left + * side + */ + mergeable = 0; + } else { + /* + * no merge, set the + * left side + */ + mergeable = 0; + gap_descriptor->start = htons((selector->gaps[j].start + offset)); + } + gap_descriptor->end = htons((selector->gaps[j].end + offset)); + num_nr_gap_blocks++; + gap_descriptor++; + if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { + /* no more room */ + limit_reached = 1; + break; + } + } + if (selector->left_edge) { + mergeable = 1; + } + } + if (limit_reached) { + /* Reached the limit stop */ + break; + } + offset += 8; + } + } + } + /* now we must add any dups we are going to report. */ + if ((limit_reached == 0) && (asoc->numduptsns)) { + dup = (uint32_t *) gap_descriptor; + for (i = 0; i < asoc->numduptsns; i++) { + *dup = htonl(asoc->dup_tsns[i]); + dup++; + num_dups++; + if (((caddr_t)dup + sizeof(uint32_t)) > limit) { + /* no more room */ + break; + } + } + asoc->numduptsns = 0; + } + /* + * now that the chunk is prepared queue it to the control chunk + * queue. 
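+ *
+ * The final size is computed below, e.g. a plain SACK with two gap
+ * blocks and one duplicate TSN occupies 16 + 2 * 4 + 4 = 28 bytes
+ * (fixed header plus 4 bytes per gap block and per dup).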
+ */ + if (type == SCTP_SELECTIVE_ACK) { + a_chk->send_size = sizeof(struct sctp_sack_chunk) + + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + + num_dups * sizeof(int32_t); + SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; + sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); + sack->sack.a_rwnd = htonl(asoc->my_rwnd); + sack->sack.num_gap_ack_blks = htons(num_gap_blocks); + sack->sack.num_dup_tsns = htons(num_dups); + sack->ch.chunk_type = type; + sack->ch.chunk_flags = flags; + sack->ch.chunk_length = htons(a_chk->send_size); + } else { + a_chk->send_size = sizeof(struct sctp_nr_sack_chunk) + + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + + num_dups * sizeof(int32_t); + SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; + nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); + nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd); + nr_sack->nr_sack.num_gap_ack_blks = htons(num_gap_blocks); + nr_sack->nr_sack.num_nr_gap_ack_blks = htons(num_nr_gap_blocks); + nr_sack->nr_sack.num_dup_tsns = htons(num_dups); + nr_sack->nr_sack.reserved = 0; + nr_sack->ch.chunk_type = type; + nr_sack->ch.chunk_flags = flags; + nr_sack->ch.chunk_length = htons(a_chk->send_size); + } + TAILQ_INSERT_TAIL(&asoc->control_send_queue, a_chk, sctp_next); + asoc->my_last_reported_rwnd = asoc->my_rwnd; + asoc->ctrl_queue_cnt++; + asoc->send_sack = 0; + SCTP_STAT_INCR(sctps_sendsacks); + return; +} + +void +sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_abort; + struct mbuf *m_out = NULL, *m_end = NULL; + struct sctp_abort_chunk *abort = NULL; + int sz; + uint32_t auth_offset = 0; + struct sctp_auth_chunk *auth = NULL; + + /*- + * Add an AUTH chunk, if chunk requires it and save the offset into + * the chain for AUTH + */ + if (sctp_auth_is_required_chunk(SCTP_ABORT_ASSOCIATION, + stcb->asoc.peer_auth_chunks)) { + m_out = sctp_add_auth_chunk(m_out, &m_end, &auth, &auth_offset, + stcb, SCTP_ABORT_ASSOCIATION); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + } + SCTP_TCB_LOCK_ASSERT(stcb); + m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_DONTWAIT, 1, MT_HEADER); + if (m_abort == NULL) { + /* no mbuf's */ + if (m_out) + sctp_m_freem(m_out); + return; + } + /* link in any error */ + SCTP_BUF_NEXT(m_abort) = operr; + sz = 0; + if (operr) { + struct mbuf *n; + + n = operr; + while (n) { + sz += SCTP_BUF_LEN(n); + n = SCTP_BUF_NEXT(n); + } + } + SCTP_BUF_LEN(m_abort) = sizeof(*abort); + if (m_out == NULL) { + /* NO Auth chunk prepended, so reserve space in front */ + SCTP_BUF_RESV_UF(m_abort, SCTP_MIN_OVERHEAD); + m_out = m_abort; + } else { + /* Put AUTH chunk at the front of the chain */ + SCTP_BUF_NEXT(m_end) = m_abort; + } + + /* fill in the ABORT chunk */ + abort = mtod(m_abort, struct sctp_abort_chunk *); + abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION; + abort->ch.chunk_flags = 0; + abort->ch.chunk_length = htons(sizeof(*abort) + sz); + + (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, + stcb->asoc.primary_destination, + (struct sockaddr *)&stcb->asoc.primary_destination->ro._l_addr, + m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, NULL, 0, + stcb->sctp_ep->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), + stcb->asoc.primary_destination->port, so_locked, NULL); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); +} + +void +sctp_send_shutdown_complete(struct sctp_tcb *stcb, + 
struct sctp_nets *net, + int reflect_vtag) +{ + /* formulate and SEND a SHUTDOWN-COMPLETE */ + struct mbuf *m_shutdown_comp; + struct sctp_shutdown_complete_chunk *shutdown_complete; + uint32_t vtag; + uint8_t flags; + + m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_DONTWAIT, 1, MT_HEADER); + if (m_shutdown_comp == NULL) { + /* no mbuf's */ + return; + } + if (reflect_vtag) { + flags = SCTP_HAD_NO_TCB; + vtag = stcb->asoc.my_vtag; + } else { + flags = 0; + vtag = stcb->asoc.peer_vtag; + } + shutdown_complete = mtod(m_shutdown_comp, struct sctp_shutdown_complete_chunk *); + shutdown_complete->ch.chunk_type = SCTP_SHUTDOWN_COMPLETE; + shutdown_complete->ch.chunk_flags = flags; + shutdown_complete->ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk)); + SCTP_BUF_LEN(m_shutdown_comp) = sizeof(struct sctp_shutdown_complete_chunk); + (void)sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, + (struct sockaddr *)&net->ro._l_addr, + m_shutdown_comp, 0, NULL, 0, 1, 0, NULL, 0, + stcb->sctp_ep->sctp_lport, stcb->rport, + htonl(vtag), + net->port, SCTP_SO_NOT_LOCKED, NULL); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + return; +} + +void +sctp_send_shutdown_complete2(struct mbuf *m, int iphlen, struct sctphdr *sh, + uint32_t vrf_id, uint16_t port) +{ + /* formulate and SEND a SHUTDOWN-COMPLETE */ + struct mbuf *o_pak; + struct mbuf *mout; + struct ip *iph, *iph_out; + struct udphdr *udp = NULL; + +#ifdef INET6 + struct ip6_hdr *ip6, *ip6_out; + +#endif + int offset_out, len, mlen; + struct sctp_shutdown_complete_msg *comp_cp; + + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + len = (sizeof(struct ip) + sizeof(struct sctp_shutdown_complete_msg)); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + len = (sizeof(struct ip6_hdr) + sizeof(struct sctp_shutdown_complete_msg)); + break; +#endif + default: + return; + } + if (port) { + len += sizeof(struct udphdr); + } + mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA); + if (mout == NULL) { + return; + } + SCTP_BUF_RESV_UF(mout, max_linkhdr); + SCTP_BUF_LEN(mout) = len; + SCTP_BUF_NEXT(mout) = NULL; + iph_out = NULL; +#ifdef INET6 + ip6_out = NULL; +#endif + offset_out = 0; + + switch (iph->ip_v) { + case IPVERSION: + iph_out = mtod(mout, struct ip *); + + /* Fill in the IP header for the ABORT */ + iph_out->ip_v = IPVERSION; + iph_out->ip_hl = (sizeof(struct ip) / 4); + iph_out->ip_tos = (u_char)0; + iph_out->ip_id = 0; + iph_out->ip_off = 0; + iph_out->ip_ttl = MAXTTL; + if (port) { + iph_out->ip_p = IPPROTO_UDP; + } else { + iph_out->ip_p = IPPROTO_SCTP; + } + iph_out->ip_src.s_addr = iph->ip_dst.s_addr; + iph_out->ip_dst.s_addr = iph->ip_src.s_addr; + + /* let IP layer calculate this */ + iph_out->ip_sum = 0; + offset_out += sizeof(*iph_out); + comp_cp = (struct sctp_shutdown_complete_msg *)( + (caddr_t)iph_out + offset_out); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + ip6 = (struct ip6_hdr *)iph; + ip6_out = mtod(mout, struct ip6_hdr *); + + /* Fill in the IPv6 header for the ABORT */ + ip6_out->ip6_flow = ip6->ip6_flow; + ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim); + if (port) { + ip6_out->ip6_nxt = IPPROTO_UDP; + } else { + ip6_out->ip6_nxt = IPPROTO_SCTP; + } + ip6_out->ip6_src = ip6->ip6_dst; + ip6_out->ip6_dst = ip6->ip6_src; + /* + * ?? 
The old code had both the iph len + payload, I think + * this is wrong and would never have worked + */ + ip6_out->ip6_plen = sizeof(struct sctp_shutdown_complete_msg); + offset_out += sizeof(*ip6_out); + comp_cp = (struct sctp_shutdown_complete_msg *)( + (caddr_t)ip6_out + offset_out); + break; +#endif /* INET6 */ + default: + /* Currently not supported. */ + sctp_m_freem(mout); + return; + } + if (port) { + udp = (struct udphdr *)comp_cp; + udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + udp->uh_dport = port; + udp->uh_ulen = htons(sizeof(struct sctp_shutdown_complete_msg) + sizeof(struct udphdr)); + if (iph_out) + udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); + offset_out += sizeof(struct udphdr); + comp_cp = (struct sctp_shutdown_complete_msg *)((caddr_t)comp_cp + sizeof(struct udphdr)); + } + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { + /* no mbuf's */ + sctp_m_freem(mout); + return; + } + /* Now copy in and fill in the ABORT tags etc. */ + comp_cp->sh.src_port = sh->dest_port; + comp_cp->sh.dest_port = sh->src_port; + comp_cp->sh.checksum = 0; + comp_cp->sh.v_tag = sh->v_tag; + comp_cp->shut_cmp.ch.chunk_flags = SCTP_HAD_NO_TCB; + comp_cp->shut_cmp.ch.chunk_type = SCTP_SHUTDOWN_COMPLETE; + comp_cp->shut_cmp.ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk)); + + if (iph_out != NULL) { + sctp_route_t ro; + int ret; + + mlen = SCTP_BUF_LEN(mout); + bzero(&ro, sizeof ro); + /* set IPv4 length */ + iph_out->ip_len = mlen; +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, mlen); +#endif + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + comp_cp->sh.checksum = sctp_calculate_cksum(mout, offset_out); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + SCTP_ENABLE_UDP_CSUM(mout); + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_ATTACH_CHAIN(o_pak, mout, mlen); + /* out it goes */ + SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#ifdef INET6 + if (ip6_out != NULL) { + struct route_in6 ro; + int ret; + struct ifnet *ifp = NULL; + + bzero(&ro, sizeof(ro)); + mlen = SCTP_BUF_LEN(mout); +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, mlen); +#endif + SCTP_ATTACH_CHAIN(o_pak, mout, mlen); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + comp_cp->sh.checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), mlen - sizeof(struct ip6_hdr))) == 0) { + udp->uh_sum = 0xffff; + } + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#endif + SCTP_STAT_INCR(sctps_sendpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); + return; + +} + +static struct sctp_nets * 
+sctp_select_hb_destination(struct sctp_tcb *stcb, struct timeval *now) +{ + struct sctp_nets *net, *hnet; + int ms_goneby, highest_ms, state_overide = 0; + + (void)SCTP_GETTIME_TIMEVAL(now); + highest_ms = 0; + hnet = NULL; + SCTP_TCB_LOCK_ASSERT(stcb); + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if ( + ((net->dest_state & SCTP_ADDR_NOHB) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) || + (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE) + ) { + /* + * Skip this guy from consideration if HB is off AND + * its confirmed + */ + continue; + } + if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr) == 0) { + /* skip this dest net from consideration */ + continue; + } + if (net->last_sent_time.tv_sec) { + /* Sent to so we subtract */ + ms_goneby = (now->tv_sec - net->last_sent_time.tv_sec) * 1000; + } else + /* Never been sent to */ + ms_goneby = 0x7fffffff; + /*- + * When the address state is unconfirmed but still + * considered reachable, we HB at a higher rate. Once it + * goes confirmed OR reaches the "unreachable" state, thenw + * we cut it back to HB at a more normal pace. + */ + if ((net->dest_state & (SCTP_ADDR_UNCONFIRMED | SCTP_ADDR_NOT_REACHABLE)) == SCTP_ADDR_UNCONFIRMED) { + state_overide = 1; + } else { + state_overide = 0; + } + + if ((((unsigned int)ms_goneby >= net->RTO) || (state_overide)) && + (ms_goneby > highest_ms)) { + highest_ms = ms_goneby; + hnet = net; + } + } + if (hnet && + ((hnet->dest_state & (SCTP_ADDR_UNCONFIRMED | SCTP_ADDR_NOT_REACHABLE)) == SCTP_ADDR_UNCONFIRMED)) { + state_overide = 1; + } else { + state_overide = 0; + } + + if (hnet && highest_ms && (((unsigned int)highest_ms >= hnet->RTO) || state_overide)) { + /*- + * Found the one with longest delay bounds OR it is + * unconfirmed and still not marked unreachable. + */ + SCTPDBG(SCTP_DEBUG_OUTPUT4, "net:%p is the hb winner -", hnet); +#ifdef SCTP_DEBUG + if (hnet) { + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT4, + (struct sockaddr *)&hnet->ro._l_addr); + } else { + SCTPDBG(SCTP_DEBUG_OUTPUT4, " none\n"); + } +#endif + /* update the timer now */ + hnet->last_sent_time = *now; + return (hnet); + } + /* Nothing to HB */ + return (NULL); +} + +int +sctp_send_hb(struct sctp_tcb *stcb, int user_req, struct sctp_nets *u_net) +{ + struct sctp_tmit_chunk *chk; + struct sctp_nets *net; + struct sctp_heartbeat_chunk *hb; + struct timeval now; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + + SCTP_TCB_LOCK_ASSERT(stcb); + if (user_req == 0) { + net = sctp_select_hb_destination(stcb, &now); + if (net == NULL) { + /*- + * All our busy none to send to, just start the + * timer again. 
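+ *
+ * (Every address is unreachable, has HB disabled, or was probed
+ * too recently. Note an address never sent to reports ms_goneby =
+ * 0x7fffffff above, so unconfirmed destinations get heartbeated
+ * first.)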
+ */ + if (stcb->asoc.state == 0) { + return (0); + } + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, + stcb, + net); + return (0); + } + } else { + net = u_net; + if (net == NULL) { + return (0); + } + (void)SCTP_GETTIME_TIMEVAL(&now); + } + sin = (struct sockaddr_in *)&net->ro._l_addr; + if (sin->sin_family != AF_INET) { + if (sin->sin_family != AF_INET6) { + /* huh */ + return (0); + } + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n"); + return (0); + } + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST; + chk->rec.chunk_id.can_take_data = 1; + chk->asoc = &stcb->asoc; + chk->send_size = sizeof(struct sctp_heartbeat_chunk); + + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk); + return (0); + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + SCTP_BUF_LEN(chk->data) = chk->send_size; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = net; + atomic_add_int(&chk->whoTo->ref_count, 1); + /* Now we have a mbuf that we can fill in with the details */ + hb = mtod(chk->data, struct sctp_heartbeat_chunk *); + memset(hb, 0, sizeof(struct sctp_heartbeat_chunk)); + /* fill out chunk header */ + hb->ch.chunk_type = SCTP_HEARTBEAT_REQUEST; + hb->ch.chunk_flags = 0; + hb->ch.chunk_length = htons(chk->send_size); + /* Fill out hb parameter */ + hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO); + hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param)); + hb->heartbeat.hb_info.time_value_1 = now.tv_sec; + hb->heartbeat.hb_info.time_value_2 = now.tv_usec; + /* Did our user request this one, put it in */ + hb->heartbeat.hb_info.user_req = user_req; + hb->heartbeat.hb_info.addr_family = sin->sin_family; + hb->heartbeat.hb_info.addr_len = sin->sin_len; + if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { + /* + * we only take from the entropy pool if the address is not + * confirmed. + */ + net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); + net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); + } else { + net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = 0; + net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = 0; + } + if (sin->sin_family == AF_INET) { + memcpy(hb->heartbeat.hb_info.address, &sin->sin_addr, sizeof(sin->sin_addr)); + } else if (sin->sin_family == AF_INET6) { + /* We leave the scope the way it is in our lookup table. */ + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + memcpy(hb->heartbeat.hb_info.address, &sin6->sin6_addr, sizeof(sin6->sin6_addr)); + } else { + /* huh compiler bug */ + return (0); + } + + /* + * JRS 5/14/07 - In CMT PF, the T3 timer is used to track + * PF-heartbeats. Because of this, threshold management is done by + * the t3 timer handler, and does not need to be done upon the send + * of a PF-heartbeat. If CMT PF is on and the destination to which a + * heartbeat is being sent is in PF state, do NOT do threshold + * management. 
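+ *
+ * Otherwise sctp_threshold_management() below charges this HB
+ * against max_send_times and can, by itself, bring the whole
+ * association down.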
+ */ + if ((stcb->asoc.sctp_cmt_pf == 0) || + ((net->dest_state & SCTP_ADDR_PF) != SCTP_ADDR_PF)) { + /* ok we have a destination that needs a beat */ + /* lets do the theshold management Qiaobing style */ + if (sctp_threshold_management(stcb->sctp_ep, stcb, net, + stcb->asoc.max_send_times)) { + /*- + * we have lost the association, in a way this is + * quite bad since we really are one less time since + * we really did not send yet. This is the down side + * to the Q's style as defined in the RFC and not my + * alternate style defined in the RFC. + */ + if (chk->data != NULL) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + /* + * Here we do NOT use the macro since the + * association is now gone. + */ + if (chk->whoTo) { + sctp_free_remote_addr(chk->whoTo); + chk->whoTo = NULL; + } + sctp_free_a_chunk((struct sctp_tcb *)NULL, chk); + return (-1); + } + } + net->hb_responded = 0; + TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); + stcb->asoc.ctrl_queue_cnt++; + SCTP_STAT_INCR(sctps_sendheartbeat); + /*- + * Call directly med level routine to put out the chunk. It will + * always tumble out control chunks aka HB but it may even tumble + * out data too. + */ + return (1); +} + +void +sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, + uint32_t high_tsn) +{ + struct sctp_association *asoc; + struct sctp_ecne_chunk *ecne; + struct sctp_tmit_chunk *chk; + + asoc = &stcb->asoc; + SCTP_TCB_LOCK_ASSERT(stcb); + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { + /* found a previous ECN_ECHO update it if needed */ + ecne = mtod(chk->data, struct sctp_ecne_chunk *); + ecne->tsn = htonl(high_tsn); + return; + } + } + /* nope could not find one to update so we must build one */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + return; + } + chk->copy_by_ref = 0; + SCTP_STAT_INCR(sctps_sendecne); + chk->rec.chunk_id.id = SCTP_ECN_ECHO; + chk->rec.chunk_id.can_take_data = 0; + chk->asoc = &stcb->asoc; + chk->send_size = sizeof(struct sctp_ecne_chunk); + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk); + return; + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + SCTP_BUF_LEN(chk->data) = chk->send_size; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = net; + atomic_add_int(&chk->whoTo->ref_count, 1); + stcb->asoc.ecn_echo_cnt_onq++; + ecne = mtod(chk->data, struct sctp_ecne_chunk *); + ecne->ch.chunk_type = SCTP_ECN_ECHO; + ecne->ch.chunk_flags = 0; + ecne->ch.chunk_length = htons(sizeof(struct sctp_ecne_chunk)); + ecne->tsn = htonl(high_tsn); + TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); + asoc->ctrl_queue_cnt++; +} + +void +sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, + struct mbuf *m, int iphlen, int bad_crc) +{ + struct sctp_association *asoc; + struct sctp_pktdrop_chunk *drp; + struct sctp_tmit_chunk *chk; + uint8_t *datap; + int len; + int was_trunc = 0; + struct ip *iph; + +#ifdef INET6 + struct ip6_hdr *ip6h; + +#endif + int fullsz = 0, extra = 0; + long spc; + int offset; + struct sctp_chunkhdr *ch, chunk_buf; + unsigned int chk_length; + + if (!stcb) { + return; + } + asoc = &stcb->asoc; + SCTP_TCB_LOCK_ASSERT(stcb); + if (asoc->peer_supports_pktdrop == 0) { + /*- + * peer must declare support before I send one. 
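+ *
+ * (peer_supports_pktdrop is set when the peer lists PKT-DROP in
+ * the supported-extensions parameter of its INIT/INIT-ACK.)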
+ */ + return; + } + if (stcb->sctp_socket == NULL) { + return; + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + return; + } + chk->copy_by_ref = 0; + iph = mtod(m, struct ip *); + if (iph == NULL) { + sctp_free_a_chunk(stcb, chk); + return; + } + switch (iph->ip_v) { + case IPVERSION: + /* IPv4 */ + len = chk->send_size = iph->ip_len; + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + /* IPv6 */ + ip6h = mtod(m, struct ip6_hdr *); + len = chk->send_size = htons(ip6h->ip6_plen); + break; +#endif + default: + return; + } + /* Validate that we do not have an ABORT in here. */ + offset = iphlen + sizeof(struct sctphdr); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, + sizeof(*ch), (uint8_t *) & chunk_buf); + while (ch != NULL) { + chk_length = ntohs(ch->chunk_length); + if (chk_length < sizeof(*ch)) { + /* break to abort land */ + break; + } + switch (ch->chunk_type) { + case SCTP_PACKET_DROPPED: + case SCTP_ABORT_ASSOCIATION: + case SCTP_INITIATION_ACK: + /** + * We don't respond with an PKT-DROP to an ABORT + * or PKT-DROP. We also do not respond to an + * INIT-ACK, because we can't know if the initiation + * tag is correct or not. + */ + sctp_free_a_chunk(stcb, chk); + return; + default: + break; + } + offset += SCTP_SIZE32(chk_length); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, + sizeof(*ch), (uint8_t *) & chunk_buf); + } + + if ((len + SCTP_MAX_OVERHEAD + sizeof(struct sctp_pktdrop_chunk)) > + min(stcb->asoc.smallest_mtu, MCLBYTES)) { + /* + * only send 1 mtu worth, trim off the excess on the end. + */ + fullsz = len - extra; + len = min(stcb->asoc.smallest_mtu, MCLBYTES) - SCTP_MAX_OVERHEAD; + was_trunc = 1; + } + chk->asoc = &stcb->asoc; + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (chk->data == NULL) { +jump_out: + sctp_free_a_chunk(stcb, chk); + return; + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + drp = mtod(chk->data, struct sctp_pktdrop_chunk *); + if (drp == NULL) { + sctp_m_freem(chk->data); + chk->data = NULL; + goto jump_out; + } + chk->book_size = SCTP_SIZE32((chk->send_size + sizeof(struct sctp_pktdrop_chunk) + + sizeof(struct sctphdr) + SCTP_MED_OVERHEAD)); + chk->book_size_scale = 0; + if (was_trunc) { + drp->ch.chunk_flags = SCTP_PACKET_TRUNCATED; + drp->trunc_len = htons(fullsz); + /* + * Len is already adjusted to size minus overhead above take + * out the pkt_drop chunk itself from it. 
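+ *
+ * E.g. with a 1500 byte smallest MTU the echoed data is capped at
+ * 1500 - SCTP_MAX_OVERHEAD bytes and send_size then loses another
+ * sizeof(struct sctp_pktdrop_chunk); the peer still learns the
+ * original length from trunc_len.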
+ */ + chk->send_size = len - sizeof(struct sctp_pktdrop_chunk); + len = chk->send_size; + } else { + /* no truncation needed */ + drp->ch.chunk_flags = 0; + drp->trunc_len = htons(0); + } + if (bad_crc) { + drp->ch.chunk_flags |= SCTP_BADCRC; + } + chk->send_size += sizeof(struct sctp_pktdrop_chunk); + SCTP_BUF_LEN(chk->data) = chk->send_size; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + if (net) { + /* we should hit here */ + chk->whoTo = net; + } else { + chk->whoTo = asoc->primary_destination; + } + atomic_add_int(&chk->whoTo->ref_count, 1); + chk->rec.chunk_id.id = SCTP_PACKET_DROPPED; + chk->rec.chunk_id.can_take_data = 1; + drp->ch.chunk_type = SCTP_PACKET_DROPPED; + drp->ch.chunk_length = htons(chk->send_size); + spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket); + if (spc < 0) { + spc = 0; + } + drp->bottle_bw = htonl(spc); + if (asoc->my_rwnd) { + drp->current_onq = htonl(asoc->size_on_reasm_queue + + asoc->size_on_all_streams + + asoc->my_rwnd_control_len + + stcb->sctp_socket->so_rcv.sb_cc); + } else { + /*- + * If my rwnd is 0, possibly from mbuf depletion as well as + * space used, tell the peer there is NO space aka onq == bw + */ + drp->current_onq = htonl(spc); + } + drp->reserved = 0; + datap = drp->data; + m_copydata(m, iphlen, len, (caddr_t)datap); + TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); + asoc->ctrl_queue_cnt++; +} + +void +sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn) +{ + struct sctp_association *asoc; + struct sctp_cwr_chunk *cwr; + struct sctp_tmit_chunk *chk; + + asoc = &stcb->asoc; + SCTP_TCB_LOCK_ASSERT(stcb); + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->rec.chunk_id.id == SCTP_ECN_CWR) { + /* found a previous ECN_CWR update it if needed */ + cwr = mtod(chk->data, struct sctp_cwr_chunk *); + if (compare_with_wrap(high_tsn, ntohl(cwr->tsn), + MAX_TSN)) { + cwr->tsn = htonl(high_tsn); + } + return; + } + } + /* nope could not find one to update so we must build one */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + return; + } + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_ECN_CWR; + chk->rec.chunk_id.can_take_data = 1; + chk->asoc = &stcb->asoc; + chk->send_size = sizeof(struct sctp_cwr_chunk); + chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_DONTWAIT, 1, MT_HEADER); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk); + return; + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + SCTP_BUF_LEN(chk->data) = chk->send_size; + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = net; + atomic_add_int(&chk->whoTo->ref_count, 1); + cwr = mtod(chk->data, struct sctp_cwr_chunk *); + cwr->ch.chunk_type = SCTP_ECN_CWR; + cwr->ch.chunk_flags = 0; + cwr->ch.chunk_length = htons(sizeof(struct sctp_cwr_chunk)); + cwr->tsn = htonl(high_tsn); + TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); + asoc->ctrl_queue_cnt++; +} + +void +sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk, + int number_entries, uint16_t * list, + uint32_t seq, uint32_t resp_seq, uint32_t last_sent) +{ + int len, old_len, i; + struct sctp_stream_reset_out_request *req_out; + struct sctp_chunkhdr *ch; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + + + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. */ + req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len); + /* now how long will this param be? 
*/ + len = (sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries)); + req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST); + req_out->ph.param_length = htons(len); + req_out->request_seq = htonl(seq); + req_out->response_seq = htonl(resp_seq); + req_out->send_reset_at_tsn = htonl(last_sent); + if (number_entries) { + for (i = 0; i < number_entries; i++) { + req_out->list_of_streams[i] = htons(list[i]); + } + } + if (SCTP_SIZE32(len) > len) { + /*- + * Need to worry about the pad we may end up adding to the + * end. This is easy since the struct is either aligned to 4 + * bytes or 2 bytes off. + */ + req_out->list_of_streams[number_entries] = 0; + } + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->book_size = len + old_len; + chk->book_size_scale = 0; + chk->send_size = SCTP_SIZE32(chk->book_size); + SCTP_BUF_LEN(chk->data) = chk->send_size; + return; +} + + +void +sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk, + int number_entries, uint16_t * list, + uint32_t seq) +{ + int len, old_len, i; + struct sctp_stream_reset_in_request *req_in; + struct sctp_chunkhdr *ch; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + + + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. */ + req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len); + /* now how long will this param be? */ + len = (sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries)); + req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST); + req_in->ph.param_length = htons(len); + req_in->request_seq = htonl(seq); + if (number_entries) { + for (i = 0; i < number_entries; i++) { + req_in->list_of_streams[i] = htons(list[i]); + } + } + if (SCTP_SIZE32(len) > len) { + /*- + * Need to worry about the pad we may end up adding to the + * end. This is easy since the struct is either aligned to 4 + * bytes or 2 bytes off. + */ + req_in->list_of_streams[number_entries] = 0; + } + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->book_size = len + old_len; + chk->book_size_scale = 0; + chk->send_size = SCTP_SIZE32(chk->book_size); + SCTP_BUF_LEN(chk->data) = chk->send_size; + return; +} + + +void +sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk, + uint32_t seq) +{ + int len, old_len; + struct sctp_stream_reset_tsn_request *req_tsn; + struct sctp_chunkhdr *ch; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + + + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. */ + req_tsn = (struct sctp_stream_reset_tsn_request *)((caddr_t)ch + len); + /* now how long will this param be? */ + len = sizeof(struct sctp_stream_reset_tsn_request); + req_tsn->ph.param_type = htons(SCTP_STR_RESET_TSN_REQUEST); + req_tsn->ph.param_length = htons(len); + req_tsn->request_seq = htonl(seq); + + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->send_size = len + old_len; + chk->book_size = SCTP_SIZE32(chk->send_size); + chk->book_size_scale = 0; + SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); + return; +} + +void +sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk, + uint32_t resp_seq, uint32_t result) +{ + int len, old_len; + struct sctp_stream_reset_response *resp; + struct sctp_chunkhdr *ch; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + + + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. 
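These stream-reset request builders all rest on one small invariant: the parameter is a fixed header plus number_entries 16-bit stream ids, so its length is either already 32-bit aligned or exactly 2 bytes short, and writing one extra zeroed uint16_t covers the pad. A standalone check of that arithmetic; the 16-byte header size matches sctp_stream_reset_out_request, but treat it as an assumption:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SIZE32(x) (((x) + 3u) & ~3u)   /* 4-byte rounding, as in SCTP_SIZE32() */

int
main(void)
{
    /* Assume a 16-byte fixed request header followed by uint16_t ids. */
    const unsigned hdr = 16;

    for (unsigned n = 0; n < 6; n++) {
        unsigned len = hdr + n * (unsigned)sizeof(uint16_t);
        unsigned pad = SIZE32(len) - len;

        /* Either aligned already or 2 bytes off, never 1 or 3. */
        assert(pad == 0 || pad == 2);
        printf("%u entries: len=%u pad=%u\n", n, len, pad);
    }
    return (0);
}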
*/ + resp = (struct sctp_stream_reset_response *)((caddr_t)ch + len); + /* now how long will this param be? */ + len = sizeof(struct sctp_stream_reset_response); + resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); + resp->ph.param_length = htons(len); + resp->response_seq = htonl(resp_seq); + resp->result = ntohl(result); + + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->book_size = len + old_len; + chk->book_size_scale = 0; + chk->send_size = SCTP_SIZE32(chk->book_size); + SCTP_BUF_LEN(chk->data) = chk->send_size; + return; + +} + + +void +sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk, + uint32_t resp_seq, uint32_t result, + uint32_t send_una, uint32_t recv_next) +{ + int len, old_len; + struct sctp_stream_reset_response_tsn *resp; + struct sctp_chunkhdr *ch; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + + + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. */ + resp = (struct sctp_stream_reset_response_tsn *)((caddr_t)ch + len); + /* now how long will this param be? */ + len = sizeof(struct sctp_stream_reset_response_tsn); + resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); + resp->ph.param_length = htons(len); + resp->response_seq = htonl(resp_seq); + resp->result = htonl(result); + resp->senders_next_tsn = htonl(send_una); + resp->receivers_next_tsn = htonl(recv_next); + + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->book_size = len + old_len; + chk->send_size = SCTP_SIZE32(chk->book_size); + chk->book_size_scale = 0; + SCTP_BUF_LEN(chk->data) = chk->send_size; + return; +} + +static void +sctp_add_a_stream(struct sctp_tmit_chunk *chk, + uint32_t seq, + uint16_t adding) +{ + int len, old_len; + struct sctp_chunkhdr *ch; + struct sctp_stream_reset_add_strm *addstr; + + ch = mtod(chk->data, struct sctp_chunkhdr *); + old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); + + /* get to new offset for the param. */ + addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len); + /* now how long will this param be? */ + len = sizeof(struct sctp_stream_reset_add_strm); + + /* Fill it out. */ + addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_STREAMS); + addstr->ph.param_length = htons(len); + addstr->request_seq = htonl(seq); + addstr->number_of_streams = htons(adding); + addstr->reserved = 0; + + /* now fix the chunk length */ + ch->chunk_length = htons(len + old_len); + chk->send_size = len + old_len; + chk->book_size = SCTP_SIZE32(chk->send_size); + chk->book_size_scale = 0; + SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); + return; +} + +int +sctp_send_str_reset_req(struct sctp_tcb *stcb, + int number_entries, uint16_t * list, + uint8_t send_out_req, + uint32_t resp_seq, + uint8_t send_in_req, + uint8_t send_tsn_req, + uint8_t add_stream, + uint16_t adding +) +{ + + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk; + struct sctp_chunkhdr *ch; + uint32_t seq; + + asoc = &stcb->asoc; + if (asoc->stream_reset_outstanding) { + /*- + * Already one pending, must get ACK back to clear the flag. 
+ */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY); + return (EBUSY); + } + if ((send_out_req == 0) && (send_in_req == 0) && (send_tsn_req == 0) && + (add_stream == 0)) { + /* nothing to do */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + if (send_tsn_req && (send_out_req || send_in_req)) { + /* error, can't do that */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + chk->copy_by_ref = 0; + chk->rec.chunk_id.id = SCTP_STREAM_RESET; + chk->rec.chunk_id.can_take_data = 0; + chk->asoc = &stcb->asoc; + chk->book_size = sizeof(struct sctp_chunkhdr); + chk->send_size = SCTP_SIZE32(chk->book_size); + chk->book_size_scale = 0; + + chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (chk->data == NULL) { + sctp_free_a_chunk(stcb, chk); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); + + /* setup chunk parameters */ + chk->sent = SCTP_DATAGRAM_UNSENT; + chk->snd_count = 0; + chk->whoTo = asoc->primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + + ch = mtod(chk->data, struct sctp_chunkhdr *); + ch->chunk_type = SCTP_STREAM_RESET; + ch->chunk_flags = 0; + ch->chunk_length = htons(chk->book_size); + SCTP_BUF_LEN(chk->data) = chk->send_size; + + seq = stcb->asoc.str_reset_seq_out; + if (send_out_req) { + sctp_add_stream_reset_out(chk, number_entries, list, + seq, resp_seq, (stcb->asoc.sending_seq - 1)); + asoc->stream_reset_out_is_outstanding = 1; + seq++; + asoc->stream_reset_outstanding++; + } + if (add_stream) { + sctp_add_a_stream(chk, seq, adding); + seq++; + asoc->stream_reset_outstanding++; + } + if (send_in_req) { + sctp_add_stream_reset_in(chk, number_entries, list, seq); + asoc->stream_reset_outstanding++; + } + if (send_tsn_req) { + sctp_add_stream_reset_tsn(chk, seq); + asoc->stream_reset_outstanding++; + } + asoc->str_reset = chk; + + /* insert the chunk for sending */ + TAILQ_INSERT_TAIL(&asoc->control_send_queue, + chk, + sctp_next); + asoc->ctrl_queue_cnt++; + sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); + return (0); +} + +void +sctp_send_abort(struct mbuf *m, int iphlen, struct sctphdr *sh, uint32_t vtag, + struct mbuf *err_cause, uint32_t vrf_id, uint16_t port) +{ + /*- + * Formulate the abort message, and send it back down. 
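One subtlety in the ABORT construction below is the verification-tag rule: when no TCB is known (vtag == 0) the packet reflects the tag from the incoming header and marks the chunk with SCTP_HAD_NO_TCB (the T bit); otherwise the known tag is used with the flag clear. That decision in isolation, with stand-in types; the 0x01 flag value is an assumption mirroring the kernel constant:

#include <arpa/inet.h>
#include <stdint.h>

#define HAD_NO_TCB 0x01                /* T bit, as in SCTP_HAD_NO_TCB (assumed value) */

struct abort_hdr {
    uint32_t v_tag;                    /* network byte order */
    uint8_t  chunk_flags;
};

/* peer_vtag_wire: the v_tag as it arrived, already in network byte order. */
static void
fill_abort_vtag(struct abort_hdr *out, uint32_t vtag, uint32_t peer_vtag_wire)
{
    if (vtag == 0) {
        /* No association found: reflect the peer's tag, set the T bit. */
        out->v_tag = peer_vtag_wire;
        out->chunk_flags = HAD_NO_TCB;
    } else {
        out->v_tag = htonl(vtag);
        out->chunk_flags = 0;
    }
}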
+ */ + struct mbuf *o_pak; + struct mbuf *mout; + struct sctp_abort_msg *abm; + struct ip *iph, *iph_out; + struct udphdr *udp; + +#ifdef INET6 + struct ip6_hdr *ip6, *ip6_out; + +#endif + int iphlen_out, len; + + /* don't respond to ABORT with ABORT */ + if (sctp_is_there_an_abort_here(m, iphlen, &vtag)) { + if (err_cause) + sctp_m_freem(err_cause); + return; + } + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + len = (sizeof(struct ip) + sizeof(struct sctp_abort_msg)); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + len = (sizeof(struct ip6_hdr) + sizeof(struct sctp_abort_msg)); + break; +#endif + default: + if (err_cause) { + sctp_m_freem(err_cause); + } + return; + } + if (port) { + len += sizeof(struct udphdr); + } + mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA); + if (mout == NULL) { + if (err_cause) { + sctp_m_freem(err_cause); + } + return; + } + SCTP_BUF_RESV_UF(mout, max_linkhdr); + SCTP_BUF_LEN(mout) = len; + SCTP_BUF_NEXT(mout) = err_cause; + iph_out = NULL; +#ifdef INET6 + ip6_out = NULL; +#endif + switch (iph->ip_v) { + case IPVERSION: + iph_out = mtod(mout, struct ip *); + + /* Fill in the IP header for the ABORT */ + iph_out->ip_v = IPVERSION; + iph_out->ip_hl = (sizeof(struct ip) / 4); + iph_out->ip_tos = (u_char)0; + iph_out->ip_id = 0; + iph_out->ip_off = 0; + iph_out->ip_ttl = MAXTTL; + if (port) { + iph_out->ip_p = IPPROTO_UDP; + } else { + iph_out->ip_p = IPPROTO_SCTP; + } + iph_out->ip_src.s_addr = iph->ip_dst.s_addr; + iph_out->ip_dst.s_addr = iph->ip_src.s_addr; + /* let IP layer calculate this */ + iph_out->ip_sum = 0; + + iphlen_out = sizeof(*iph_out); + abm = (struct sctp_abort_msg *)((caddr_t)iph_out + iphlen_out); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + ip6 = (struct ip6_hdr *)iph; + ip6_out = mtod(mout, struct ip6_hdr *); + + /* Fill in the IP6 header for the ABORT */ + ip6_out->ip6_flow = ip6->ip6_flow; + ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim); + if (port) { + ip6_out->ip6_nxt = IPPROTO_UDP; + } else { + ip6_out->ip6_nxt = IPPROTO_SCTP; + } + ip6_out->ip6_src = ip6->ip6_dst; + ip6_out->ip6_dst = ip6->ip6_src; + + iphlen_out = sizeof(*ip6_out); + abm = (struct sctp_abort_msg *)((caddr_t)ip6_out + iphlen_out); + break; +#endif /* INET6 */ + default: + /* Currently not supported */ + sctp_m_freem(mout); + return; + } + + udp = (struct udphdr *)abm; + if (port) { + udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + udp->uh_dport = port; + /* set udp->uh_ulen later */ + udp->uh_sum = 0; + iphlen_out += sizeof(struct udphdr); + abm = (struct sctp_abort_msg *)((caddr_t)abm + sizeof(struct udphdr)); + } + abm->sh.src_port = sh->dest_port; + abm->sh.dest_port = sh->src_port; + abm->sh.checksum = 0; + if (vtag == 0) { + abm->sh.v_tag = sh->v_tag; + abm->msg.ch.chunk_flags = SCTP_HAD_NO_TCB; + } else { + abm->sh.v_tag = htonl(vtag); + abm->msg.ch.chunk_flags = 0; + } + abm->msg.ch.chunk_type = SCTP_ABORT_ASSOCIATION; + + if (err_cause) { + struct mbuf *m_tmp = err_cause; + int err_len = 0; + + /* get length of the err_cause chain */ + while (m_tmp != NULL) { + err_len += SCTP_BUF_LEN(m_tmp); + m_tmp = SCTP_BUF_NEXT(m_tmp); + } + len = SCTP_BUF_LEN(mout) + err_len; + if (err_len % 4) { + /* need pad at end of chunk */ + uint32_t cpthis = 0; + int padlen; + + padlen = 4 - (len % 4); + m_copyback(mout, len, padlen, (caddr_t)&cpthis); + len += padlen; + } + abm->msg.ch.chunk_length = htons(sizeof(abm->msg.ch) + err_len); + } else { + len = SCTP_BUF_LEN(mout); + 
abm->msg.ch.chunk_length = htons(sizeof(abm->msg.ch)); + } + + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { + /* no mbuf's */ + sctp_m_freem(mout); + return; + } + if (iph_out != NULL) { + sctp_route_t ro; + int ret; + + /* zap the stack pointer to the route */ + bzero(&ro, sizeof ro); + if (port) { + udp->uh_ulen = htons(len - sizeof(struct ip)); + udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "sctp_send_abort calling ip_output:\n"); + SCTPDBG_PKT(SCTP_DEBUG_OUTPUT2, iph_out, &abm->sh); + /* set IPv4 length */ + iph_out->ip_len = len; + /* out it goes */ +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, len); +#endif + SCTP_ATTACH_CHAIN(o_pak, mout, len); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + abm->sh.checksum = sctp_calculate_cksum(mout, iphlen_out); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + SCTP_ENABLE_UDP_CSUM(o_pak); + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#ifdef INET6 + if (ip6_out != NULL) { + struct route_in6 ro; + int ret; + struct ifnet *ifp = NULL; + + /* zap the stack pointer to the route */ + bzero(&ro, sizeof(ro)); + if (port) { + udp->uh_ulen = htons(len - sizeof(struct ip6_hdr)); + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "sctp_send_abort calling ip6_output:\n"); + SCTPDBG_PKT(SCTP_DEBUG_OUTPUT2, (struct ip *)ip6_out, &abm->sh); + ip6_out->ip6_plen = len - sizeof(*ip6_out); +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, len); +#endif + SCTP_ATTACH_CHAIN(o_pak, mout, len); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + abm->sh.checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) { + udp->uh_sum = 0xffff; + } + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#endif + SCTP_STAT_INCR(sctps_sendpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); +} + +void +sctp_send_operr_to(struct mbuf *m, int iphlen, struct mbuf *scm, uint32_t vtag, + uint32_t vrf_id, uint16_t port) +{ + struct mbuf *o_pak; + struct sctphdr *sh, *sh_out; + struct sctp_chunkhdr *ch; + struct ip *iph, *iph_out; + struct udphdr *udp = NULL; + struct mbuf *mout; + +#ifdef INET6 + struct ip6_hdr *ip6, *ip6_out; + +#endif + int iphlen_out, len; + + iph = mtod(m, struct ip *); + sh = (struct sctphdr *)((caddr_t)iph + iphlen); + switch (iph->ip_v) { + case IPVERSION: + len = (sizeof(struct ip) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + len = (sizeof(struct ip6_hdr) + sizeof(struct sctphdr) + sizeof(struct 
sctp_chunkhdr)); + break; +#endif + default: + if (scm) { + sctp_m_freem(scm); + } + return; + } + if (port) { + len += sizeof(struct udphdr); + } + mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_DONTWAIT, 1, MT_DATA); + if (mout == NULL) { + if (scm) { + sctp_m_freem(scm); + } + return; + } + SCTP_BUF_RESV_UF(mout, max_linkhdr); + SCTP_BUF_LEN(mout) = len; + SCTP_BUF_NEXT(mout) = scm; + iph_out = NULL; +#ifdef INET6 + ip6_out = NULL; +#endif + switch (iph->ip_v) { + case IPVERSION: + iph_out = mtod(mout, struct ip *); + + /* Fill in the IP header for the ABORT */ + iph_out->ip_v = IPVERSION; + iph_out->ip_hl = (sizeof(struct ip) / 4); + iph_out->ip_tos = (u_char)0; + iph_out->ip_id = 0; + iph_out->ip_off = 0; + iph_out->ip_ttl = MAXTTL; + if (port) { + iph_out->ip_p = IPPROTO_UDP; + } else { + iph_out->ip_p = IPPROTO_SCTP; + } + iph_out->ip_src.s_addr = iph->ip_dst.s_addr; + iph_out->ip_dst.s_addr = iph->ip_src.s_addr; + /* let IP layer calculate this */ + iph_out->ip_sum = 0; + + iphlen_out = sizeof(struct ip); + sh_out = (struct sctphdr *)((caddr_t)iph_out + iphlen_out); + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + ip6 = (struct ip6_hdr *)iph; + ip6_out = mtod(mout, struct ip6_hdr *); + + /* Fill in the IP6 header for the ABORT */ + ip6_out->ip6_flow = ip6->ip6_flow; + ip6_out->ip6_hlim = MODULE_GLOBAL(ip6_defhlim); + if (port) { + ip6_out->ip6_nxt = IPPROTO_UDP; + } else { + ip6_out->ip6_nxt = IPPROTO_SCTP; + } + ip6_out->ip6_src = ip6->ip6_dst; + ip6_out->ip6_dst = ip6->ip6_src; + + iphlen_out = sizeof(struct ip6_hdr); + sh_out = (struct sctphdr *)((caddr_t)ip6_out + iphlen_out); + break; +#endif /* INET6 */ + default: + /* Currently not supported */ + sctp_m_freem(mout); + return; + } + + udp = (struct udphdr *)sh_out; + if (port) { + udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + udp->uh_dport = port; + /* set udp->uh_ulen later */ + udp->uh_sum = 0; + iphlen_out += sizeof(struct udphdr); + sh_out = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); + } + sh_out->src_port = sh->dest_port; + sh_out->dest_port = sh->src_port; + sh_out->v_tag = vtag; + sh_out->checksum = 0; + + ch = (struct sctp_chunkhdr *)((caddr_t)sh_out + sizeof(struct sctphdr)); + ch->chunk_type = SCTP_OPERATION_ERROR; + ch->chunk_flags = 0; + + if (scm) { + struct mbuf *m_tmp = scm; + int cause_len = 0; + + /* get length of the err_cause chain */ + while (m_tmp != NULL) { + cause_len += SCTP_BUF_LEN(m_tmp); + m_tmp = SCTP_BUF_NEXT(m_tmp); + } + len = SCTP_BUF_LEN(mout) + cause_len; + if (cause_len % 4) { + /* need pad at end of chunk */ + uint32_t cpthis = 0; + int padlen; + + padlen = 4 - (len % 4); + m_copyback(mout, len, padlen, (caddr_t)&cpthis); + len += padlen; + } + ch->chunk_length = htons(sizeof(struct sctp_chunkhdr) + cause_len); + } else { + len = SCTP_BUF_LEN(mout); + ch->chunk_length = htons(sizeof(struct sctp_chunkhdr)); + } + + if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { + /* no mbuf's */ + sctp_m_freem(mout); + return; + } + if (iph_out != NULL) { + sctp_route_t ro; + int ret; + + /* zap the stack pointer to the route */ + bzero(&ro, sizeof ro); + if (port) { + udp->uh_ulen = htons(len - sizeof(struct ip)); + udp->uh_sum = in_pseudo(iph_out->ip_src.s_addr, iph_out->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); + } + /* set IPv4 length */ + iph_out->ip_len = len; + /* out it goes */ +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, len); +#endif + SCTP_ATTACH_CHAIN(o_pak, mout, len); + 
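Both output paths here pick between a software checksum (sctp_calculate_cksum) and offload via the CSUM_SCTP flags. For context, SCTP's checksum is CRC32c (RFC 4960), not the Internet checksum; a table-free, bit-at-a-time sketch of CRC32c follows, purely illustrative and far slower than a production implementation:

#include <stddef.h>
#include <stdint.h>

/* Reflected CRC32c (Castagnoli); reflected polynomial 0x82F63B78. */
static uint32_t
crc32c(const uint8_t *buf, size_t len)
{
    uint32_t crc = 0xFFFFFFFFu;

    for (size_t i = 0; i < len; i++) {
        crc ^= buf[i];
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1));
    }
    return (~crc);
}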
if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + sh_out->checksum = sctp_calculate_cksum(mout, iphlen_out); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + SCTP_ENABLE_UDP_CSUM(o_pak); + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_IP_OUTPUT(ret, o_pak, &ro, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#ifdef INET6 + if (ip6_out != NULL) { + struct route_in6 ro; + int ret; + struct ifnet *ifp = NULL; + + /* zap the stack pointer to the route */ + bzero(&ro, sizeof(ro)); + if (port) { + udp->uh_ulen = htons(len - sizeof(struct ip6_hdr)); + } + ip6_out->ip6_plen = len - sizeof(*ip6_out); +#ifdef SCTP_PACKET_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) + sctp_packet_log(mout, len); +#endif + SCTP_ATTACH_CHAIN(o_pak, mout, len); + if (port) { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + sh_out->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); + SCTP_STAT_INCR(sctps_sendswcrc); +#endif + if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) { + udp->uh_sum = 0xffff; + } + } else { +#if defined(SCTP_WITH_NO_CSUM) + SCTP_STAT_INCR(sctps_sendnocrc); +#else + mout->m_pkthdr.csum_flags = CSUM_SCTP; + mout->m_pkthdr.csum_data = 0; + SCTP_STAT_INCR(sctps_sendhwcrc); +#endif + } + SCTP_IP6_OUTPUT(ret, o_pak, &ro, &ifp, NULL, vrf_id); + + /* Free the route if we got one back */ + if (ro.ro_rt) + RTFREE(ro.ro_rt); + } +#endif + SCTP_STAT_INCR(sctps_sendpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outpackets); + SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); +} + +static struct mbuf * +sctp_copy_resume(struct sctp_stream_queue_pending *sp, + struct uio *uio, + struct sctp_sndrcvinfo *srcv, + int max_send_len, + int user_marks_eor, + int *error, + uint32_t * sndout, + struct mbuf **new_tail) +{ + struct mbuf *m; + + m = m_uiotombuf(uio, M_WAITOK, max_send_len, 0, + (M_PKTHDR | (user_marks_eor ? M_EOR : 0))); + if (m == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + *error = ENOMEM; + } else { + *sndout = m_length(m, NULL); + *new_tail = m_last(m); + } + return (m); +} + +static int +sctp_copy_one(struct sctp_stream_queue_pending *sp, + struct uio *uio, + int resv_upfront) +{ + int left; + + left = sp->length; + sp->data = m_uiotombuf(uio, M_WAITOK, sp->length, + resv_upfront, 0); + if (sp->data == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + return (ENOMEM); + } + sp->tail_mbuf = m_last(sp->data); + return (0); +} + + + +static struct sctp_stream_queue_pending * +sctp_copy_it_in(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_sndrcvinfo *srcv, + struct uio *uio, + struct sctp_nets *net, + int max_send_len, + int user_marks_eor, + int *error, + int non_blocking) +{ + /*- + * This routine must be very careful in its work. Protocol + * processing is up and running so care must be taken to spl...() + * when you need to do something that may affect the stcb/asoc. The + * sb is locked, however. When data is copied, the protocol processing + * should be enabled, since this is a slower operation... + */ + struct sctp_stream_queue_pending *sp = NULL; + int resv_in_first; + + *error = 0; + /* Now can we send this?
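The completeness check just below decides up front whether this copy finishes the user's message: the whole remaining uio residual must fit, and in explicit-EOR mode the caller must also have flagged SCTP_EOF or SCTP_EOR. The predicate, restated as a standalone function; the flag values are stand-ins, not the kernel's:

#include <stdint.h>

#define FLAG_EOF 0x0100                /* stand-in for SCTP_EOF */
#define FLAG_EOR 0x0200                /* stand-in for SCTP_EOR */

static int
msg_is_complete(uint32_t copied, uint32_t resid, int user_marks_eor,
    uint16_t sinfo_flags)
{
    if (copied != resid)
        return (0);                    /* more user data still to come */
    if (!user_marks_eor)
        return (1);                    /* implicit EOR: one send == one message */
    return ((sinfo_flags & (FLAG_EOF | FLAG_EOR)) != 0);
}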
*/ + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || + (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { + /* got data while shutting down */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); + *error = ECONNRESET; + goto out_now; + } + sctp_alloc_a_strmoq(stcb, sp); + if (sp == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + *error = ENOMEM; + goto out_now; + } + sp->act_flags = 0; + sp->sender_all_done = 0; + sp->sinfo_flags = srcv->sinfo_flags; + sp->timetolive = srcv->sinfo_timetolive; + sp->ppid = srcv->sinfo_ppid; + sp->context = srcv->sinfo_context; + sp->strseq = 0; + (void)SCTP_GETTIME_TIMEVAL(&sp->ts); + + sp->stream = srcv->sinfo_stream; + sp->length = min(uio->uio_resid, max_send_len); + if ((sp->length == (uint32_t) uio->uio_resid) && + ((user_marks_eor == 0) || + (srcv->sinfo_flags & SCTP_EOF) || + (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { + sp->msg_is_complete = 1; + } else { + sp->msg_is_complete = 0; + } + sp->sender_all_done = 0; + sp->some_taken = 0; + sp->put_last_out = 0; + resv_in_first = sizeof(struct sctp_data_chunk); + sp->data = sp->tail_mbuf = NULL; + if (sp->length == 0) { + *error = 0; + goto skip_copy; + } + sp->auth_keyid = stcb->asoc.authinfo.active_keyid; + if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { + sctp_auth_key_acquire(stcb, stcb->asoc.authinfo.active_keyid); + sp->holds_key_ref = 1; + } + *error = sctp_copy_one(sp, uio, resv_in_first); +skip_copy: + if (*error) { + sctp_free_a_strmoq(stcb, sp); + sp = NULL; + } else { + if (sp->sinfo_flags & SCTP_ADDR_OVER) { + sp->net = net; + atomic_add_int(&sp->net->ref_count, 1); + } else { + sp->net = NULL; + } + sctp_set_prsctp_policy(sp); + } +out_now: + return (sp); +} + + +int +sctp_sosend(struct socket *so, + struct sockaddr *addr, + struct uio *uio, + struct mbuf *top, + struct mbuf *control, + int flags, + struct thread *p +) +{ + int error, use_rcvinfo = 0; + struct sctp_sndrcvinfo srcv; + struct sockaddr *addr_to_use; + +#if defined(INET) && defined(INET6) + struct sockaddr_in sin; + +#endif + + if (control) { + /* process cmsg snd/rcv info (maybe an assoc-id) */ + if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&srcv, control, + sizeof(srcv))) { + /* got one */ + use_rcvinfo = 1; + } + } + addr_to_use = addr; +#if defined(INET) && defined(INET6) + if ((addr) && (addr->sa_family == AF_INET6)) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)addr; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + in6_sin6_2_sin(&sin, sin6); + addr_to_use = (struct sockaddr *)&sin; + } + } +#endif + error = sctp_lower_sosend(so, addr_to_use, uio, top, + control, + flags, + use_rcvinfo ?
&srcv : NULL + ,p + ); + return (error); +} + + +int +sctp_lower_sosend(struct socket *so, + struct sockaddr *addr, + struct uio *uio, + struct mbuf *i_pak, + struct mbuf *control, + int flags, + struct sctp_sndrcvinfo *srcv + , + struct thread *p +) +{ + unsigned int sndlen = 0, max_len; + int error, len; + struct mbuf *top = NULL; + int queue_only = 0, queue_only_for_init = 0; + int free_cnt_applied = 0; + int un_sent; + int now_filled = 0; + unsigned int inqueue_bytes = 0; + struct sctp_block_entry be; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb = NULL; + struct timeval now; + struct sctp_nets *net; + struct sctp_association *asoc; + struct sctp_inpcb *t_inp; + int user_marks_eor; + int create_lock_applied = 0; + int nagle_applies = 0; + int some_on_control = 0; + int got_all_of_the_send = 0; + int hold_tcblock = 0; + int non_blocking = 0; + uint32_t local_add_more, local_soresv = 0; + uint16_t port; + uint16_t sinfo_flags; + sctp_assoc_t sinfo_assoc_id; + + error = 0; + net = NULL; + stcb = NULL; + asoc = NULL; + + t_inp = inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + if (i_pak) { + SCTP_RELEASE_PKT(i_pak); + } + return (error); + } + if ((uio == NULL) && (i_pak == NULL)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + user_marks_eor = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); + atomic_add_int(&inp->total_sends, 1); + if (uio) { + if (uio->uio_resid < 0) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + return (EINVAL); + } + sndlen = uio->uio_resid; + } else { + top = SCTP_HEADER_TO_CHAIN(i_pak); + sndlen = SCTP_HEADER_LEN(i_pak); + } + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %d\n", + addr, + sndlen); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && + (inp->sctp_socket->so_qlimit)) { + /* The listener can NOT send */ + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); + error = ENOTCONN; + goto out_unlocked; + } + /** + * Pre-screen address, if one is given the sin-len + * must be set correctly! 
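As sctp_sosend() above shows, a v4-mapped IPv6 destination is rewritten into a plain sockaddr_in (via in6_sin6_2_sin) before the lower send path runs. A portable userland sketch of that unwrapping, using the standard IN6_IS_ADDR_V4MAPPED test and omitting the BSD-specific sin_len field for brevity:

#include <netinet/in.h>
#include <string.h>

/* Unwrap ::ffff:a.b.c.d into a sockaddr_in; returns 0 if not v4-mapped. */
static int
unmap_v4mapped(const struct sockaddr_in6 *sin6, struct sockaddr_in *sin)
{
    if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
        return (0);
    memset(sin, 0, sizeof(*sin));
    sin->sin_family = AF_INET;
    sin->sin_port = sin6->sin6_port;
    /* The IPv4 address sits in the last 4 bytes of the IPv6 address. */
    memcpy(&sin->sin_addr, &sin6->sin6_addr.s6_addr[12], sizeof(sin->sin_addr));
    return (1);
}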
+ */ + if (addr) { + union sctp_sockstore *raddr = (union sctp_sockstore *)addr; + + switch (raddr->sa.sa_family) { +#if defined(INET) + case AF_INET: + if (raddr->sin.sin_len != sizeof(struct sockaddr_in)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + port = raddr->sin.sin_port; + break; +#endif +#if defined(INET6) + case AF_INET6: + if (raddr->sin6.sin6_len != sizeof(struct sockaddr_in6)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + port = raddr->sin6.sin6_port; + break; +#endif + default: + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAFNOSUPPORT); + error = EAFNOSUPPORT; + goto out_unlocked; + } + } else + port = 0; + + if (srcv) { + sinfo_flags = srcv->sinfo_flags; + sinfo_assoc_id = srcv->sinfo_assoc_id; + if (INVALID_SINFO_FLAG(sinfo_flags) || + PR_SCTP_INVALID_POLICY(sinfo_flags)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + if (srcv->sinfo_flags) + SCTP_STAT_INCR(sctps_sends_with_flags); + } else { + sinfo_flags = inp->def_send.sinfo_flags; + sinfo_assoc_id = inp->def_send.sinfo_assoc_id; + } + if (sinfo_flags & SCTP_SENDALL) { + /* its a sendall */ + error = sctp_sendall(inp, uio, top, srcv); + top = NULL; + goto out_unlocked; + } + if ((sinfo_flags & SCTP_ADDR_OVER) && (addr == NULL)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + /* now we must find the assoc */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + SCTP_INP_RLOCK(inp); + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); + error = ENOTCONN; + goto out_unlocked; + } + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + SCTP_INP_RUNLOCK(inp); + } else if (sinfo_assoc_id) { + stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 0); + } else if (addr) { + /*- + * Since we did not use findep we must + * increment it, and if we don't find a tcb + * decrement it. + */ + SCTP_INP_WLOCK(inp); + SCTP_INP_INCR_REF(inp); + SCTP_INP_WUNLOCK(inp); + stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } else { + hold_tcblock = 1; + } + } + if ((stcb == NULL) && (addr)) { + /* Possible implicit send? */ + SCTP_ASOC_CREATE_LOCK(inp); + create_lock_applied = 1; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + /* Should I really unlock ? 
*/ + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + + } + if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && + (addr->sa_family == AF_INET6)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + SCTP_INP_WLOCK(inp); + SCTP_INP_INCR_REF(inp); + SCTP_INP_WUNLOCK(inp); + /* With the lock applied look again */ + stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } else { + hold_tcblock = 1; + } + if (t_inp != inp) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); + error = ENOTCONN; + goto out_unlocked; + } + } + if (stcb == NULL) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); + error = ENOTCONN; + goto out_unlocked; + } + if (addr == NULL) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); + error = ENOENT; + goto out_unlocked; + } else { + /* + * UDP style, we must go ahead and start the INIT + * process + */ + uint32_t vrf_id; + + if ((sinfo_flags & SCTP_ABORT) || + ((sinfo_flags & SCTP_EOF) && (sndlen == 0))) { + /*- + * User asks to abort a non-existent assoc, + * or EOF a non-existent assoc with no data + */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); + error = ENOENT; + goto out_unlocked; + } + /* get an asoc/stcb struct */ + vrf_id = inp->def_vrf_id; +#ifdef INVARIANTS + if (create_lock_applied == 0) { + panic("Error, should hold create lock and I don't?"); + } +#endif + stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, + p + ); + if (stcb == NULL) { + /* Error is setup for us in the call */ + goto out_unlocked; + } + if (create_lock_applied) { + SCTP_ASOC_CREATE_UNLOCK(inp); + create_lock_applied = 0; + } else { + SCTP_PRINTF("Huh-3? create lock should have been on??\n"); + } + /* + * Turn on queue only flag to prevent data from + * being sent + */ + queue_only = 1; + asoc = &stcb->asoc; + SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); + (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); + + /* initialize authentication params for the assoc */ + sctp_initialize_auth_params(inp, stcb); + + if (control) { + /* + * see if an init structure exists in cmsg + * headers + */ + struct sctp_initmsg initm; + int i; + + if (sctp_find_cmsg(SCTP_INIT, (void *)&initm, control, + sizeof(initm))) { + /* + * we have an INIT override of the + * default + */ + if (initm.sinit_max_attempts) + asoc->max_init_times = initm.sinit_max_attempts; + if (initm.sinit_num_ostreams) + asoc->pre_open_streams = initm.sinit_num_ostreams; + if (initm.sinit_max_instreams) + asoc->max_inbound_streams = initm.sinit_max_instreams; + if (initm.sinit_max_init_timeo) + asoc->initial_init_rto_max = initm.sinit_max_init_timeo; + if (asoc->streamoutcnt < asoc->pre_open_streams) { + struct sctp_stream_out *tmp_str; + int had_lock = 0; + + /* Default is NOT correct */ + SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, defout:%d pre_open:%d\n", + asoc->streamoutcnt, asoc->pre_open_streams); + /* + * What happens if this + * fails? We panic ...
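The cmsg handling above is the kernel side of the standard SCTP_INIT ancillary data (RFC 6458): an application can override the default stream counts and INIT retry limits per sendmsg() call. A sketch of how the matching control message is built in userland, assuming an RFC 6458 netinet/sctp.h is available; the values are arbitrary:

#include <netinet/in.h>
#include <netinet/sctp.h>      /* struct sctp_initmsg, SCTP_INIT */
#include <string.h>
#include <sys/socket.h>

/* cbuf must be at least CMSG_SPACE(sizeof(struct sctp_initmsg)) bytes. */
static void
build_init_cmsg(struct msghdr *msg, char *cbuf, size_t cbuflen)
{
    struct cmsghdr *cmsg;
    struct sctp_initmsg init;

    memset(&init, 0, sizeof(init));
    init.sinit_num_ostreams = 8;       /* becomes asoc->pre_open_streams */
    init.sinit_max_instreams = 8;      /* becomes asoc->max_inbound_streams */
    init.sinit_max_attempts = 4;       /* becomes asoc->max_init_times */

    msg->msg_control = cbuf;
    msg->msg_controllen = cbuflen;
    cmsg = CMSG_FIRSTHDR(msg);
    cmsg->cmsg_level = IPPROTO_SCTP;
    cmsg->cmsg_type = SCTP_INIT;
    cmsg->cmsg_len = CMSG_LEN(sizeof(init));
    memcpy(CMSG_DATA(cmsg), &init, sizeof(init));
    msg->msg_controllen = cmsg->cmsg_len;
}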
+ */ + + if (hold_tcblock) { + had_lock = 1; + SCTP_TCB_UNLOCK(stcb); + } + SCTP_MALLOC(tmp_str, + struct sctp_stream_out *, + (asoc->pre_open_streams * + sizeof(struct sctp_stream_out)), + SCTP_M_STRMO); + if (had_lock) { + SCTP_TCB_LOCK(stcb); + } + if (tmp_str != NULL) { + SCTP_FREE(asoc->strmout, SCTP_M_STRMO); + asoc->strmout = tmp_str; + asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams; + } else { + asoc->pre_open_streams = asoc->streamoutcnt; + } + for (i = 0; i < asoc->streamoutcnt; i++) { + /*- + * inbound side must be set + * to 0xffff, also NOTE when + * we get the INIT-ACK back + * (for INIT sender) we MUST + * reduce the count + * (streamoutcnt) but first + * check if we sent to any + * of the upper streams that + * were dropped (if some + * were). Those that were + * dropped must be notified + * to the upper layer as + * failed to send. + */ + asoc->strmout[i].next_sequence_sent = 0x0; + TAILQ_INIT(&asoc->strmout[i].outqueue); + asoc->strmout[i].stream_no = i; + asoc->strmout[i].last_msg_incomplete = 0; + asoc->strmout[i].next_spoke.tqe_next = 0; + asoc->strmout[i].next_spoke.tqe_prev = 0; + } + } + } + } + hold_tcblock = 1; + /* out with the INIT */ + queue_only_for_init = 1; + /*- + * we may want to dig in after this call and adjust the MTU + * value. It defaulted to 1500 (constant) but the ro + * structure may now have an update and thus we may need to + * change it BEFORE we append the message. + */ + } + } else + asoc = &stcb->asoc; + if (srcv == NULL) + srcv = (struct sctp_sndrcvinfo *)&asoc->def_send; + if (srcv->sinfo_flags & SCTP_ADDR_OVER) { + if (addr) + net = sctp_findnet(stcb, addr); + else + net = NULL; + if ((net == NULL) || + ((port != 0) && (port != stcb->rport))) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + } else { + net = stcb->asoc.primary_destination; + } + atomic_add_int(&stcb->total_sends, 1); + /* Keep the stcb from being freed under our feet */ + atomic_add_int(&asoc->refcnt, 1); + free_cnt_applied = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) { + if (sndlen > asoc->smallest_mtu) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); + error = EMSGSIZE; + goto out_unlocked; + } + } + if ((SCTP_SO_IS_NBIO(so) + || (flags & MSG_NBIO) + )) { + non_blocking = 1; + } + /* would we block? */ + if (non_blocking) { + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); + if ((SCTP_SB_LIMIT_SND(so) < (sndlen + inqueue_bytes + stcb->asoc.sb_send_resv)) || + (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK); + if (sndlen > SCTP_SB_LIMIT_SND(so)) + error = EMSGSIZE; + else + error = EWOULDBLOCK; + goto out_unlocked; + } + stcb->asoc.sb_send_resv += sndlen; + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } else { + atomic_add_int(&stcb->asoc.sb_send_resv, sndlen); + } + local_soresv = sndlen; + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); + error = ECONNRESET; + goto out_unlocked; + } + if (create_lock_applied) { + SCTP_ASOC_CREATE_UNLOCK(inp); + create_lock_applied = 0; + } + if (asoc->stream_reset_outstanding) { + /* + * Can't queue any data while stream reset is underway. 
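The non-blocking check above reduces to one piece of arithmetic: the bytes already queued (net of the per-chunk header overhead folded into the total), plus the new send and any outstanding reservation, must fit under the socket send limit, and the chunk count must stay under its sysctl cap. Restated as a standalone predicate with hypothetical parameter names:

#include <stdint.h>

/* Nonzero if a non-blocking send of sndlen would have to block. */
static int
send_would_block(uint32_t sb_limit, uint32_t total_queued,
    uint32_t chunks_on_queue, uint32_t chunk_hdr_size,
    uint32_t sb_send_resv, uint32_t sndlen, uint32_t max_chunks)
{
    uint32_t inqueue_bytes;

    /* Queued payload, net of the data-chunk headers counted in it. */
    inqueue_bytes = total_queued - chunks_on_queue * chunk_hdr_size;
    if (sb_limit < sndlen + inqueue_bytes + sb_send_resv)
        return (1);
    if (chunks_on_queue >= max_chunks)
        return (1);
    return (0);
}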
+ */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAGAIN); + error = EAGAIN; + goto out_unlocked; + } + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + queue_only = 1; + } + /* we are now done with all control */ + if (control) { + sctp_m_freem(control); + control = NULL; + } + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_SENT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_ACK_SENT) || + (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { + if (srcv->sinfo_flags & SCTP_ABORT) { + ; + } else { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); + error = ECONNRESET; + goto out_unlocked; + } + } +#ifndef __rtems__ + /* Ok, we will attempt a msgsnd :> */ + if (p) { + p->td_ru.ru_msgsnd++; + } +#endif /* __rtems__ */ + /* Are we aborting? */ + if (srcv->sinfo_flags & SCTP_ABORT) { + struct mbuf *mm; + int tot_demand, tot_out = 0, max_out; + + SCTP_STAT_INCR(sctps_sends_with_abort); + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) { + /* It has to be up before we abort */ + /* how big is the user-initiated abort? */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out; + } + if (hold_tcblock) { + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } + if (top) { + struct mbuf *cntm = NULL; + + mm = sctp_get_mbuf_for_msg(1, 0, M_WAIT, 1, MT_DATA); + if (sndlen != 0) { + cntm = top; + while (cntm) { + tot_out += SCTP_BUF_LEN(cntm); + cntm = SCTP_BUF_NEXT(cntm); + } + } + tot_demand = (tot_out + sizeof(struct sctp_paramhdr)); + } else { + /* Must fit in an MTU */ + tot_out = sndlen; + tot_demand = (tot_out + sizeof(struct sctp_paramhdr)); + if (tot_demand > SCTP_DEFAULT_ADD_MORE) { + /* Too big */ + SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); + error = EMSGSIZE; + goto out; + } + mm = sctp_get_mbuf_for_msg(tot_demand, 0, M_WAIT, 1, MT_DATA); + } + if (mm == NULL) { + SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); + error = ENOMEM; + goto out; + } + max_out = asoc->smallest_mtu - sizeof(struct sctp_paramhdr); + max_out -= sizeof(struct sctp_abort_msg); + if (tot_out > max_out) { + tot_out = max_out; + } + if (mm) { + struct sctp_paramhdr *ph; + + /* now move forward the data pointer */ + ph = mtod(mm, struct sctp_paramhdr *); + ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons((sizeof(struct sctp_paramhdr) + tot_out)); + ph++; + SCTP_BUF_LEN(mm) = tot_out + sizeof(struct sctp_paramhdr); + if (top == NULL) { + error = uiomove((caddr_t)ph, (int)tot_out, uio); + if (error) { + /*- + * Here if we can't get the user's data we + * still abort; we just don't get to + * send the user's note :-0 + */ + sctp_m_freem(mm); + mm = NULL; + } + } else { + if (sndlen != 0) { + SCTP_BUF_NEXT(mm) = top; + } + } + } + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + atomic_add_int(&stcb->asoc.refcnt, -1); + free_cnt_applied = 0; + /* release this lock, otherwise we hang on ourselves */ + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_RESPONSE_TO_USER_REQ, + mm, SCTP_SO_LOCKED); + /* now relock the stcb so everything is sane */ + hold_tcblock = 0; + stcb = NULL; + /* + * In this case top is already chained to mm; avoid a double + * free, since we free it below if top != NULL and the driver + * would free it after sending the packet out + */ + if (sndlen != 0)
{ + top = NULL; + } + goto out_unlocked; + } + /* Calculate the maximum we can send */ + inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); + if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { + if (non_blocking) { + /* we already checked for non-blocking above. */ + max_len = sndlen; + } else { + max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; + } + } else { + max_len = 0; + } + if (hold_tcblock) { + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } + /* Is the stream no. valid? */ + if (srcv->sinfo_stream >= asoc->streamoutcnt) { + /* Invalid stream number */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + if (asoc->strmout == NULL) { + /* huh? software error */ + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); + error = EFAULT; + goto out_unlocked; + } + /* Unless E_EOR mode is on, we must make a send FIT in one call. */ + if ((user_marks_eor == 0) && + (sndlen > SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { + /* It will NEVER fit */ + SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); + error = EMSGSIZE; + goto out_unlocked; + } + if ((uio == NULL) && user_marks_eor) { + /*- + * We do not support eeor mode for + * sending with mbuf chains (like sendfile). + */ + SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out_unlocked; + } + if (user_marks_eor) { + local_add_more = min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); + } else { + /*- + * For non-eeor the whole message must fit in + * the socket send buffer. + */ + local_add_more = sndlen; + } + len = 0; + if (non_blocking) { + goto skip_preblock; + } + if (((max_len <= local_add_more) && + (SCTP_SB_LIMIT_SND(so) >= local_add_more)) || + (max_len == 0) || + ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { + /* No room right now ! 
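Backing up a step: in the SCTP_ABORT path above, whatever the user supplies is prefixed with an error-cause header, so the peer receives a User-Initiated Abort cause (code 12 in RFC 4960) whose length covers the header plus the user's note. A minimal sketch of that layout, with a stand-in paramhdr type:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

struct paramhdr {                      /* stand-in for struct sctp_paramhdr */
    uint16_t param_type;
    uint16_t param_length;
};

#define CAUSE_USER_INITIATED_ABT 12    /* RFC 4960, User-Initiated Abort */

/* Write the cause header plus user data into buf; returns bytes used. */
static size_t
build_user_abort_cause(uint8_t *buf, const uint8_t *note, size_t note_len)
{
    struct paramhdr ph;

    ph.param_type = htons(CAUSE_USER_INITIATED_ABT);
    ph.param_length = htons((uint16_t)(sizeof(ph) + note_len));
    memcpy(buf, &ph, sizeof(ph));
    memcpy(buf + sizeof(ph), note, note_len);
    return (sizeof(ph) + note_len);
}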
*/ + SOCKBUF_LOCK(&so->so_snd); + inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); + while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) || + ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %d) || (%d+%d > %d)\n", + (unsigned int)SCTP_SB_LIMIT_SND(so), + inqueue_bytes, + local_add_more, + stcb->asoc.stream_queue_cnt, + stcb->asoc.chunks_on_out_queue, + SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + sctp_log_block(SCTP_BLOCK_LOG_INTO_BLKA, so, asoc, sndlen); + } + be.error = 0; + stcb->block_entry = &be; + error = sbwait(&so->so_snd); + stcb->block_entry = NULL; + if (error || so->so_error || be.error) { + if (error == 0) { + if (so->so_error) + error = so->so_error; + if (be.error) { + error = be.error; + } + } + SOCKBUF_UNLOCK(&so->so_snd); + goto out_unlocked; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, + so, asoc, stcb->asoc.total_output_queue_size); + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + goto out_unlocked; + } + inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); + } + if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { + max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; + } else { + max_len = 0; + } + SOCKBUF_UNLOCK(&so->so_snd); + } +skip_preblock: + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + goto out_unlocked; + } + /* + * sndlen covers for mbuf case uio_resid covers for the non-mbuf + * case NOTE: uio will be null when top/mbuf is passed + */ + if (sndlen == 0) { + if (srcv->sinfo_flags & SCTP_EOF) { + got_all_of_the_send = 1; + goto dataless_eof; + } else { + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out; + } + } + if (top == NULL) { + struct sctp_stream_queue_pending *sp; + struct sctp_stream_out *strm; + uint32_t sndout; + + SCTP_TCB_SEND_LOCK(stcb); + if ((asoc->stream_locked) && + (asoc->stream_locked_on != srcv->sinfo_stream)) { + SCTP_TCB_SEND_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out; + } + SCTP_TCB_SEND_UNLOCK(stcb); + + strm = &stcb->asoc.strmout[srcv->sinfo_stream]; + if (strm->last_msg_incomplete == 0) { + do_a_copy_in: + sp = sctp_copy_it_in(stcb, asoc, srcv, uio, net, max_len, user_marks_eor, &error, non_blocking); + if ((sp == NULL) || (error)) { + goto out; + } + SCTP_TCB_SEND_LOCK(stcb); + if (sp->msg_is_complete) { + strm->last_msg_incomplete = 0; + asoc->stream_locked = 0; + } else { + /* + * Just got locked to this guy in case of an + * interrupt. 
+ */ + strm->last_msg_incomplete = 1; + asoc->stream_locked = 1; + asoc->stream_locked_on = srcv->sinfo_stream; + sp->sender_all_done = 0; + } + sctp_snd_sb_alloc(stcb, sp->length); + atomic_add_int(&asoc->stream_queue_cnt, 1); + if ((srcv->sinfo_flags & SCTP_UNORDERED) == 0) { + sp->strseq = strm->next_sequence_sent; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_SCTP) { + sctp_misc_ints(SCTP_STRMOUT_LOG_ASSIGN, + (uintptr_t) stcb, sp->length, + (uint32_t) ((srcv->sinfo_stream << 16) | sp->strseq), 0); + } + strm->next_sequence_sent++; + } else { + SCTP_STAT_INCR(sctps_sends_with_unord); + } + TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); + if ((strm->next_spoke.tqe_next == NULL) && + (strm->next_spoke.tqe_prev == NULL)) { + /* Not on wheel, insert */ + sctp_insert_on_wheel(stcb, asoc, strm, 1); + } + SCTP_TCB_SEND_UNLOCK(stcb); + } else { + SCTP_TCB_SEND_LOCK(stcb); + sp = TAILQ_LAST(&strm->outqueue, sctp_streamhead); + SCTP_TCB_SEND_UNLOCK(stcb); + if (sp == NULL) { + /* ???? Huh ??? last msg is gone */ +#ifdef INVARIANTS + panic("Warning: Last msg marked incomplete, yet nothing left?"); +#else + SCTP_PRINTF("Warning: Last msg marked incomplete, yet nothing left?\n"); + strm->last_msg_incomplete = 0; +#endif + goto do_a_copy_in; + + } + } + while (uio->uio_resid > 0) { + /* How much room do we have? */ + struct mbuf *new_tail, *mm; + + if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size) + max_len = SCTP_SB_LIMIT_SND(so) - stcb->asoc.total_output_queue_size; + else + max_len = 0; + + if ((max_len > SCTP_BASE_SYSCTL(sctp_add_more_threshold)) || + (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) || + (uio->uio_resid && (uio->uio_resid <= (int)max_len))) { + sndout = 0; + new_tail = NULL; + if (hold_tcblock) { + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } + mm = sctp_copy_resume(sp, uio, srcv, max_len, user_marks_eor, &error, &sndout, &new_tail); + if ((mm == NULL) || error) { + if (mm) { + sctp_m_freem(mm); + } + goto out; + } + /* Update the mbuf and count */ + SCTP_TCB_SEND_LOCK(stcb); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* + * we need to get out. Peer probably + * aborted. + */ + sctp_m_freem(mm); + if (stcb->asoc.state & SCTP_PCB_FLAGS_WAS_ABORTED) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); + error = ECONNRESET; + } + SCTP_TCB_SEND_UNLOCK(stcb); + goto out; + } + if (sp->tail_mbuf) { + /* tack it to the end */ + SCTP_BUF_NEXT(sp->tail_mbuf) = mm; + sp->tail_mbuf = new_tail; + } else { + /* A stolen mbuf */ + sp->data = mm; + sp->tail_mbuf = new_tail; + } + sctp_snd_sb_alloc(stcb, sndout); + atomic_add_int(&sp->length, sndout); + len += sndout; + + /* Did we reach EOR? */ + if ((uio->uio_resid == 0) && + ((user_marks_eor == 0) || + (srcv->sinfo_flags & SCTP_EOF) || + (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { + sp->msg_is_complete = 1; + } else { + sp->msg_is_complete = 0; + } + SCTP_TCB_SEND_UNLOCK(stcb); + } + if (uio->uio_resid == 0) { + /* got it all? */ + continue; + } + /* PR-SCTP? 
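Just below, after the PR-SCTP pruning attempt, the send path applies SCTP's Nagle-style test (it appears again after skip_out_eof): with Nagle enabled, new data is held back while anything is still in flight, unless a full MTU's worth of unsent data (less overhead) is ready or enough stream queues are active to reach the bundling limit. Extracted as a standalone predicate with hypothetical names:

#include <stdint.h>

/* Nonzero if Nagle should hold this send back (mirrors the checks below). */
static int
nagle_applies(int nodelay_off, uint32_t total_flight, uint32_t stream_queue_cnt,
    uint32_t max_bundling, int un_sent, uint32_t smallest_mtu, uint32_t min_overhead)
{
    return (nodelay_off &&
        total_flight > 0 &&
        stream_queue_cnt < max_bundling &&
        un_sent < (int)(smallest_mtu - min_overhead));
}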
*/ + if ((asoc->peer_supports_prsctp) && (asoc->sent_queue_cnt_removeable > 0)) { + /* + * This is ugly but we must ensure locking + * order + */ + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + sctp_prune_prsctp(stcb, asoc, srcv, sndlen); + inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * sizeof(struct sctp_data_chunk)); + if (SCTP_SB_LIMIT_SND(so) > stcb->asoc.total_output_queue_size) + max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; + else + max_len = 0; + if (max_len > 0) { + continue; + } + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } + /* wait for space now */ + if (non_blocking) { + /* Non-blocking I/O in place, skip out */ + goto skip_out_eof; + } + /* What about the INIT, send it maybe */ + if (queue_only_for_init) { + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + /* a collision took us forward? */ + queue_only = 0; + } else { + sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); + SCTP_SET_STATE(asoc, SCTP_STATE_COOKIE_WAIT); + queue_only = 1; + } + } + if ((net->flight_size > net->cwnd) && + (asoc->sctp_cmt_on_off == 0)) { + SCTP_STAT_INCR(sctps_send_cwnd_avoid); + queue_only = 1; + } else if (asoc->ifp_had_enobuf) { + SCTP_STAT_INCR(sctps_ifnomemqueued); + if (net->flight_size > (2 * net->mtu)) { + queue_only = 1; + } + asoc->ifp_had_enobuf = 0; + } + un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); + if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && + (stcb->asoc.total_flight > 0) && + (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && + (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { + + /*- + * Ok, Nagle is set on and we have data outstanding. + * Don't send anything and let SACKs drive out the + * data unless we have a "full" segment to send. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { + sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); + } + SCTP_STAT_INCR(sctps_naglequeued); + nagle_applies = 1; + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) + sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); + } + SCTP_STAT_INCR(sctps_naglesent); + nagle_applies = 0; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + + sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, + nagle_applies, un_sent); + sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, + stcb->asoc.total_flight, + stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); + } + if (queue_only_for_init) + queue_only_for_init = 0; + if ((queue_only == 0) && (nagle_applies == 0)) { + /*- + * need to start chunk output + * before blocking.. note that if + * a lock is already applied, then + * the input via the net is happening + * and I don't need to start output :-D + */ + if (hold_tcblock == 0) { + if (SCTP_TCB_TRYLOCK(stcb)) { + hold_tcblock = 1; + sctp_chunk_output(inp, + stcb, + SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); + } + } else { + sctp_chunk_output(inp, + stcb, + SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); + } + if (hold_tcblock == 1) { + SCTP_TCB_UNLOCK(stcb); + hold_tcblock = 0; + } + } + SOCKBUF_LOCK(&so->so_snd); + /*- + * This is a bit strange, but I think it will
The total_output_queue_size is locked and + * protected by the TCB_LOCK, which we just released. + * There is a race that can occur between releasing it + * above, and me getting the socket lock, where sacks + * come in but we have not put the SB_WAIT on the + * so_snd buffer to get the wakeup. After the LOCK + * is applied the sack_processing will also need to + * LOCK the so->so_snd to do the actual sowwakeup(). So + * once we have the socket buffer lock if we recheck the + * size we KNOW we will get to sleep safely with the + * wakeup flag in place. + */ + if (SCTP_SB_LIMIT_SND(so) <= (stcb->asoc.total_output_queue_size + + min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK, + so, asoc, uio->uio_resid); + } + be.error = 0; + stcb->block_entry = &be; + error = sbwait(&so->so_snd); + stcb->block_entry = NULL; + + if (error || so->so_error || be.error) { + if (error == 0) { + if (so->so_error) + error = so->so_error; + if (be.error) { + error = be.error; + } + } + SOCKBUF_UNLOCK(&so->so_snd); + goto out_unlocked; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, + so, asoc, stcb->asoc.total_output_queue_size); + } + } + SOCKBUF_UNLOCK(&so->so_snd); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + goto out_unlocked; + } + } + SCTP_TCB_SEND_LOCK(stcb); + if (sp) { + if (sp->msg_is_complete == 0) { + strm->last_msg_incomplete = 1; + asoc->stream_locked = 1; + asoc->stream_locked_on = srcv->sinfo_stream; + } else { + sp->sender_all_done = 1; + strm->last_msg_incomplete = 0; + asoc->stream_locked = 0; + } + } else { + SCTP_PRINTF("Huh no sp TSNH?\n"); + strm->last_msg_incomplete = 0; + asoc->stream_locked = 0; + } + SCTP_TCB_SEND_UNLOCK(stcb); + if (uio->uio_resid == 0) { + got_all_of_the_send = 1; + } + } else { + /* We send in a 0, since we do NOT have any locks */ + error = sctp_msg_append(stcb, net, top, srcv, 0); + top = NULL; + if (srcv->sinfo_flags & SCTP_EOF) { + /* + * This should only happen for Panda for the mbuf + * send case, which does NOT yet support EEOR mode. + * Thus, we can just set this flag to do the proper + * EOF handling. + */ + got_all_of_the_send = 1; + } + } + if (error) { + goto out; + } +dataless_eof: + /* EOF thing ? */ + if ((srcv->sinfo_flags & SCTP_EOF) && + (got_all_of_the_send == 1) && + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { + int cnt; + + SCTP_STAT_INCR(sctps_sends_with_eof); + error = 0; + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + cnt = sctp_is_there_unsent_data(stcb); + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (cnt == 0)) { + if (asoc->locked_on_sending) { + goto abort_anyway; + } + /* there is nothing queued to send, so I'm done... 
*/ + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* only send SHUTDOWN the first time through */ + sctp_send_shutdown(stcb, stcb->asoc.primary_destination); + if (SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + } + } else { + /*- + * we still got (or just got) data to send, so set + * SHUTDOWN_PENDING + */ + /*- + * XXX sockets draft says that SCTP_EOF should be + * sent with no data. currently, we will allow user + * data to be sent first and move to + * SHUTDOWN-PENDING + */ + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_RECEIVED) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + if (asoc->locked_on_sending) { + /* Locked to send out the data */ + struct sctp_stream_queue_pending *sp; + + sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); + if (sp) { + if ((sp->length == 0) && (sp->msg_is_complete == 0)) + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + } + } + asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + abort_anyway: + if (free_cnt_applied) { + atomic_add_int(&stcb->asoc.refcnt, -1); + free_cnt_applied = 0; + } + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_RESPONSE_TO_USER_REQ, + NULL, SCTP_SO_LOCKED); + /* + * now relock the stcb so everything + * is sane + */ + hold_tcblock = 0; + stcb = NULL; + goto out; + } + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_feature_off(inp, SCTP_PCB_FLAGS_NODELAY); + } + } + } +skip_out_eof: + if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { + some_on_control = 1; + } + if (queue_only_for_init) { + if (hold_tcblock == 0) { + SCTP_TCB_LOCK(stcb); + hold_tcblock = 1; + } + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) { + /* a collision took us forward? */ + queue_only = 0; + } else { + sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); + SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + queue_only = 1; + } + } + if ((net->flight_size > net->cwnd) && + (stcb->asoc.sctp_cmt_on_off == 0)) { + SCTP_STAT_INCR(sctps_send_cwnd_avoid); + queue_only = 1; + } else if (asoc->ifp_had_enobuf) { + SCTP_STAT_INCR(sctps_ifnomemqueued); + if (net->flight_size > (2 * net->mtu)) { + queue_only = 1; + } + asoc->ifp_had_enobuf = 0; + } + un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + + (stcb->asoc.stream_queue_cnt * sizeof(struct sctp_data_chunk))); + if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && + (stcb->asoc.total_flight > 0) && + (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && + (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { + /*- + * Ok, Nagle is set on and we have data outstanding. + * Don't send anything and let SACKs drive out the + * data unless we have a "full" segment to send.
+ */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { + sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); + } + SCTP_STAT_INCR(sctps_naglequeued); + nagle_applies = 1; + } else { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) + sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); + } + SCTP_STAT_INCR(sctps_naglesent); + nagle_applies = 0; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, + nagle_applies, un_sent); + sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, + stcb->asoc.total_flight, + stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); + } + if (queue_only_for_init) + queue_only_for_init = 0; + if ((queue_only == 0) && (nagle_applies == 0) && (stcb->asoc.peers_rwnd && un_sent)) { + /* we can attempt to send too. */ + if (hold_tcblock == 0) { + /* + * If there is activity recv'ing sacks no need to + * send + */ + if (SCTP_TCB_TRYLOCK(stcb)) { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); + hold_tcblock = 1; + } + } else { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); + } + } else if ((queue_only == 0) && + (stcb->asoc.peers_rwnd == 0) && + (stcb->asoc.total_flight == 0)) { + /* We get to have a probe outstanding */ + if (hold_tcblock == 0) { + hold_tcblock = 1; + SCTP_TCB_LOCK(stcb); + } + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); + } else if (some_on_control) { + int num_out, reason, frag_point; + + /* Here we do control only */ + if (hold_tcblock == 0) { + hold_tcblock = 1; + SCTP_TCB_LOCK(stcb); + } + frag_point = sctp_get_frag_point(stcb, &stcb->asoc); + (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, + &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED); + } + SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n", + queue_only, stcb->asoc.peers_rwnd, un_sent, + stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, + stcb->asoc.total_output_queue_size, error); + +out: +out_unlocked: + + if (local_soresv && stcb) { + atomic_subtract_int(&stcb->asoc.sb_send_resv, sndlen); + local_soresv = 0; + } + if (create_lock_applied) { + SCTP_ASOC_CREATE_UNLOCK(inp); + create_lock_applied = 0; + } + if ((stcb) && hold_tcblock) { + SCTP_TCB_UNLOCK(stcb); + } + if (stcb && free_cnt_applied) { + atomic_add_int(&stcb->asoc.refcnt, -1); + } +#ifdef INVARIANTS + if (stcb) { + if (mtx_owned(&stcb->tcb_mtx)) { + panic("Leaving with tcb mtx owned?"); + } + if (mtx_owned(&stcb->tcb_send_mtx)) { + panic("Leaving with tcb send mtx owned?"); + } + } +#endif +#ifdef INVARIANTS + if (inp) { + sctp_validate_no_locks(inp); + } else { + printf("Warning - inp is NULL so cant validate locks\n"); + } +#endif + if (top) { + sctp_m_freem(top); + } + if (control) { + sctp_m_freem(control); + } + return (error); +} + + +/* + * generate an AUTHentication chunk, if required + */ +struct mbuf * +sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, + struct sctp_auth_chunk **auth_ret, uint32_t * offset, + struct sctp_tcb *stcb, uint8_t chunk) +{ + struct mbuf *m_auth; + struct sctp_auth_chunk *auth; + int chunk_len; + + if ((m_end == NULL) || (auth_ret == NULL) || (offset == NULL) || + (stcb == NULL)) + return (m); + + /* sysctl disabled auth? 
*/ + if (SCTP_BASE_SYSCTL(sctp_auth_disable)) + return (m); + + /* peer doesn't do auth... */ + if (!stcb->asoc.peer_supports_auth) { + return (m); + } + /* does the requested chunk require auth? */ + if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) { + return (m); + } + m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_DONTWAIT, 1, MT_HEADER); + if (m_auth == NULL) { + /* no mbuf's */ + return (m); + } + /* reserve some space if this will be the first mbuf */ + if (m == NULL) + SCTP_BUF_RESV_UF(m_auth, SCTP_MIN_OVERHEAD); + /* fill in the AUTH chunk details */ + auth = mtod(m_auth, struct sctp_auth_chunk *); + bzero(auth, sizeof(*auth)); + auth->ch.chunk_type = SCTP_AUTHENTICATION; + auth->ch.chunk_flags = 0; + chunk_len = sizeof(*auth) + + sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); + auth->ch.chunk_length = htons(chunk_len); + auth->hmac_id = htons(stcb->asoc.peer_hmac_id); + /* key id and hmac digest will be computed and filled in upon send */ + + /* save the offset where the auth was inserted into the chain */ + if (m != NULL) { + struct mbuf *cn; + + *offset = 0; + cn = m; + while (cn) { + *offset += SCTP_BUF_LEN(cn); + cn = SCTP_BUF_NEXT(cn); + } + } else + *offset = 0; + + /* update length and return pointer to the auth chunk */ + SCTP_BUF_LEN(m_auth) = chunk_len; + m = sctp_copy_mbufchain(m_auth, m, m_end, 1, chunk_len, 0); + if (auth_ret != NULL) + *auth_ret = auth; + + return (m); +} + +#ifdef INET6 +int +sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t * ro) +{ + struct nd_prefix *pfx = NULL; + struct nd_pfxrouter *pfxrtr = NULL; + struct sockaddr_in6 gw6; + + if (ro == NULL || ro->ro_rt == NULL || src6->sin6_family != AF_INET6) + return (0); + + /* get prefix entry of address */ + LIST_FOREACH(pfx, &MODULE_GLOBAL(nd_prefix), ndpr_entry) { + if (pfx->ndpr_stateflags & NDPRF_DETACHED) + continue; + if (IN6_ARE_MASKED_ADDR_EQUAL(&pfx->ndpr_prefix.sin6_addr, + &src6->sin6_addr, &pfx->ndpr_mask)) + break; + } + /* no prefix entry in the prefix list */ + if (pfx == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefix entry for "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); + return (0); + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); + + /* search installed gateway from prefix entry */ + for (pfxrtr = pfx->ndpr_advrtrs.lh_first; pfxrtr; pfxrtr = + pfxrtr->pfr_next) { + memset(&gw6, 0, sizeof(struct sockaddr_in6)); + gw6.sin6_family = AF_INET6; + gw6.sin6_len = sizeof(struct sockaddr_in6); + memcpy(&gw6.sin6_addr, &pfxrtr->router->rtaddr, + sizeof(struct in6_addr)); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6); + SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); + if (sctp_cmpaddr((struct sockaddr *)&gw6, + ro->ro_rt->rt_gateway)) { + SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n"); + return (1); + } + } + SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is not installed\n"); + return (0); +} + +#endif + +int +sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro) +{ + struct sockaddr_in *sin, *mask; + struct ifaddr *ifa; + struct in_addr srcnetaddr, gwnetaddr; + + if (ro == NULL || ro->ro_rt == NULL || + sifa->address.sa.sa_family != AF_INET) { + return (0); + } + ifa = (struct ifaddr *)sifa->ifa; + mask = (struct sockaddr_in *)(ifa->ifa_netmask); + sin = (struct sockaddr_in *)&sifa->address.sin; + 
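+	/*
+	 * Mask the source address and the route's gateway with the
+	 * interface netmask; if the two network numbers come out equal,
+	 * the gateway lies on the source address's subnet.
+	 */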
srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); + SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); + SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr); + + sin = (struct sockaddr_in *)ro->ro_rt->rt_gateway; + gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); + SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is "); + SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, ro->ro_rt->rt_gateway); + SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr); + if (srcnetaddr.s_addr == gwnetaddr.s_addr) { + return (1); + } + return (0); +} diff --git a/freebsd/sys/netinet/sctp_output.h b/freebsd/sys/netinet/sctp_output.h new file mode 100644 index 00000000..d9051ee7 --- /dev/null +++ b/freebsd/sys/netinet/sctp_output.h @@ -0,0 +1,229 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_output.h,v 1.14 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_output_h__ +#define __sctp_output_h__ + +#include + +#if defined(_KERNEL) || defined(__Userspace__) + + +struct mbuf * +sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, + struct sctp_scoping *scope, + struct mbuf *m_at, + int cnt_inits_to); + + +int sctp_is_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); + + +int +sctp_is_address_in_scope(struct sctp_ifa *ifa, + int ipv4_addr_legal, + int ipv6_addr_legal, + int loopback_scope, + int ipv4_local_scope, + int local_scope, + int site_scope, + int do_update); +int + sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa); + +struct sctp_ifa * +sctp_source_address_selection(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + sctp_route_t * ro, struct sctp_nets *net, + int non_asoc_addr_ok, uint32_t vrf_id); + +int + sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t * ro); +int + sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t * ro); + +void +sctp_send_initiate(struct sctp_inpcb *, struct sctp_tcb *, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +void +sctp_send_initiate_ack(struct sctp_inpcb *, struct sctp_tcb *, + struct mbuf *, int, int, struct sctphdr *, struct sctp_init_chunk *, + uint32_t, uint16_t, int); + +struct mbuf * +sctp_arethere_unrecognized_parameters(struct mbuf *, int, int *, + struct sctp_chunkhdr *, int *); +void sctp_queue_op_err(struct sctp_tcb *, struct mbuf *); + +int +sctp_send_cookie_echo(struct mbuf *, int, struct sctp_tcb *, + struct sctp_nets *); + +void sctp_send_cookie_ack(struct sctp_tcb *); + +void +sctp_send_heartbeat_ack(struct sctp_tcb *, struct mbuf *, int, int, + struct sctp_nets *); + +void +sctp_remove_from_wheel(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_stream_out *strq, int holds_lock); + + +void sctp_send_shutdown(struct sctp_tcb *, struct sctp_nets *); + +void sctp_send_shutdown_ack(struct sctp_tcb *, struct sctp_nets *); + +void sctp_send_shutdown_complete(struct sctp_tcb *, struct sctp_nets *, int); + +void +sctp_send_shutdown_complete2(struct mbuf *, int, struct sctphdr *, + uint32_t, uint16_t); + +void sctp_send_asconf(struct sctp_tcb *, struct sctp_nets *, int addr_locked); + +void sctp_send_asconf_ack(struct sctp_tcb *); + +int sctp_get_frag_point(struct sctp_tcb *, struct sctp_association *); + +void sctp_toss_old_cookies(struct sctp_tcb *, struct sctp_association *); + +void sctp_toss_old_asconf(struct sctp_tcb *); + +void sctp_fix_ecn_echo(struct sctp_association *); + +void sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net); + +int +sctp_output(struct sctp_inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct thread *, int); + +void +sctp_insert_on_wheel(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_stream_out *strq, int holdslock); + +void +sctp_chunk_output(struct sctp_inpcb *, struct sctp_tcb *, int, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); +void +sctp_send_abort_tcb(struct sctp_tcb *, struct mbuf *, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +void send_forward_tsn(struct sctp_tcb *, struct sctp_association *); + +void sctp_send_sack(struct sctp_tcb *); + +int sctp_send_hb(struct sctp_tcb *, int, struct sctp_nets *); + +void sctp_send_ecn_echo(struct sctp_tcb *, struct sctp_nets *, uint32_t); + + 
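+/*
+ * Example (a minimal illustrative sketch, not part of the original
+ * header): this mirrors how sctp_sosend() in sctp_output.c drives the
+ * send routines declared above; take the TCB lock (or try-lock it),
+ * push out whatever chunks are queued, then drop the lock:
+ *
+ *	if (SCTP_TCB_TRYLOCK(stcb)) {
+ *		sctp_chunk_output(inp, stcb,
+ *		    SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED);
+ *		SCTP_TCB_UNLOCK(stcb);
+ *	}
+ */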
+void +sctp_send_packet_dropped(struct sctp_tcb *, struct sctp_nets *, struct mbuf *, + int, int); + + + +void sctp_send_cwr(struct sctp_tcb *, struct sctp_nets *, uint32_t); + + +void +sctp_add_stream_reset_out(struct sctp_tmit_chunk *chk, + int number_entries, uint16_t * list, + uint32_t seq, uint32_t resp_seq, uint32_t last_sent); + +void +sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk, + int number_entries, uint16_t * list, + uint32_t seq); + +void +sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk, + uint32_t seq); + +void +sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk, + uint32_t resp_seq, uint32_t result); + +void +sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk, + uint32_t resp_seq, uint32_t result, + uint32_t send_una, uint32_t recv_next); + +int +sctp_send_str_reset_req(struct sctp_tcb *stcb, + int number_entries, + uint16_t * list, + uint8_t send_out_req, + uint32_t resp_seq, + uint8_t send_in_req, + uint8_t send_tsn_req, + uint8_t add_str, + uint16_t adding); + + +void +sctp_send_abort(struct mbuf *, int, struct sctphdr *, uint32_t, + struct mbuf *, uint32_t, uint16_t); + +void sctp_send_operr_to(struct mbuf *, int, struct mbuf *, uint32_t, uint32_t, uint16_t); + +#endif /* _KERNEL || __Userspace__ */ + +#if defined(_KERNEL) || defined (__Userspace__) +int +sctp_sosend(struct socket *so, + struct sockaddr *addr, + struct uio *uio, + struct mbuf *top, + struct mbuf *control, + int flags, + struct thread *p +); + +#endif +#endif diff --git a/freebsd/sys/netinet/sctp_pcb.c b/freebsd/sys/netinet/sctp_pcb.c new file mode 100644 index 00000000..fccbda00 --- /dev/null +++ b/freebsd/sys/netinet/sctp_pcb.c @@ -0,0 +1,6810 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_pcb.c,v 1.38 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +VNET_DEFINE(struct sctp_base_info, system_base_info); + +/* FIX: we don't handle multiple link local scopes */ +/* "scopeless" replacement IN6_ARE_ADDR_EQUAL */ +#ifdef INET6 +int +SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b) +{ + struct sockaddr_in6 tmp_a, tmp_b; + + memcpy(&tmp_a, a, sizeof(struct sockaddr_in6)); + if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + return 0; + } + memcpy(&tmp_b, b, sizeof(struct sockaddr_in6)); + if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + return 0; + } + return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr)); +} + +#endif + +void +sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb) +{ + /* + * We really don't need to lock this, but I will just because it + * does not hurt. + */ + SCTP_INP_INFO_RLOCK(); + spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep); + spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc); + spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr); + spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr); + spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk); + spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq); + spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq); + spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks); + + SCTP_INP_INFO_RUNLOCK(); +} + +/* + * Addresses are added to VRF's (Virtual Router's). For BSD we + * have only the default VRF 0. We maintain a hash list of + * VRF's. Each VRF has its own list of sctp_ifn's. Each of + * these has a list of addresses. When we add a new address + * to a VRF we lookup the ifn/ifn_index, if the ifn does + * not exist we create it and add it to the list of IFN's + * within the VRF. Once we have the sctp_ifn, we add the + * address to the list. So we look something like: + * + * hash-vrf-table + * vrf-> ifn-> ifn -> ifn + * vrf | + * ... +--ifa-> ifa -> ifa + * vrf + * + * We keep these separate lists since the SCTP subsystem will + * point to these from its source address selection nets structure. + * When an address is deleted it does not happen right away on + * the SCTP side, it gets scheduled. What we do when a + * delete happens is immediately remove the address from + * the master list and decrement the refcount. As our + * addip iterator works through and frees the src address + * selection pointing to the sctp_ifa, eventually the refcount + * will reach 0 and we will delete it. Note that it is assumed + * that any locking on system level ifn/ifa is done at the + * caller of these functions and these routines will only + * lock the SCTP structures as they add or delete things. + * + * Other notes on VRF concepts. + * - An endpoint can be in multiple VRF's + * - An association lives within a VRF and only one VRF. + * - Any incoming packet we can deduce the VRF for by + * looking at the mbuf/pak inbound (for BSD its VRF=0 :D) + * - Any downward send call or connect call must supply the + * VRF via ancillary data or via some sort of set default + * VRF socket option call (again for BSD no brainer since + * the VRF is always 0). + * - An endpoint may add multiple VRF's to it. + * - Listening sockets can accept associations in any + * of the VRF's they are in but the assoc will end up + * in only one VRF (gotten from the packet or connect/send). 
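+ *
+ * A minimal sketch of the add/free lifecycle described above (the
+ * function names are the real ones defined below; the argument values
+ * are only illustrative, with vrf_id 0 being the BSD default):
+ *
+ *	struct sctp_ifa *ifa;
+ *
+ *	ifa = sctp_add_addr_to_vrf(0, ifn, ifn_index, ifn_type,
+ *	    if_name, ifa_ptr, addr, 0, 0);
+ *
+ * The new ifa starts with refcount 1; sctp_free_ifa() drops the count,
+ * and the ifa is only freed once the count reaches zero.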
+ * + */ + +struct sctp_vrf * +sctp_allocate_vrf(int vrf_id) +{ + struct sctp_vrf *vrf = NULL; + struct sctp_vrflist *bucket; + + /* First allocate the VRF structure */ + vrf = sctp_find_vrf(vrf_id); + if (vrf) { + /* Already allocated */ + return (vrf); + } + SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf), + SCTP_M_VRF); + if (vrf == NULL) { + /* No memory */ +#ifdef INVARIANTS + panic("No memory for VRF:%d", vrf_id); +#endif + return (NULL); + } + /* setup the VRF */ + memset(vrf, 0, sizeof(struct sctp_vrf)); + vrf->vrf_id = vrf_id; + LIST_INIT(&vrf->ifnlist); + vrf->total_ifa_count = 0; + vrf->refcount = 0; + /* now also setup table ids */ + SCTP_INIT_VRF_TABLEID(vrf); + /* Init the HASH of addresses */ + vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE, + &vrf->vrf_addr_hashmark); + if (vrf->vrf_addr_hash == NULL) { + /* No memory */ +#ifdef INVARIANTS + panic("No memory for VRF:%d", vrf_id); +#endif + SCTP_FREE(vrf, SCTP_M_VRF); + return (NULL); + } + /* Add it to the hash table */ + bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; + LIST_INSERT_HEAD(bucket, vrf, next_vrf); + atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); + return (vrf); +} + + +struct sctp_ifn * +sctp_find_ifn(void *ifn, uint32_t ifn_index) +{ + struct sctp_ifn *sctp_ifnp; + struct sctp_ifnlist *hash_ifn_head; + + /* + * We assume the lock is held for the addresses if that's wrong + * problems could occur :-) + */ + hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; + LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { + if (sctp_ifnp->ifn_index == ifn_index) { + return (sctp_ifnp); + } + if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) { + return (sctp_ifnp); + } + } + return (NULL); +} + + + +struct sctp_vrf * +sctp_find_vrf(uint32_t vrf_id) +{ + struct sctp_vrflist *bucket; + struct sctp_vrf *liste; + + bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; + LIST_FOREACH(liste, bucket, next_vrf) { + if (vrf_id == liste->vrf_id) { + return (liste); + } + } + return (NULL); +} + +void +sctp_free_vrf(struct sctp_vrf *vrf) +{ + if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) { + if (vrf->vrf_addr_hash) { + SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); + vrf->vrf_addr_hash = NULL; + } + /* We zero'd the count */ + LIST_REMOVE(vrf, next_vrf); + SCTP_FREE(vrf, SCTP_M_VRF); + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); + } +} + +void +sctp_free_ifn(struct sctp_ifn *sctp_ifnp) +{ + if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { + /* We zero'd the count */ + if (sctp_ifnp->vrf) { + sctp_free_vrf(sctp_ifnp->vrf); + } + SCTP_FREE(sctp_ifnp, SCTP_M_IFN); + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); + } +} + +void +sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu) +{ + struct sctp_ifn *sctp_ifnp; + + sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index); + if (sctp_ifnp != NULL) { + sctp_ifnp->ifn_mtu = mtu; + } +} + + +void +sctp_free_ifa(struct sctp_ifa *sctp_ifap) +{ + if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { + /* We zero'd the count */ + if (sctp_ifap->ifn_p) { + sctp_free_ifn(sctp_ifap->ifn_p); + } + SCTP_FREE(sctp_ifap, SCTP_M_IFA); + atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); + } +} + +static void +sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock) +{ + struct sctp_ifn *found; + + found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index); + if (found == NULL) { + /* 
Not in the list.. sorry */ + return; + } + if (hold_addr_lock == 0) + SCTP_IPI_ADDR_WLOCK(); + LIST_REMOVE(sctp_ifnp, next_bucket); + LIST_REMOVE(sctp_ifnp, next_ifn); + SCTP_DEREGISTER_INTERFACE(sctp_ifnp->ifn_index, + sctp_ifnp->registered_af); + if (hold_addr_lock == 0) + SCTP_IPI_ADDR_WUNLOCK(); + /* Take away the reference, and possibly free it */ + sctp_free_ifn(sctp_ifnp); +} + +void +sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, + const char *if_name, uint32_t ifn_index) +{ + struct sctp_vrf *vrf; + struct sctp_ifa *sctp_ifap = NULL; + + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); + goto out; + + } + sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); + if (sctp_ifap == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); + goto out; + } + if (sctp_ifap->ifn_p == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); + goto out; + } + if (if_name) { + int len1, len2; + + len1 = strlen(if_name); + len2 = strlen(sctp_ifap->ifn_p->ifn_name); + if (len1 != len2) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFN of ifa names different length %d vs %d - ignored\n", + len1, len2); + goto out; + } + if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) != 0) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", + sctp_ifap->ifn_p->ifn_name, + if_name); + goto out; + } + } else { + if (sctp_ifap->ifn_p->ifn_index != ifn_index) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", + sctp_ifap->ifn_p->ifn_index, ifn_index); + goto out; + } + } + + sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID); + sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; +out: + SCTP_IPI_ADDR_RUNLOCK(); +} + +void +sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, + const char *if_name, uint32_t ifn_index) +{ + struct sctp_vrf *vrf; + struct sctp_ifa *sctp_ifap = NULL; + + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); + goto out; + + } + sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); + if (sctp_ifap == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); + goto out; + } + if (sctp_ifap->ifn_p == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark usable\n"); + goto out; + } + if (if_name) { + int len1, len2; + + len1 = strlen(if_name); + len2 = strlen(sctp_ifap->ifn_p->ifn_name); + if (len1 != len2) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFN of ifa names different length %d vs %d - ignored\n", + len1, len2); + goto out; + } + if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) != 0) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", + sctp_ifap->ifn_p->ifn_name, + if_name); + goto out; + } + } else { + if (sctp_ifap->ifn_p->ifn_index != ifn_index) { + SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d up command for ifn_index:%d - ignored\n", + sctp_ifap->ifn_p->ifn_index, ifn_index); + goto out; + } + } + + sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE); + sctp_ifap->localifa_flags |= SCTP_ADDR_VALID; +out: + SCTP_IPI_ADDR_RUNLOCK(); +} + +/*- + * Add an ifa to an ifn. + * Register the interface as necessary. + * NOTE: ADDR write lock MUST be held.
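+ *
+ * For example, the address-move path in sctp_add_addr_to_vrf() below
+ * calls it only with the lock already taken:
+ *
+ *	SCTP_IPI_ADDR_WLOCK();
+ *	...
+ *	sctp_remove_ifa_from_ifn(sctp_ifap);
+ *	sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap);
+ *	...
+ *	SCTP_IPI_ADDR_WUNLOCK();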
+ */ +static void +sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) +{ + int ifa_af; + + LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); + sctp_ifap->ifn_p = sctp_ifnp; + atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); + /* update address counts */ + sctp_ifnp->ifa_count++; + ifa_af = sctp_ifap->address.sa.sa_family; + if (ifa_af == AF_INET) + sctp_ifnp->num_v4++; + else + sctp_ifnp->num_v6++; + if (sctp_ifnp->ifa_count == 1) { + /* register the new interface */ + SCTP_REGISTER_INTERFACE(sctp_ifnp->ifn_index, ifa_af); + sctp_ifnp->registered_af = ifa_af; + } +} + +/*- + * Remove an ifa from its ifn. + * If no more addresses exist, remove the ifn too. Otherwise, re-register + * the interface based on the remaining address families left. + * NOTE: ADDR write lock MUST be held. + */ +static void +sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) +{ + uint32_t ifn_index; + + LIST_REMOVE(sctp_ifap, next_ifa); + if (sctp_ifap->ifn_p) { + /* update address counts */ + sctp_ifap->ifn_p->ifa_count--; + if (sctp_ifap->address.sa.sa_family == AF_INET6) + sctp_ifap->ifn_p->num_v6--; + else if (sctp_ifap->address.sa.sa_family == AF_INET) + sctp_ifap->ifn_p->num_v4--; + + ifn_index = sctp_ifap->ifn_p->ifn_index; + if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { + /* remove the ifn, possibly freeing it */ + sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED); + } else { + /* re-register address family type, if needed */ + if ((sctp_ifap->ifn_p->num_v6 == 0) && + (sctp_ifap->ifn_p->registered_af == AF_INET6)) { + SCTP_DEREGISTER_INTERFACE(ifn_index, AF_INET6); + SCTP_REGISTER_INTERFACE(ifn_index, AF_INET); + sctp_ifap->ifn_p->registered_af = AF_INET; + } else if ((sctp_ifap->ifn_p->num_v4 == 0) && + (sctp_ifap->ifn_p->registered_af == AF_INET)) { + SCTP_DEREGISTER_INTERFACE(ifn_index, AF_INET); + SCTP_REGISTER_INTERFACE(ifn_index, AF_INET6); + sctp_ifap->ifn_p->registered_af = AF_INET6; + } + /* free the ifn refcount */ + sctp_free_ifn(sctp_ifap->ifn_p); + } + sctp_ifap->ifn_p = NULL; + } +} + +struct sctp_ifa * +sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, + uint32_t ifn_type, const char *if_name, void *ifa, + struct sockaddr *addr, uint32_t ifa_flags, + int dynamic_add) +{ + struct sctp_vrf *vrf; + struct sctp_ifn *sctp_ifnp = NULL; + struct sctp_ifa *sctp_ifap = NULL; + struct sctp_ifalist *hash_addr_head; + struct sctp_ifnlist *hash_ifn_head; + uint32_t hash_of_addr; + int new_ifn_af = 0; + +#ifdef SCTP_DEBUG + SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); + SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); +#endif + SCTP_IPI_ADDR_WLOCK(); + sctp_ifnp = sctp_find_ifn(ifn, ifn_index); + if (sctp_ifnp) { + vrf = sctp_ifnp->vrf; + } else { + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + vrf = sctp_allocate_vrf(vrf_id); + if (vrf == NULL) { + SCTP_IPI_ADDR_WUNLOCK(); + return (NULL); + } + } + } + if (sctp_ifnp == NULL) { + /* + * build one and add it, can't hold lock until after malloc + * done though. 
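+		 * (The pattern below: SCTP_IPI_ADDR_WUNLOCK(), SCTP_MALLOC()
+		 * and fill in the new sctp_ifn, then SCTP_IPI_ADDR_WLOCK()
+		 * again before linking it into the hash bucket and the
+		 * vrf's ifnlist.)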
+ */ + SCTP_IPI_ADDR_WUNLOCK(); + SCTP_MALLOC(sctp_ifnp, struct sctp_ifn *, + sizeof(struct sctp_ifn), SCTP_M_IFN); + if (sctp_ifnp == NULL) { +#ifdef INVARIANTS + panic("No memory for IFN"); +#endif + return (NULL); + } + memset(sctp_ifnp, 0, sizeof(struct sctp_ifn)); + sctp_ifnp->ifn_index = ifn_index; + sctp_ifnp->ifn_p = ifn; + sctp_ifnp->ifn_type = ifn_type; + sctp_ifnp->refcount = 0; + sctp_ifnp->vrf = vrf; + atomic_add_int(&vrf->refcount, 1); + sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, addr->sa_family); + if (if_name != NULL) { + memcpy(sctp_ifnp->ifn_name, if_name, SCTP_IFNAMSIZ); + } else { + memcpy(sctp_ifnp->ifn_name, "unknown", min(7, SCTP_IFNAMSIZ)); + } + hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; + LIST_INIT(&sctp_ifnp->ifalist); + SCTP_IPI_ADDR_WLOCK(); + LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); + LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); + atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); + new_ifn_af = 1; + } + sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); + if (sctp_ifap) { + /* Hmm, it already exists? */ + if ((sctp_ifap->ifn_p) && + (sctp_ifap->ifn_p->ifn_index == ifn_index)) { + SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", + sctp_ifap->ifn_p->ifn_name, ifn_index, + sctp_ifap); + if (new_ifn_af) { + /* Remove the created one that we don't want */ + sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED); + } + if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { + /* easy to solve, just switch back to active */ + SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); + sctp_ifap->localifa_flags = SCTP_ADDR_VALID; + sctp_ifap->ifn_p = sctp_ifnp; + atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); + } + exit_stage_left: + SCTP_IPI_ADDR_WUNLOCK(); + return (sctp_ifap); + } else { + if (sctp_ifap->ifn_p) { + /* + * The last IFN gets the address, remove the + * old one + */ + SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", + sctp_ifap, sctp_ifap->ifn_p->ifn_name, + sctp_ifap->ifn_p->ifn_index, if_name, + ifn_index); + /* remove the address from the old ifn */ + sctp_remove_ifa_from_ifn(sctp_ifap); + /* move the address over to the new ifn */ + sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); + goto exit_stage_left; + } else { + /* repair ifnp which was NULL ? 
*/ + sctp_ifap->localifa_flags = SCTP_ADDR_VALID; + SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", + sctp_ifnp, sctp_ifap); + sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); + } + goto exit_stage_left; + } + } + SCTP_IPI_ADDR_WUNLOCK(); + SCTP_MALLOC(sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA); + if (sctp_ifap == NULL) { +#ifdef INVARIANTS + panic("No memory for IFA"); +#endif + return (NULL); + } + memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); + sctp_ifap->ifn_p = sctp_ifnp; + atomic_add_int(&sctp_ifnp->refcount, 1); + sctp_ifap->vrf_id = vrf_id; + sctp_ifap->ifa = ifa; + memcpy(&sctp_ifap->address, addr, addr->sa_len); + sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; + sctp_ifap->flags = ifa_flags; + /* Set scope */ + switch (sctp_ifap->address.sa.sa_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&sctp_ifap->address.sin; + if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || + (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { + sctp_ifap->src_is_loop = 1; + } + if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { + sctp_ifap->src_is_priv = 1; + } + sctp_ifnp->num_v4++; + if (new_ifn_af) + new_ifn_af = AF_INET; + break; + } +#ifdef INET6 + case AF_INET6: + { + /* ok to use deprecated addresses? */ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&sctp_ifap->address.sin6; + if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || + (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { + sctp_ifap->src_is_loop = 1; + } + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + sctp_ifap->src_is_priv = 1; + } + sctp_ifnp->num_v6++; + if (new_ifn_af) + new_ifn_af = AF_INET6; + break; + } +#endif + default: + new_ifn_af = 0; + break; + } + hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); + + if ((sctp_ifap->src_is_priv == 0) && + (sctp_ifap->src_is_loop == 0)) { + sctp_ifap->src_is_glob = 1; + } + SCTP_IPI_ADDR_WLOCK(); + hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; + LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket); + sctp_ifap->refcount = 1; + LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); + sctp_ifnp->ifa_count++; + vrf->total_ifa_count++; + atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); + if (new_ifn_af) { + SCTP_REGISTER_INTERFACE(ifn_index, new_ifn_af); + sctp_ifnp->registered_af = new_ifn_af; + } + SCTP_IPI_ADDR_WUNLOCK(); + if (dynamic_add) { + /* + * Bump up the refcount so that when the timer completes it + * will drop back down. + */ + struct sctp_laddr *wi; + + atomic_add_int(&sctp_ifap->refcount, 1); + wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (wi == NULL) { + /* + * Gak, what can we do? We have lost an address + * change can you say HOSED? 
 + */ + SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); + /* Oops, must decrement the count */ + sctp_del_addr_from_vrf(vrf_id, addr, ifn_index, + if_name); + return (NULL); + } + SCTP_INCR_LADDR_COUNT(); + bzero(wi, sizeof(*wi)); + (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); + wi->ifa = sctp_ifap; + wi->action = SCTP_ADD_IP_ADDRESS; + + SCTP_WQ_ADDR_LOCK(); + LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); + SCTP_WQ_ADDR_UNLOCK(); + + sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, + (struct sctp_inpcb *)NULL, + (struct sctp_tcb *)NULL, + (struct sctp_nets *)NULL); + } else { + /* it's ready for use */ + sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE; + } + return (sctp_ifap); +} + +void +sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, + uint32_t ifn_index, const char *if_name) +{ + struct sctp_vrf *vrf; + struct sctp_ifa *sctp_ifap = NULL; + + SCTP_IPI_ADDR_WLOCK(); + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); + goto out_now; + } +#ifdef SCTP_DEBUG + SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); + SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); +#endif + sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); + if (sctp_ifap) { + /* Validate the delete */ + if (sctp_ifap->ifn_p) { + int valid = 0; + + /*- + * The name has priority over the ifn_index + * if it's given. We do this especially for + * panda who might recycle indexes fast. + */ + if (if_name) { + int len1, len2; + + len1 = min(SCTP_IFNAMSIZ, strlen(if_name)); + len2 = min(SCTP_IFNAMSIZ, strlen(sctp_ifap->ifn_p->ifn_name)); + if (len1 && len2 && (len1 == len2)) { + /* we can compare them */ + if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, len1) == 0) { + /* + * They match, it's a correct + * delete + */ + valid = 1; + } + } + } + if (!valid) { + /* last ditch check ifn_index */ + if (ifn_index == sctp_ifap->ifn_p->ifn_index) { + valid = 1; + } + } + if (!valid) { + SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n", + ifn_index, ((if_name == NULL) ? "NULL" : if_name)); + SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n", + sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name); + SCTP_IPI_ADDR_WUNLOCK(); + return; + } + } + SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", sctp_ifap); + sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; + sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; + vrf->total_ifa_count--; + LIST_REMOVE(sctp_ifap, next_bucket); + sctp_remove_ifa_from_ifn(sctp_ifap); + } +#ifdef SCTP_DEBUG + else { + SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", + ifn_index); + SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); + } +#endif + +out_now: + SCTP_IPI_ADDR_WUNLOCK(); + if (sctp_ifap) { + struct sctp_laddr *wi; + + wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (wi == NULL) { + /* + * Gak, what can we do? We have lost an address + * change can you say HOSED? + */ + SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); + + /* Oops, must decrement the count */ + sctp_free_ifa(sctp_ifap); + return; + } + SCTP_INCR_LADDR_COUNT(); + bzero(wi, sizeof(*wi)); + (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); + wi->ifa = sctp_ifap; + wi->action = SCTP_DEL_IP_ADDRESS; + SCTP_WQ_ADDR_LOCK(); + /* + * Should this really be a tailq?
As it is we will process + * the newest first :-0 + */ + LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); + SCTP_WQ_ADDR_UNLOCK(); + + sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, + (struct sctp_inpcb *)NULL, + (struct sctp_tcb *)NULL, + (struct sctp_nets *)NULL); + } + return; +} + + +static struct sctp_tcb * +sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, + struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id) +{ + /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */ + /* + * If we support the TCP model, then we must now dig through to see + * if we can find our endpoint in the list of tcp ep's. + */ + uint16_t lport, rport; + struct sctppcbhead *ephead; + struct sctp_inpcb *inp; + struct sctp_laddr *laddr; + struct sctp_tcb *stcb; + struct sctp_nets *net; + + if ((to == NULL) || (from == NULL)) { + return (NULL); + } + if (to->sa_family == AF_INET && from->sa_family == AF_INET) { + lport = ((struct sockaddr_in *)to)->sin_port; + rport = ((struct sockaddr_in *)from)->sin_port; + } else if (to->sa_family == AF_INET6 && from->sa_family == AF_INET6) { + lport = ((struct sockaddr_in6 *)to)->sin6_port; + rport = ((struct sockaddr_in6 *)from)->sin6_port; + } else { + return NULL; + } + ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; + /* + * Ok now for each of the guys in this bucket we must look and see: + * - Does the remote port match. - Does their single association's + * addresses match this address (to). If so we update p_ep to point + * to this ep and return the tcb from it. + */ + LIST_FOREACH(inp, ephead, sctp_hash) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + SCTP_INP_RUNLOCK(inp); + continue; + } + if (lport != inp->sctp_lport) { + SCTP_INP_RUNLOCK(inp); + continue; + } + if (inp->def_vrf_id != vrf_id) { + SCTP_INP_RUNLOCK(inp); + continue; + } + /* check to see if the ep has one of the addresses */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { + /* We are NOT bound all, so look further */ + int match = 0; + + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __FUNCTION__); + continue; + } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { + SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); + continue; + } + if (laddr->ifa->address.sa.sa_family == + to->sa_family) { + /* see if it matches */ + struct sockaddr_in *intf_addr, *sin; + + intf_addr = &laddr->ifa->address.sin; + sin = (struct sockaddr_in *)to; + if (from->sa_family == AF_INET) { + if (sin->sin_addr.s_addr == + intf_addr->sin_addr.s_addr) { + match = 1; + break; + } + } +#ifdef INET6 + if (from->sa_family == AF_INET6) { + struct sockaddr_in6 *intf_addr6; + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) + to; + intf_addr6 = &laddr->ifa->address.sin6; + + if (SCTP6_ARE_ADDR_EQUAL(sin6, + intf_addr6)) { + match = 1; + break; + } + } +#endif + } + } + if (match == 0) { + /* This endpoint does not have this address */ + SCTP_INP_RUNLOCK(inp); + continue; + } + } + /* + * Ok if we hit here the ep has the address, does it hold + * the tcb? + */ + + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + SCTP_INP_RUNLOCK(inp); + continue; + } + SCTP_TCB_LOCK(stcb); + if (stcb->rport != rport) { + /* remote port does not match.
*/ + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + continue; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + continue; + } + /* Does this TCB have a matching address? */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + + if (net->ro._l_addr.sa.sa_family != from->sa_family) { + /* not the same family, can't be a match */ + continue; + } + switch (from->sa_family) { + case AF_INET: + { + struct sockaddr_in *sin, *rsin; + + sin = (struct sockaddr_in *)&net->ro._l_addr; + rsin = (struct sockaddr_in *)from; + if (sin->sin_addr.s_addr == + rsin->sin_addr.s_addr) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + /* + * Update the endpoint + * pointer + */ + *inp_p = inp; + SCTP_INP_RUNLOCK(inp); + return (stcb); + } + break; + } +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6, *rsin6; + + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + rsin6 = (struct sockaddr_in6 *)from; + if (SCTP6_ARE_ADDR_EQUAL(sin6, + rsin6)) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + /* + * Update the endpoint + * pointer + */ + *inp_p = inp; + SCTP_INP_RUNLOCK(inp); + return (stcb); + } + break; + } +#endif + default: + /* TSNH */ + break; + } + } + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + } + return (NULL); +} + +static int +sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) +{ + int loopback_scope, ipv4_local_scope, local_scope, site_scope; + int ipv4_addr_legal, ipv6_addr_legal; + struct sctp_vrf *vrf; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + + loopback_scope = stcb->asoc.loopback_scope; + ipv4_local_scope = stcb->asoc.ipv4_local_scope; + local_scope = stcb->asoc.local_scope; + site_scope = stcb->asoc.site_scope; + ipv4_addr_legal = ipv6_addr_legal = 0; + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ipv6_addr_legal = 1; + if (SCTP_IPV6_V6ONLY(stcb->sctp_ep) == 0) { + ipv4_addr_legal = 1; + } + } else { + ipv4_addr_legal = 1; + } + + SCTP_IPI_ADDR_RLOCK(); + vrf = sctp_find_vrf(stcb->asoc.vrf_id); + if (vrf == NULL) { + /* no vrf, no addresses */ + SCTP_IPI_ADDR_RUNLOCK(); + return (0); + } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if ((loopback_scope == 0) && + SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + continue; + } + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if (sctp_is_addr_restricted(stcb, sctp_ifa)) + continue; + switch (sctp_ifa->address.sa.sa_family) { +#ifdef INET + case AF_INET: + if (ipv4_addr_legal) { + struct sockaddr_in *sin, + *rsin; + + sin = &sctp_ifa->address.sin; + rsin = (struct sockaddr_in *)to; + if ((ipv4_local_scope == 0) && + IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { + continue; + } + if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { + SCTP_IPI_ADDR_RUNLOCK(); + return (1); + } + } + break; +#endif +#ifdef INET6 + case AF_INET6: + if (ipv6_addr_legal) { + struct sockaddr_in6 *sin6, + *rsin6; + + sin6 = &sctp_ifa->address.sin6; + rsin6 = (struct sockaddr_in6 *)to; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + if (local_scope == 0) + continue; + if (sin6->sin6_scope_id == 0) { + if (sa6_recoverscope(sin6) != 0) + continue; + } + } + if ((site_scope == 0) && + (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { + continue; + } + if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { + SCTP_IPI_ADDR_RUNLOCK(); + return (1); + } + } + break; +#endif + default: + /* TSNH */ + break; + } + } + } + } else { + struct sctp_laddr *laddr; + + 
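+		/*
+		 * Not bound-all: only the addresses explicitly bound to
+		 * this endpoint can match, so walk the endpoint's own
+		 * laddr list rather than the system address lists.
+		 */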
LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { + if (sctp_is_addr_restricted(stcb, laddr->ifa)) { + continue; + } + if (laddr->ifa->address.sa.sa_family != to->sa_family) { + continue; + } + switch (to->sa_family) { +#ifdef INET + case AF_INET: + { + struct sockaddr_in *sin, *rsin; + + sin = (struct sockaddr_in *)&laddr->ifa->address.sin; + rsin = (struct sockaddr_in *)to; + if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { + SCTP_IPI_ADDR_RUNLOCK(); + return (1); + } + break; + } +#endif +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6, *rsin6; + + sin6 = (struct sockaddr_in6 *)&laddr->ifa->address.sin6; + rsin6 = (struct sockaddr_in6 *)to; + if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { + SCTP_IPI_ADDR_RUNLOCK(); + return (1); + } + break; + } + +#endif + default: + /* TSNH */ + break; + } + + } + } + SCTP_IPI_ADDR_RUNLOCK(); + return (0); +} + +/* + * rules for use + * + * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an + * stcb, both will be locked (locked_tcb and stcb) but decrement will be done + * (if locked == NULL). 3) Decrement happens on return ONLY if locked == + * NULL. + */ + +struct sctp_tcb * +sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, + struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb) +{ + struct sctpasochead *head; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb = NULL; + struct sctp_nets *net; + uint16_t rport; + + inp = *inp_p; + if (remote->sa_family == AF_INET) { + rport = (((struct sockaddr_in *)remote)->sin_port); + } else if (remote->sa_family == AF_INET6) { + rport = (((struct sockaddr_in6 *)remote)->sin6_port); + } else { + return (NULL); + } + if (locked_tcb) { + /* + * UN-lock so we can do proper locking here this occurs when + * called from load_addresses_from_init. + */ + atomic_add_int(&locked_tcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(locked_tcb); + } + SCTP_INP_INFO_RLOCK(); + if (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { + /*- + * Now either this guy is our listener or it's the + * connector. If it is the one that issued the connect, then + * it's only chance is to be the first TCB in the list. If + * it is the acceptor, then do the special_lookup to hash + * and find the real inp. + */ + if ((inp->sctp_socket) && (inp->sctp_socket->so_qlimit)) { + /* to is peer addr, from is my addr */ + stcb = sctp_tcb_special_locate(inp_p, remote, local, + netp, inp->def_vrf_id); + if ((stcb != NULL) && (locked_tcb == NULL)) { + /* we have a locked tcb, lower refcount */ + SCTP_INP_DECR_REF(inp); + } + if ((locked_tcb != NULL) && (locked_tcb != stcb)) { + SCTP_INP_RLOCK(locked_tcb->sctp_ep); + SCTP_TCB_LOCK(locked_tcb); + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + SCTP_INP_RUNLOCK(locked_tcb->sctp_ep); + } + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } else { + SCTP_INP_WLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + goto null_return; + } + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + goto null_return; + } + SCTP_TCB_LOCK(stcb); + + if (stcb->rport != rport) { + /* remote port does not match. 
*/ + SCTP_TCB_UNLOCK(stcb); + goto null_return; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_UNLOCK(stcb); + goto null_return; + } + if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { + SCTP_TCB_UNLOCK(stcb); + goto null_return; + } + /* now look at the list of remote addresses */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { +#ifdef INVARIANTS + if (net == (TAILQ_NEXT(net, sctp_next))) { + panic("Corrupt net list"); + } +#endif + if (net->ro._l_addr.sa.sa_family != + remote->sa_family) { + /* not the same family */ + continue; + } + switch (remote->sa_family) { + case AF_INET: + { + struct sockaddr_in *sin, + *rsin; + + sin = (struct sockaddr_in *) + &net->ro._l_addr; + rsin = (struct sockaddr_in *)remote; + if (sin->sin_addr.s_addr == + rsin->sin_addr.s_addr) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + if (locked_tcb == NULL) { + SCTP_INP_DECR_REF(inp); + } else if (locked_tcb != stcb) { + SCTP_TCB_LOCK(locked_tcb); + } + if (locked_tcb) { + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + } + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } + break; + } +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6, + *rsin6; + + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + rsin6 = (struct sockaddr_in6 *)remote; + if (SCTP6_ARE_ADDR_EQUAL(sin6, + rsin6)) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + if (locked_tcb == NULL) { + SCTP_INP_DECR_REF(inp); + } else if (locked_tcb != stcb) { + SCTP_TCB_LOCK(locked_tcb); + } + if (locked_tcb) { + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + } + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } + break; + } +#endif + default: + /* TSNH */ + break; + } + } + SCTP_TCB_UNLOCK(stcb); + } + } else { + SCTP_INP_WLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + goto null_return; + } + head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, + inp->sctp_hashmark)]; + if (head == NULL) { + goto null_return; + } + LIST_FOREACH(stcb, head, sctp_tcbhash) { + if (stcb->rport != rport) { + /* remote port does not match */ + continue; + } + SCTP_TCB_LOCK(stcb); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + /* now look at the list of remote addresses */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { +#ifdef INVARIANTS + if (net == (TAILQ_NEXT(net, sctp_next))) { + panic("Corrupt net list"); + } +#endif + if (net->ro._l_addr.sa.sa_family != + remote->sa_family) { + /* not the same family */ + continue; + } + switch (remote->sa_family) { + case AF_INET: + { + struct sockaddr_in *sin, + *rsin; + + sin = (struct sockaddr_in *) + &net->ro._l_addr; + rsin = (struct sockaddr_in *)remote; + if (sin->sin_addr.s_addr == + rsin->sin_addr.s_addr) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + if (locked_tcb == NULL) { + SCTP_INP_DECR_REF(inp); + } else if (locked_tcb != stcb) { + SCTP_TCB_LOCK(locked_tcb); + } + if (locked_tcb) { + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + } + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } + break; + } +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6, + *rsin6; + + sin6 = (struct sockaddr_in6 *) + &net->ro._l_addr; + rsin6 = (struct sockaddr_in6 *)remote; + if (SCTP6_ARE_ADDR_EQUAL(sin6, + rsin6)) { + /* found it */ + if (netp != NULL) { + *netp = net; + } + if 
(locked_tcb == NULL) { + SCTP_INP_DECR_REF(inp); + } else if (locked_tcb != stcb) { + SCTP_TCB_LOCK(locked_tcb); + } + if (locked_tcb) { + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + } + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } + break; + } +#endif + default: + /* TSNH */ + break; + } + } + SCTP_TCB_UNLOCK(stcb); + } + } +null_return: + /* clean up for returning null */ + if (locked_tcb) { + SCTP_TCB_LOCK(locked_tcb); + atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); + } + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + /* not found */ + return (NULL); +} + +/* + * Find an association for a specific endpoint using the association id given + * out in the COMM_UP notification + */ + +struct sctp_tcb * +sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) +{ + /* + * Use the assoc_id to find an endpoint + */ + struct sctpasochead *head; + struct sctp_tcb *stcb; + uint32_t id; + + if (inp == NULL) { + SCTP_PRINTF("TSNH ep_associd\n"); + return (NULL); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + SCTP_PRINTF("TSNH ep_associd0\n"); + return (NULL); + } + id = (uint32_t) asoc_id; + head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; + if (head == NULL) { + /* invalid id TSNH */ + SCTP_PRINTF("TSNH ep_associd1\n"); + return (NULL); + } + LIST_FOREACH(stcb, head, sctp_tcbasocidhash) { + if (stcb->asoc.assoc_id == id) { + if (inp != stcb->sctp_ep) { + /* + * some other guy has the same id active (id + * collision ??). + */ + SCTP_PRINTF("TSNH ep_associd2\n"); + continue; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + continue; + } + if (want_lock) { + SCTP_TCB_LOCK(stcb); + } + return (stcb); + } + } + return (NULL); +} + + +struct sctp_tcb * +sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) +{ + struct sctp_tcb *stcb; + + SCTP_INP_RLOCK(inp); + stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock); + SCTP_INP_RUNLOCK(inp); + return (stcb); +} + + +static struct sctp_inpcb * +sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, + uint16_t lport, uint32_t vrf_id) +{ + struct sctp_inpcb *inp; + struct sockaddr_in *sin; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + +#endif + struct sctp_laddr *laddr; + +#ifdef INET6 + struct sockaddr_in6 *intf_addr6; + +#endif + + int fnd; + + /* + * Endpoint probe expects that the INP_INFO is locked. + */ + sin = NULL; +#ifdef INET6 + sin6 = NULL; +#endif + switch (nam->sa_family) { + case AF_INET: + sin = (struct sockaddr_in *)nam; + break; +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)nam; + break; +#endif + default: + /* unsupported family */ + return (NULL); + } + + if (head == NULL) + return (NULL); + + LIST_FOREACH(inp, head, sctp_hash) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + SCTP_INP_RUNLOCK(inp); + continue; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) && + (inp->sctp_lport == lport)) { + /* got it */ + if ((nam->sa_family == AF_INET) && + (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp)) { + /* IPv4 on an IPv6 socket with ONLY IPv6 set */ + SCTP_INP_RUNLOCK(inp); + continue; + } + /* A V6 address and the endpoint is NOT bound V6 */ + if (nam->sa_family == AF_INET6 && + (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { + SCTP_INP_RUNLOCK(inp); + continue; + } + /* does a VRF id match?
*/ + fnd = 0; + if (inp->def_vrf_id == vrf_id) + fnd = 1; + + SCTP_INP_RUNLOCK(inp); + if (!fnd) + continue; + return (inp); + } + SCTP_INP_RUNLOCK(inp); + } + if ((nam->sa_family == AF_INET) && + (sin->sin_addr.s_addr == INADDR_ANY)) { + /* Can't hunt for one that has no address specified */ + return (NULL); + } +#ifdef INET6 + if ((nam->sa_family == AF_INET6) && + (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))) { + /* Can't hunt for one that has no address specified */ + return (NULL); + } +#endif + /* + * ok, not bound to all so see if we can find a EP bound to this + * address. + */ + LIST_FOREACH(inp, head, sctp_hash) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + SCTP_INP_RUNLOCK(inp); + continue; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) { + SCTP_INP_RUNLOCK(inp); + continue; + } + /* + * Ok this could be a likely candidate, look at all of its + * addresses + */ + if (inp->sctp_lport != lport) { + SCTP_INP_RUNLOCK(inp); + continue; + } + /* does a VRF id match? */ + fnd = 0; + if (inp->def_vrf_id == vrf_id) + fnd = 1; + + if (!fnd) { + SCTP_INP_RUNLOCK(inp); + continue; + } + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", + __FUNCTION__); + continue; + } + SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", + laddr->ifa); + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { + SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n"); + continue; + } + if (laddr->ifa->address.sa.sa_family == nam->sa_family) { + /* possible, see if it matches */ + struct sockaddr_in *intf_addr; + + intf_addr = &laddr->ifa->address.sin; + switch (nam->sa_family) { + case AF_INET: + if (sin->sin_addr.s_addr == + intf_addr->sin_addr.s_addr) { + SCTP_INP_RUNLOCK(inp); + return (inp); + } + break; +#ifdef INET6 + case AF_INET6: + intf_addr6 = &laddr->ifa->address.sin6; + if (SCTP6_ARE_ADDR_EQUAL(sin6, + intf_addr6)) { + SCTP_INP_RUNLOCK(inp); + return (inp); + } + break; +#endif + } + } + } + SCTP_INP_RUNLOCK(inp); + } + return (NULL); +} + + +static struct sctp_inpcb * +sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id) +{ + struct sctppcbhead *head; + struct sctp_inpcb *t_inp; + int fnd; + + head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, + SCTP_BASE_INFO(hashmark))]; + LIST_FOREACH(t_inp, head, sctp_hash) { + if (t_inp->sctp_lport != lport) { + continue; + } + /* is it in the VRF in question */ + fnd = 0; + if (t_inp->def_vrf_id == vrf_id) + fnd = 1; + if (!fnd) + continue; + + /* This one is in use. 
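
sctp_endpoint_probe() above makes two passes over a single hash chain: the first pass accepts any bound-all endpoint listening on the port (subject to the v4/v6 rules), and only when that fails does it re-walk the chain matching the target address against each endpoint's bound-address list. A condensed sketch of that control flow, under hypothetical names:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct ep {
	struct ep *next;
	uint16_t lport;
	bool bound_all;		/* stands in for SCTP_PCB_FLAGS_BOUNDALL */
	bool (*owns_addr)(const struct ep *, const void *addr);
};

static struct ep *
probe(struct ep *chain, uint16_t lport, const void *addr)
{
	struct ep *e;

	/* pass 1: a wildcard (bound-all) endpoint on the port wins outright */
	for (e = chain; e != NULL; e = e->next)
		if (e->bound_all && e->lport == lport)
			return (e);
	/* pass 2: fall back to endpoints bound to specific addresses */
	for (e = chain; e != NULL; e = e->next)
		if (!e->bound_all && e->lport == lport && e->owns_addr(e, addr))
			return (e);
	return (NULL);
}
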
*/ + /* check the v6/v4 binding issue */ + if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(t_inp)) { + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + /* collision in V6 space */ + return (t_inp); + } else { + /* inp is BOUND_V4 no conflict */ + continue; + } + } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + /* t_inp is bound v4 and v6, conflict always */ + return (t_inp); + } else { + /* t_inp is bound only V4 */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp)) { + /* no conflict */ + continue; + } + /* else fall through to conflict */ + } + return (t_inp); + } + return (NULL); +} + + +int +sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) +{ + /* For 1-2-1 with port reuse */ + struct sctppcbhead *head; + struct sctp_inpcb *tinp; + + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { + /* only works with port reuse on */ + return (-1); + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) { + return (0); + } + SCTP_INP_RUNLOCK(inp); + head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, + SCTP_BASE_INFO(hashmark))]; + /* Kick out all non-listeners to the TCP hash */ + LIST_FOREACH(tinp, head, sctp_hash) { + if (tinp->sctp_lport != inp->sctp_lport) { + continue; + } + if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + continue; + } + if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + continue; + } + if (tinp->sctp_socket->so_qlimit) { + continue; + } + SCTP_INP_WLOCK(tinp); + LIST_REMOVE(tinp, sctp_hash); + head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))]; + tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; + LIST_INSERT_HEAD(head, tinp, sctp_hash); + SCTP_INP_WUNLOCK(tinp); + } + SCTP_INP_WLOCK(inp); + /* Pull from where he was */ + LIST_REMOVE(inp, sctp_hash); + inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL; + head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; + LIST_INSERT_HEAD(head, inp, sctp_hash); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_RLOCK(inp); + return (0); +} + + +struct sctp_inpcb * +sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock, + uint32_t vrf_id) +{ + /* + * First we check the hash table to see if someone has this port + * bound with just the port. + */ + struct sctp_inpcb *inp; + struct sctppcbhead *head; + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + int lport; + unsigned int i; + + if (nam->sa_family == AF_INET) { + sin = (struct sockaddr_in *)nam; + lport = ((struct sockaddr_in *)nam)->sin_port; + } else if (nam->sa_family == AF_INET6) { + sin6 = (struct sockaddr_in6 *)nam; + lport = ((struct sockaddr_in6 *)nam)->sin6_port; + } else { + /* unsupported family */ + return (NULL); + } + /* + * I could cheat here and just cast to one of the types but we will + * do it right. It also provides the check against an Unsupported + * type too. + */ + /* Find the head of the ALLADDR chain */ + if (have_lock == 0) { + SCTP_INP_INFO_RLOCK(); + } + head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, + SCTP_BASE_INFO(hashmark))]; + inp = sctp_endpoint_probe(nam, head, lport, vrf_id); + + /* + * If the TCP model exists it could be that the main listening + * endpoint is gone but there still exists a connected socket for + * this guy. If so we can return the first one that we find. This + * may NOT be the correct one so the caller should be wary on the + * returned INP. 
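
The v4/v6 arbitration in sctp_isport_inuse() above boils down to a small truth table: a V6ONLY binding collides only with other v6-capable bindings, a dual-stack v6 binding collides with everything on the port, and a v4-only binding lets a V6ONLY newcomer through. The same rules as a standalone predicate (hypothetical helper, not part of this patch):

#include <stdbool.h>

/* Returns true when a new binding (n_*) collides with an existing
 * one (t_*) on the same port and VRF, following the cases above. */
static bool
binding_conflicts(bool t_bound_v6, bool t_v6only,
    bool n_bound_v6, bool n_v6only)
{
	if (t_bound_v6 && t_v6only)
		return (n_bound_v6);	/* collision only inside v6 space */
	if (t_bound_v6)
		return (true);		/* dual-stack: conflicts always */
	/* existing binding is v4-only: a V6ONLY newcomer is fine */
	return (!(n_bound_v6 && n_v6only));
}
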
Currently the only caller that sets find_tcp_pool + * is in bindx where we are verifying that a user CAN bind the + * address. He either has bound it already, or someone else has, or + * its open to bind, so this is good enough. + */ + if (inp == NULL && find_tcp_pool) { + for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) { + head = &SCTP_BASE_INFO(sctp_tcpephash)[i]; + inp = sctp_endpoint_probe(nam, head, lport, vrf_id); + if (inp) { + break; + } + } + } + if (inp) { + SCTP_INP_INCR_REF(inp); + } + if (have_lock == 0) { + SCTP_INP_INFO_RUNLOCK(); + } + return (inp); +} + +/* + * Find an association for an endpoint with the pointer to whom you want to + * send to and the endpoint pointer. The address can be IPv4 or IPv6. We may + * need to change the *to to some other struct like a mbuf... + */ +struct sctp_tcb * +sctp_findassociation_addr_sa(struct sockaddr *to, struct sockaddr *from, + struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool, + uint32_t vrf_id) +{ + struct sctp_inpcb *inp = NULL; + struct sctp_tcb *retval; + + SCTP_INP_INFO_RLOCK(); + if (find_tcp_pool) { + if (inp_p != NULL) { + retval = sctp_tcb_special_locate(inp_p, from, to, netp, + vrf_id); + } else { + retval = sctp_tcb_special_locate(&inp, from, to, netp, + vrf_id); + } + if (retval != NULL) { + SCTP_INP_INFO_RUNLOCK(); + return (retval); + } + } + inp = sctp_pcb_findep(to, 0, 1, vrf_id); + if (inp_p != NULL) { + *inp_p = inp; + } + SCTP_INP_INFO_RUNLOCK(); + + if (inp == NULL) { + return (NULL); + } + /* + * ok, we have an endpoint, now lets find the assoc for it (if any) + * we now place the source address or from in the to of the find + * endpoint call. Since in reality this chain is used from the + * inbound packet side. + */ + if (inp_p != NULL) { + retval = sctp_findassociation_ep_addr(inp_p, from, netp, to, + NULL); + } else { + retval = sctp_findassociation_ep_addr(&inp, from, netp, to, + NULL); + } + return retval; +} + + +/* + * This routine will grub through the mbuf that is a INIT or INIT-ACK and + * find all addresses that the sender has specified in any address list. Each + * address will be used to lookup the TCB and see if one exits. 
+ */ +static struct sctp_tcb * +sctp_findassociation_special_addr(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, + struct sockaddr *dest) +{ + struct sockaddr_in sin4; + struct sockaddr_in6 sin6; + struct sctp_paramhdr *phdr, parm_buf; + struct sctp_tcb *retval; + uint32_t ptype, plen; + + memset(&sin4, 0, sizeof(sin4)); + memset(&sin6, 0, sizeof(sin6)); + sin4.sin_len = sizeof(sin4); + sin4.sin_family = AF_INET; + sin4.sin_port = sh->src_port; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_family = AF_INET6; + sin6.sin6_port = sh->src_port; + + retval = NULL; + offset += sizeof(struct sctp_init_chunk); + + phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); + while (phdr != NULL) { + /* now we must see if we want the parameter */ + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + if (plen == 0) { + break; + } + if (ptype == SCTP_IPV4_ADDRESS && + plen == sizeof(struct sctp_ipv4addr_param)) { + /* Get the rest of the address */ + struct sctp_ipv4addr_param ip4_parm, *p4; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&ip4_parm, min(plen, sizeof(ip4_parm))); + if (phdr == NULL) { + return (NULL); + } + p4 = (struct sctp_ipv4addr_param *)phdr; + memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr)); + /* look it up */ + retval = sctp_findassociation_ep_addr(inp_p, + (struct sockaddr *)&sin4, netp, dest, NULL); + if (retval != NULL) { + return (retval); + } + } else if (ptype == SCTP_IPV6_ADDRESS && + plen == sizeof(struct sctp_ipv6addr_param)) { + /* Get the rest of the address */ + struct sctp_ipv6addr_param ip6_parm, *p6; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&ip6_parm, min(plen, sizeof(ip6_parm))); + if (phdr == NULL) { + return (NULL); + } + p6 = (struct sctp_ipv6addr_param *)phdr; + memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr)); + /* look it up */ + retval = sctp_findassociation_ep_addr(inp_p, + (struct sockaddr *)&sin6, netp, dest, NULL); + if (retval != NULL) { + return (retval); + } + } + offset += SCTP_SIZE32(plen); + phdr = sctp_get_next_param(m, offset, &parm_buf, + sizeof(parm_buf)); + } + return (NULL); +} + +static struct sctp_tcb * +sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag, + struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport, + uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag) +{ + /* + * Use my vtag to hash. If we find it we then verify the source addr + * is in the assoc. If all goes well we save a bit on rec of a + * packet. 
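
sctp_findassociation_special_addr() above is a standard SCTP TLV walk: read a parameter header, check type and length, and advance by the length rounded up to a 4-byte boundary, which is what SCTP_SIZE32() does. A hedged sketch of the same walk over a flat buffer; the kernel version has to go through sctp_get_next_param() because the chunk lives in an mbuf chain:

#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>	/* ntohs() */

struct param_hdr {	/* mirrors struct sctp_paramhdr */
	uint16_t type;
	uint16_t length;	/* includes this header, network order */
};

#define PAD4(x)	(((x) + 3u) & ~3u)	/* same idea as SCTP_SIZE32() */

static const struct param_hdr *
find_param(const uint8_t *buf, size_t len, uint16_t want_type)
{
	size_t off = 0;

	while (off + sizeof(struct param_hdr) <= len) {
		const struct param_hdr *ph = (const void *)(buf + off);
		uint16_t plen = ntohs(ph->length);

		/* stop on malformed input, as the kernel does on plen == 0 */
		if (plen < sizeof(*ph) || off + plen > len)
			break;
		if (ntohs(ph->type) == want_type)
			return (ph);
		off += PAD4(plen);	/* parameters are 32-bit aligned */
	}
	return (NULL);
}
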
+ */ + struct sctpasochead *head; + struct sctp_nets *net; + struct sctp_tcb *stcb; + + *netp = NULL; + *inp_p = NULL; + SCTP_INP_INFO_RLOCK(); + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, + SCTP_BASE_INFO(hashasocmark))]; + if (head == NULL) { + /* invalid vtag */ + SCTP_INP_INFO_RUNLOCK(); + return (NULL); + } + LIST_FOREACH(stcb, head, sctp_asocs) { + SCTP_INP_RLOCK(stcb->sctp_ep); + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + SCTP_INP_RUNLOCK(stcb->sctp_ep); + continue; + } + SCTP_TCB_LOCK(stcb); + SCTP_INP_RUNLOCK(stcb->sctp_ep); + if (stcb->asoc.my_vtag == vtag) { + /* candidate */ + if (stcb->rport != rport) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + if (stcb->sctp_ep->sctp_lport != lport) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_UNLOCK(stcb); + continue; + } + /* RRS:Need toaddr check here */ + if (sctp_does_stcb_own_this_addr(stcb, to) == 0) { + /* Endpoint does not own this address */ + SCTP_TCB_UNLOCK(stcb); + continue; + } + if (remote_tag) { + /* + * If we have both vtags that's all we match + * on + */ + if (stcb->asoc.peer_vtag == remote_tag) { + /* + * If both tags match we consider it + * conclusive and check NO + * source/destination addresses + */ + goto conclusive; + } + } + if (skip_src_check) { + conclusive: + if (from) { + net = sctp_findnet(stcb, from); + } else { + *netp = NULL; /* unknown */ + } + if (inp_p) + *inp_p = stcb->sctp_ep; + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } + net = sctp_findnet(stcb, from); + if (net) { + /* yep its him. */ + *netp = net; + SCTP_STAT_INCR(sctps_vtagexpress); + *inp_p = stcb->sctp_ep; + SCTP_INP_INFO_RUNLOCK(); + return (stcb); + } else { + /* + * not him, this should only happen in rare + * cases so I peg it. + */ + SCTP_STAT_INCR(sctps_vtagbogus); + } + } + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_INFO_RUNLOCK(); + return (NULL); +} + +/* + * Find an association with the pointer to the inbound IP packet. This can be + * a IPv4 or IPv6 packet. + */ +struct sctp_tcb * +sctp_findassociation_addr(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_chunkhdr *ch, + struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) +{ + int find_tcp_pool; + struct ip *iph; + struct sctp_tcb *retval; + struct sockaddr_storage to_store, from_store; + struct sockaddr *to = (struct sockaddr *)&to_store; + struct sockaddr *from = (struct sockaddr *)&from_store; + struct sctp_inpcb *inp; + + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + { + /* its IPv4 */ + struct sockaddr_in *from4; + + from4 = (struct sockaddr_in *)&from_store; + bzero(from4, sizeof(*from4)); + from4->sin_family = AF_INET; + from4->sin_len = sizeof(struct sockaddr_in); + from4->sin_addr.s_addr = iph->ip_src.s_addr; + from4->sin_port = sh->src_port; + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + /* its IPv6 */ + struct ip6_hdr *ip6; + struct sockaddr_in6 *from6; + + ip6 = mtod(m, struct ip6_hdr *); + from6 = (struct sockaddr_in6 *)&from_store; + bzero(from6, sizeof(*from6)); + from6->sin6_family = AF_INET6; + from6->sin6_len = sizeof(struct sockaddr_in6); + from6->sin6_addr = ip6->ip6_src; + from6->sin6_port = sh->src_port; + /* Get the scopes in properly to the sin6 addr's */ + /* we probably don't need these operations */ + (void)sa6_recoverscope(from6); + sa6_embedscope(from6, MODULE_GLOBAL(ip6_use_defzone)); + break; + } +#endif + default: + /* Currently not supported. 
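
sctp_findassoc_by_vtag() above orders its checks from cheap to expensive: verification tag, then ports, then liveness, and only then address ownership, with the remote_tag case short-circuiting the address check entirely. The candidate filter in isolation (hypothetical struct, same ordering; the expensive address verification is left to the caller):

#include <stdbool.h>
#include <stdint.h>

struct tag_key {
	uint32_t my_vtag, peer_vtag;
	uint16_t rport, lport;
	bool about_to_be_freed;
};

static bool
vtag_candidate(const struct tag_key *k, uint32_t vtag,
    uint16_t rport, uint16_t lport, uint32_t remote_tag,
    bool *conclusive)
{
	/* cheap integer comparisons first */
	if (k->my_vtag != vtag || k->rport != rport || k->lport != lport)
		return (false);
	if (k->about_to_be_freed)
		return (false);
	/* when both tags match, the source address need not be checked */
	*conclusive = (remote_tag != 0 && k->peer_vtag == remote_tag);
	return (true);
}
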
*/ + return (NULL); + } + + + switch (iph->ip_v) { + case IPVERSION: + { + /* its IPv4 */ + struct sockaddr_in *to4; + + to4 = (struct sockaddr_in *)&to_store; + bzero(to4, sizeof(*to4)); + to4->sin_family = AF_INET; + to4->sin_len = sizeof(struct sockaddr_in); + to4->sin_addr.s_addr = iph->ip_dst.s_addr; + to4->sin_port = sh->dest_port; + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + /* its IPv6 */ + struct ip6_hdr *ip6; + struct sockaddr_in6 *to6; + + ip6 = mtod(m, struct ip6_hdr *); + to6 = (struct sockaddr_in6 *)&to_store; + bzero(to6, sizeof(*to6)); + to6->sin6_family = AF_INET6; + to6->sin6_len = sizeof(struct sockaddr_in6); + to6->sin6_addr = ip6->ip6_dst; + to6->sin6_port = sh->dest_port; + /* Get the scopes in properly to the sin6 addr's */ + /* we probably don't need these operations */ + (void)sa6_recoverscope(to6); + sa6_embedscope(to6, MODULE_GLOBAL(ip6_use_defzone)); + break; + } +#endif + default: + /* TSNH */ + break; + } + if (sh->v_tag) { + /* we only go down this path if vtag is non-zero */ + retval = sctp_findassoc_by_vtag(from, to, ntohl(sh->v_tag), + inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0); + if (retval) { + return (retval); + } + } + find_tcp_pool = 0; + if ((ch->chunk_type != SCTP_INITIATION) && + (ch->chunk_type != SCTP_INITIATION_ACK) && + (ch->chunk_type != SCTP_COOKIE_ACK) && + (ch->chunk_type != SCTP_COOKIE_ECHO)) { + /* Other chunk types go to the tcp pool. */ + find_tcp_pool = 1; + } + if (inp_p) { + retval = sctp_findassociation_addr_sa(to, from, inp_p, netp, + find_tcp_pool, vrf_id); + inp = *inp_p; + } else { + retval = sctp_findassociation_addr_sa(to, from, &inp, netp, + find_tcp_pool, vrf_id); + } + SCTPDBG(SCTP_DEBUG_PCB1, "retval:%p inp:%p\n", retval, inp); + if (retval == NULL && inp) { + /* Found a EP but not this address */ + if ((ch->chunk_type == SCTP_INITIATION) || + (ch->chunk_type == SCTP_INITIATION_ACK)) { + /*- + * special hook, we do NOT return linp or an + * association that is linked to an existing + * association that is under the TCP pool (i.e. no + * listener exists). The endpoint finding routine + * will always find a listener before examining the + * TCP pool. + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) { + if (inp_p) { + *inp_p = NULL; + } + return (NULL); + } + retval = sctp_findassociation_special_addr(m, iphlen, + offset, sh, &inp, netp, to); + if (inp_p != NULL) { + *inp_p = inp; + } + } + } + SCTPDBG(SCTP_DEBUG_PCB1, "retval is %p\n", retval); + return (retval); +} + +/* + * lookup an association by an ASCONF lookup address. + * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup + */ +struct sctp_tcb * +sctp_findassociation_ep_asconf(struct mbuf *m, int iphlen, int offset, + struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) +{ + struct sctp_tcb *stcb; + struct sockaddr_in *sin; + +#ifdef INET6 + struct sockaddr_in6 *sin6; + +#endif + struct sockaddr_storage local_store, remote_store; + struct sockaddr *to; + struct ip *iph; + +#ifdef INET6 + struct ip6_hdr *ip6; + +#endif + struct sctp_paramhdr parm_buf, *phdr; + int ptype; + int zero_address = 0; + + + memset(&local_store, 0, sizeof(local_store)); + memset(&remote_store, 0, sizeof(remote_store)); + to = (struct sockaddr *)&local_store; + /* First get the destination address setup too. 
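
Both address-extraction switches above dispatch on iph->ip_v, which works for v4 and v6 alike because the version number sits in the high nibble of the first header byte in both protocols; that is also why the IPv6 case is written as IPV6_VERSION >> 4. A tiny illustration over a raw buffer:

#include <stdint.h>

enum ip_kind { IPK_V4, IPK_V6, IPK_UNKNOWN };

/* The version nibble is the top 4 bits of byte 0 in both IPv4 and
 * IPv6 headers, so one byte is enough to classify the packet. */
static enum ip_kind
classify(const uint8_t *pkt)
{
	switch (pkt[0] >> 4) {
	case 4:
		return (IPK_V4);	/* IPVERSION */
	case 6:
		return (IPK_V6);	/* IPV6_VERSION >> 4 */
	default:
		return (IPK_UNKNOWN);	/* the lookup code returns NULL here */
	}
}
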
*/ + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + /* its IPv4 */ + sin = (struct sockaddr_in *)&local_store; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = sh->dest_port; + sin->sin_addr.s_addr = iph->ip_dst.s_addr; + break; +#ifdef INET6 + case IPV6_VERSION >> 4: + /* its IPv6 */ + ip6 = mtod(m, struct ip6_hdr *); + sin6 = (struct sockaddr_in6 *)&local_store; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_port = sh->dest_port; + sin6->sin6_addr = ip6->ip6_dst; + break; +#endif + default: + return NULL; + } + + phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), + &parm_buf, sizeof(struct sctp_paramhdr)); + if (phdr == NULL) { + SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", + __FUNCTION__); + return NULL; + } + ptype = (int)((uint32_t) ntohs(phdr->param_type)); + /* get the correlation address */ + switch (ptype) { +#ifdef INET6 + case SCTP_IPV6_ADDRESS: + { + /* ipv6 address param */ + struct sctp_ipv6addr_param *p6, p6_buf; + + if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) { + return NULL; + } + p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m, + offset + sizeof(struct sctp_asconf_chunk), + &p6_buf.ph, sizeof(*p6)); + if (p6 == NULL) { + SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", + __FUNCTION__); + return (NULL); + } + sin6 = (struct sockaddr_in6 *)&remote_store; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_port = sh->src_port; + memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr)); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + zero_address = 1; + break; + } +#endif + case SCTP_IPV4_ADDRESS: + { + /* ipv4 address param */ + struct sctp_ipv4addr_param *p4, p4_buf; + + if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) { + return NULL; + } + p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m, + offset + sizeof(struct sctp_asconf_chunk), + &p4_buf.ph, sizeof(*p4)); + if (p4 == NULL) { + SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", + __FUNCTION__); + return (NULL); + } + sin = (struct sockaddr_in *)&remote_store; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = sh->src_port; + memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr)); + if (sin->sin_addr.s_addr == INADDR_ANY) + zero_address = 1; + break; + } + default: + /* invalid address param type */ + return NULL; + } + + if (zero_address) { + stcb = sctp_findassoc_by_vtag(NULL, to, ntohl(sh->v_tag), inp_p, + netp, sh->src_port, sh->dest_port, 1, vrf_id, 0); + /* + * printf("findassociation_ep_asconf: zero lookup address + * finds stcb 0x%x\n", (uint32_t)stcb); + */ + } else { + stcb = sctp_findassociation_ep_addr(inp_p, + (struct sockaddr *)&remote_store, netp, + to, NULL); + } + return (stcb); +} + + +/* + * allocate a sctp_inpcb and setup a temporary binding to a port/all + * addresses. This way if we don't get a bind we by default pick a ephemeral + * port with all addresses bound. + */ +int +sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) +{ + /* + * we get called when a new endpoint starts up. We need to allocate + * the sctp_inpcb structure from the zone and init it. Mark it as + * unbound and find a port that we can use as an ephemeral with + * INADDR_ANY. If the user binds later no problem we can then add in + * the specific addresses. And setup the default parameters for the + * EP. 
+ */ + int i, error; + struct sctp_inpcb *inp; + struct sctp_pcb *m; + struct timeval time; + sctp_sharedkey_t *null_key; + + error = 0; + + SCTP_INP_INFO_WLOCK(); + inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb); + if (inp == NULL) { + SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n"); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); + return (ENOBUFS); + } + /* zap it */ + bzero(inp, sizeof(*inp)); + + /* bump generations */ + /* setup socket pointers */ + inp->sctp_socket = so; + inp->ip_inp.inp.inp_socket = so; + inp->sctp_associd_counter = 1; + inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; + inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); + /* init the small hash table we use to track asocid <-> tcb */ + inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); + if (inp->sctp_asocidhash == NULL) { + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); + SCTP_INP_INFO_WUNLOCK(); + return (ENOBUFS); + } +#ifdef IPSEC + { + struct inpcbpolicy *pcb_sp = NULL; + + error = ipsec_init_policy(so, &pcb_sp); + /* Arrange to share the policy */ + inp->ip_inp.inp.inp_sp = pcb_sp; + ((struct in6pcb *)(&inp->ip_inp.inp))->in6p_sp = pcb_sp; + } + if (error != 0) { + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); + SCTP_INP_INFO_WUNLOCK(); + return error; + } +#endif /* IPSEC */ + SCTP_INCR_EP_COUNT(); + inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); + SCTP_INP_INFO_WUNLOCK(); + + so->so_pcb = (caddr_t)inp; + + if ((SCTP_SO_TYPE(so) == SOCK_DGRAM) || + (SCTP_SO_TYPE(so) == SOCK_SEQPACKET)) { + /* UDP style socket */ + inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | + SCTP_PCB_FLAGS_UNBOUND); + /* Be sure it is NON-BLOCKING IO for UDP */ + /* SCTP_SET_SO_NBIO(so); */ + } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) { + /* TCP style socket */ + inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | + SCTP_PCB_FLAGS_UNBOUND); + /* Be sure we have blocking IO by default */ + SCTP_CLEAR_SO_NBIO(so); + } else { + /* + * unsupported socket type (RAW, etc)- in case we missed it + * in protosw + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); + so->so_pcb = NULL; + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); + return (EOPNOTSUPP); + } + if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); + sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); + } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); + sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); + } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); + sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); + } + inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize), + &inp->sctp_hashmark); + if (inp->sctp_tcbhash == NULL) { + SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); + so->so_pcb = NULL; + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); + return (ENOBUFS); + } + inp->def_vrf_id = vrf_id; + + SCTP_INP_INFO_WLOCK(); + SCTP_INP_LOCK_INIT(inp); + INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp"); + SCTP_INP_READ_INIT(inp); + SCTP_ASOC_CREATE_LOCK_INIT(inp); + /* lock the new ep 
*/ + SCTP_INP_WLOCK(inp); + + /* add it to the info area */ + LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list); + SCTP_INP_INFO_WUNLOCK(); + + TAILQ_INIT(&inp->read_queue); + LIST_INIT(&inp->sctp_addr_list); + + LIST_INIT(&inp->sctp_asoc_list); + +#ifdef SCTP_TRACK_FREED_ASOCS + /* TEMP CODE */ + LIST_INIT(&inp->sctp_asoc_free_list); +#endif + /* Init the timer structure for signature change */ + SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer); + inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE; + + /* now init the actual endpoint default data */ + m = &inp->sctp_ep; + + /* setup the base timeout information */ + m->sctp_timeoutticks[SCTP_TIMER_SEND] = SEC_TO_TICKS(SCTP_SEND_SEC); /* needed ? */ + m->sctp_timeoutticks[SCTP_TIMER_INIT] = SEC_TO_TICKS(SCTP_INIT_SEC); /* needed ? */ + m->sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default)); + m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default)); + m->sctp_timeoutticks[SCTP_TIMER_PMTU] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default)); + m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default)); + m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = SEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default)); + /* all max/min max are in ms */ + m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default); + m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default); + m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); + m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); + m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); + + m->max_open_streams_intome = MAX_SCTP_STREAMS; + + m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); + m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); + m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); + m->sctp_sws_sender = SCTP_SWS_SENDER_DEF; + m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF; + m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default); + if ((SCTP_BASE_SYSCTL(sctp_default_cc_module) >= SCTP_CC_RFC2581) && + (SCTP_BASE_SYSCTL(sctp_default_cc_module) <= SCTP_CC_HTCP)) { + m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); + } else { + /* sysctl done with invalid value, set to 2581 */ + m->sctp_default_cc_module = SCTP_CC_RFC2581; + } + /* number of streams to pre-open on a association */ + m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); + + /* Add adaptation cookie */ + m->adaptation_layer_indicator = 0x504C5253; + + /* seed random number generator */ + m->random_counter = 1; + m->store_at = SCTP_SIGNATURE_SIZE; + SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers)); + sctp_fill_random_store(m); + + /* Minimum cookie size */ + m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) + + sizeof(struct sctp_state_cookie); + m->size_of_a_cookie += SCTP_SIGNATURE_SIZE; + + /* Setup the initial secret */ + (void)SCTP_GETTIME_TIMEVAL(&time); + m->time_of_secret_change = time.tv_sec; + + for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { + m->secret_key[0][i] = sctp_select_initial_TSN(m); + } + sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); + + /* How long is a cookie good for ? 
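
The timeout defaults above are stored in kernel ticks, so the sysctl values (seconds or milliseconds) pass through SEC_TO_TICKS() and MSEC_TO_TICKS(). Assuming the usual hz-based macro definitions (the authoritative ones live in sctp_os_bsd.h), the conversion is simply:

#include <stdint.h>

static const int hz = 1000;	/* kern.hz; 1000 on many FreeBSD configs */

/* Plausible expansions of the two macros; treat these as an
 * assumption, not the patch's definitions. */
static inline uint32_t
sec_to_ticks(uint32_t s)
{
	return (s * (uint32_t)hz);
}

static inline uint32_t
msec_to_ticks(uint32_t ms)
{
	return ((ms * (uint32_t)hz) / 1000);
}
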
*/ + m->def_cookie_life = MSEC_TO_TICKS(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default)); + /* + * Initialize authentication parameters + */ + m->local_hmacs = sctp_default_supported_hmaclist(); + m->local_auth_chunks = sctp_alloc_chunklist(); + sctp_auth_set_default_chunks(m->local_auth_chunks); + LIST_INIT(&m->shared_keys); + /* add default NULL key as key id 0 */ + null_key = sctp_alloc_sharedkey(); + sctp_insert_sharedkey(&m->shared_keys, null_key); + SCTP_INP_WUNLOCK(inp); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 12); +#endif + return (error); +} + + +void +sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, + struct sctp_tcb *stcb) +{ + struct sctp_nets *net; + uint16_t lport, rport; + struct sctppcbhead *head; + struct sctp_laddr *laddr, *oladdr; + + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(old_inp); + SCTP_INP_WLOCK(new_inp); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + + new_inp->sctp_ep.time_of_secret_change = + old_inp->sctp_ep.time_of_secret_change; + memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key, + sizeof(old_inp->sctp_ep.secret_key)); + new_inp->sctp_ep.current_secret_number = + old_inp->sctp_ep.current_secret_number; + new_inp->sctp_ep.last_secret_number = + old_inp->sctp_ep.last_secret_number; + new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie; + + /* make it so new data pours into the new socket */ + stcb->sctp_socket = new_inp->sctp_socket; + stcb->sctp_ep = new_inp; + + /* Copy the port across */ + lport = new_inp->sctp_lport = old_inp->sctp_lport; + rport = stcb->rport; + /* Pull the tcb from the old association */ + LIST_REMOVE(stcb, sctp_tcbhash); + LIST_REMOVE(stcb, sctp_tcblist); + if (stcb->asoc.in_asocid_hash) { + LIST_REMOVE(stcb, sctp_tcbasocidhash); + } + /* Now insert the new_inp into the TCP connected hash */ + head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; + + LIST_INSERT_HEAD(head, new_inp, sctp_hash); + /* Its safe to access */ + new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; + + /* Now move the tcb into the endpoint list */ + LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist); + /* + * Question, do we even need to worry about the ep-hash since we + * only have one connection? Probably not :> so lets get rid of it + * and not suck up any kernel memory in that. + */ + if (stcb->asoc.in_asocid_hash) { + struct sctpasochead *lhd; + + lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id, + new_inp->hashasocidmark)]; + LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash); + } + /* Ok. Let's restart timer. */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp, + stcb, net); + } + + SCTP_INP_INFO_WUNLOCK(); + if (new_inp->sctp_tcbhash != NULL) { + SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark); + new_inp->sctp_tcbhash = NULL; + } + if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { + /* Subset bound, so copy in the laddr list from the old_inp */ + LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) { + laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (laddr == NULL) { + /* + * Gak, what can we do? This assoc is really + * HOSED. We probably should send an abort + * here. 
+ */ + SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n"); + continue; + } + SCTP_INCR_LADDR_COUNT(); + bzero(laddr, sizeof(*laddr)); + (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); + laddr->ifa = oladdr->ifa; + atomic_add_int(&laddr->ifa->refcount, 1); + LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr, + sctp_nxt_addr); + new_inp->laddr_count++; + } + } + /* + * Now any running timers need to be adjusted since we really don't + * care if they are running or not just blast in the new_inp into + * all of them. + */ + + stcb->asoc.hb_timer.ep = (void *)new_inp; + stcb->asoc.dack_timer.ep = (void *)new_inp; + stcb->asoc.asconf_timer.ep = (void *)new_inp; + stcb->asoc.strreset_timer.ep = (void *)new_inp; + stcb->asoc.shut_guard_timer.ep = (void *)new_inp; + stcb->asoc.autoclose_timer.ep = (void *)new_inp; + stcb->asoc.delayed_event_timer.ep = (void *)new_inp; + stcb->asoc.delete_prim_timer.ep = (void *)new_inp; + /* now what about the nets? */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + net->pmtu_timer.ep = (void *)new_inp; + net->rxt_timer.ep = (void *)new_inp; + net->fr_timer.ep = (void *)new_inp; + } + SCTP_INP_WUNLOCK(new_inp); + SCTP_INP_WUNLOCK(old_inp); +} + + + + +/* sctp_ifap is used to bypass normal local address validation checks */ +int +sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, + struct sctp_ifa *sctp_ifap, struct thread *p) +{ + /* bind a ep to a socket address */ + struct sctppcbhead *head; + struct sctp_inpcb *inp, *inp_tmp; + struct inpcb *ip_inp; + int port_reuse_active = 0; + int bindall; + uint16_t lport; + int error; + uint32_t vrf_id; + + lport = 0; + error = 0; + bindall = 1; + inp = (struct sctp_inpcb *)so->so_pcb; + ip_inp = (struct inpcb *)so->so_pcb; +#ifdef SCTP_DEBUG + if (addr) { + SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port:%d\n", + ntohs(((struct sockaddr_in *)addr)->sin_port)); + SCTPDBG(SCTP_DEBUG_PCB1, "Addr :"); + SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); + } +#endif + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { + /* already did a bind, subsequent binds NOT allowed ! */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } +#ifdef INVARIANTS + if (p == NULL) + panic("null proc/thread"); +#endif + if (addr != NULL) { + switch (addr->sa_family) { + case AF_INET: + { + struct sockaddr_in *sin; + + /* IPV6_V6ONLY socket? */ + if (SCTP_IPV6_V6ONLY(ip_inp)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + if (addr->sa_len != sizeof(*sin)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + sin = (struct sockaddr_in *)addr; + lport = sin->sin_port; + /* + * For LOOPBACK the prison_local_ip4() call + * will transmute the ip address to the + * proper value. + */ + if (p && (error = prison_local_ip4(p->td_ucred, &sin->sin_addr)) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); + return (error); + } + if (sin->sin_addr.s_addr != INADDR_ANY) { + bindall = 0; + } + break; + } +#ifdef INET6 + case AF_INET6: + { + /* + * Only for pure IPv6 Address. (No IPv4 + * Mapped!) + */ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)addr; + + if (addr->sa_len != sizeof(*sin6)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + lport = sin6->sin6_port; + + /* + * For LOOPBACK the prison_local_ip6() call + * will transmute the ipv6 address to the + * proper value. 
+ */ + if (p && (error = prison_local_ip6(p->td_ucred, &sin6->sin6_addr, + (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); + return (error); + } + if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + bindall = 0; + /* KAME hack: embed scopeid */ + if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + } + /* this must be cleared for ifa_ifwithaddr() */ + sin6->sin6_scope_id = 0; + break; + } +#endif + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EAFNOSUPPORT); + return (EAFNOSUPPORT); + } + } + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(inp); + /* Setup a vrf_id to be the default for the non-bind-all case. */ + vrf_id = inp->def_vrf_id; + + /* increase our count due to the unlock we do */ + SCTP_INP_INCR_REF(inp); + if (lport) { + /* + * Did the caller specify a port? if so we must see if a ep + * already has this one bound. + */ + /* got to be root to get at low ports */ + if (ntohs(lport) < IPPORT_RESERVED) { + if (p && (error = + priv_check(p, PRIV_NETINET_RESERVEDPORT) + )) { + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return (error); + } + } + if (p == NULL) { + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); + return (error); + } + SCTP_INP_WUNLOCK(inp); + if (bindall) { + vrf_id = inp->def_vrf_id; + inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); + if (inp_tmp != NULL) { + /* + * lock guy returned and lower count note + * that we are not bound so inp_tmp should + * NEVER be inp. And it is this inp + * (inp_tmp) that gets the reference bump, + * so we must lower it. + */ + SCTP_INP_DECR_REF(inp_tmp); + /* unlock info */ + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && + (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { + /* + * Ok, must be one-2-one and + * allowing port re-use + */ + port_reuse_active = 1; + goto continue_anyway; + } + SCTP_INP_DECR_REF(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); + return (EADDRINUSE); + } + } else { + inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); + if (inp_tmp != NULL) { + /* + * lock guy returned and lower count note + * that we are not bound so inp_tmp should + * NEVER be inp. And it is this inp + * (inp_tmp) that gets the reference bump, + * so we must lower it. 
+ */ + SCTP_INP_DECR_REF(inp_tmp); + /* unlock info */ + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && + (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { + /* + * Ok, must be one-2-one and + * allowing port re-use + */ + port_reuse_active = 1; + goto continue_anyway; + } + SCTP_INP_DECR_REF(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); + return (EADDRINUSE); + } + } +continue_anyway: + SCTP_INP_WLOCK(inp); + if (bindall) { + /* verify that no lport is not used by a singleton */ + if ((port_reuse_active == 0) && + (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id)) + ) { + /* Sorry someone already has this one bound */ + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && + (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { + port_reuse_active = 1; + } else { + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); + return (EADDRINUSE); + } + } + } + } else { + uint16_t first, last, candidate; + uint16_t count; + int done; + + if (ip_inp->inp_flags & INP_HIGHPORT) { + first = MODULE_GLOBAL(ipport_hifirstauto); + last = MODULE_GLOBAL(ipport_hilastauto); + } else if (ip_inp->inp_flags & INP_LOWPORT) { + if (p && (error = + priv_check(p, PRIV_NETINET_RESERVEDPORT) + )) { + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); + return (error); + } + first = MODULE_GLOBAL(ipport_lowfirstauto); + last = MODULE_GLOBAL(ipport_lowlastauto); + } else { + first = MODULE_GLOBAL(ipport_firstauto); + last = MODULE_GLOBAL(ipport_lastauto); + } + if (first > last) { + uint16_t temp; + + temp = first; + first = last; + last = temp; + } + count = last - first + 1; /* number of candidates */ + candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count); + + done = 0; + while (!done) { + if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) { + done = 1; + } + if (!done) { + if (--count == 0) { + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); + return (EADDRINUSE); + } + if (candidate == last) + candidate = first; + else + candidate = candidate + 1; + } + } + lport = htons(candidate); + } + SCTP_INP_DECR_REF(inp); + if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | + SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + /* + * this really should not happen. The guy did a non-blocking + * bind and then did a close at the same time. 
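
When no port was supplied, the bind path above picks a random candidate in [first, last] and walks the range with wrap-around until sctp_isport_inuse() finds it free, giving up with EADDRINUSE after one full cycle. The same loop in isolation, as a user-space sketch with the in-use predicate stubbed out:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static bool
port_in_use(uint16_t port)
{
	return (port == 5000);	/* toy stand-in for sctp_isport_inuse() */
}

/* Returns a free port in [first, last], or 0 if the whole range is
 * occupied; mirrors the candidate/count/wrap logic above. */
static uint16_t
pick_ephemeral(uint16_t first, uint16_t last)
{
	uint16_t count, candidate;

	if (first > last) {	/* the kernel swaps them, same effect */
		uint16_t t = first; first = last; last = t;
	}
	count = (uint16_t)(last - first + 1);
	candidate = (uint16_t)(first + (rand() % count));
	while (count-- > 0) {
		if (!port_in_use(candidate))
			return (candidate);
		/* wrap from last back to first, as above */
		candidate = (candidate == last) ? first : (uint16_t)(candidate + 1);
	}
	return (0);	/* caller maps this to EADDRINUSE */
}
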
+ */ + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + /* ok we look clear to give out this port, so lets setup the binding */ + if (bindall) { + /* binding to all addresses, so just set in the proper flags */ + inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL; + /* set the automatic addr changes from kernel flag */ + if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF); + sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); + } else { + sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); + sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); + } + if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); + } else { + sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); + } + /* + * set the automatic mobility_base from kernel flag (by + * micchie) + */ + if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) { + sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE); + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + } else { + sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE); + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + } + /* + * set the automatic mobility_fasthandoff from kernel flag + * (by micchie) + */ + if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) { + sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF); + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + } else { + sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF); + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + } + } else { + /* + * bind specific, make sure flags is off and add a new + * address structure to the sctp_addr_list inside the ep + * structure. + * + * We will need to allocate one and insert it at the head. The + * socketopt call can just insert new addresses in there as + * well. It will also have to do the embed scope kame hack + * too (before adding). + */ + struct sctp_ifa *ifa; + struct sockaddr_storage store_sa; + + memset(&store_sa, 0, sizeof(store_sa)); + if (addr->sa_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&store_sa; + memcpy(sin, addr, sizeof(struct sockaddr_in)); + sin->sin_port = 0; + } else if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&store_sa; + memcpy(sin6, addr, sizeof(struct sockaddr_in6)); + sin6->sin6_port = 0; + } + /* + * first find the interface with the bound address need to + * zero out the port to find the address! yuck! can't do + * this earlier since need port for sctp_pcb_findep() + */ + if (sctp_ifap != NULL) + ifa = sctp_ifap; + else { + /* + * Note for BSD we hit here always other O/S's will + * pass things in via the sctp_ifap argument + * (Panda). + */ + ifa = sctp_find_ifa_by_addr((struct sockaddr *)&store_sa, + vrf_id, SCTP_ADDR_NOT_LOCKED); + } + if (ifa == NULL) { + /* Can't find an interface with that address */ + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRNOTAVAIL); + return (EADDRNOTAVAIL); + } + if (addr->sa_family == AF_INET6) { + /* GAK, more FIXME IFA lock? */ + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + /* Can't bind a non-existent addr. 
*/ + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + } + /* we're not bound all */ + inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL; + /* allow bindx() to send ASCONF's for binding changes */ + sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); + /* clear automatic addr changes from kernel flag */ + sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); + + /* add this address to the endpoint list */ + error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0); + if (error != 0) { + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return (error); + } + inp->laddr_count++; + } + /* find the bucket */ + if (port_reuse_active) { + /* Put it into tcp 1-2-1 hash */ + head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))]; + inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; + } else { + head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; + } + /* put it in the bucket */ + LIST_INSERT_HEAD(head, inp, sctp_hash); + SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n", + head, ntohs(lport), port_reuse_active); + /* set in the port */ + inp->sctp_lport = lport; + + /* turn off just the unbound flag */ + inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return (0); +} + + +static void +sctp_iterator_inp_being_freed(struct sctp_inpcb *inp) +{ + struct sctp_iterator *it, *nit; + + /* + * We enter with the only the ITERATOR_LOCK in place and a write + * lock on the inp_info stuff. + */ + it = sctp_it_ctl.cur_it; + if (it && (it->vn != curvnet)) { + /* Its not looking at our VNET */ + return; + } + if (it && (it->inp == inp)) { + /* + * This is tricky and we hold the iterator lock, but when it + * returns and gets the lock (when we release it) the + * iterator will try to operate on inp. We need to stop that + * from happening. But of course the iterator has a + * reference on the stcb and inp. We can mark it and it will + * stop. + * + * If its a single iterator situation, we set the end iterator + * flag. Otherwise we set the iterator to go to the next + * inp. + * + */ + if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { + sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; + } else { + sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP; + } + } + /* + * Now go through and remove any single reference to our inp that + * may be still pending on the list + */ + SCTP_IPI_ITERATOR_WQ_LOCK(); + it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead); + while (it) { + nit = TAILQ_NEXT(it, sctp_nxt_itr); + if (it->vn != curvnet) { + it = nit; + continue; + } + if (it->inp == inp) { + /* This one points to me is it inp specific? */ + if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { + /* Remove and free this one */ + TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, + it, sctp_nxt_itr); + if (it->function_atend != NULL) { + (*it->function_atend) (it->pointer, it->val); + } + SCTP_FREE(it, SCTP_M_ITER); + } else { + it->inp = LIST_NEXT(it->inp, sctp_list); + if (it->inp) { + SCTP_INP_INCR_REF(it->inp); + } + } + /* + * When its put in the refcnt is incremented so decr + * it + */ + SCTP_INP_DECR_REF(inp); + } + it = nit; + } + SCTP_IPI_ITERATOR_WQ_UNLOCK(); +} + +/* release sctp_inpcb unbind the port */ +void +sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) +{ + /* + * Here we free a endpoint. 
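
sctp_iterator_inp_being_freed() above walks the iterator work queue with a manually saved next pointer (it/nit) because the current element may be unlinked and freed mid-walk; this is exactly the pattern FreeBSD's TAILQ_FOREACH_SAFE() packages up. A reduced sketch:

#include <stdlib.h>
#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
	int doomed;
};
TAILQ_HEAD(itemhead, item);

/* Remove (and free) matching entries while walking: the next pointer
 * is captured before the current node can disappear, like the it/nit
 * dance in the function above. */
static void
purge(struct itemhead *head)
{
	struct item *it, *nit;

	TAILQ_FOREACH_SAFE(it, head, link, nit) {
		if (it->doomed) {
			TAILQ_REMOVE(head, it, link);
			free(it);
		}
	}
}
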
We must find it (if it is in the Hash + * table) and remove it from there. Then we must also find it in the + * overall list and remove it from there. After all removals are + * complete then any timer has to be stopped. Then start the actual + * freeing. a) Any local lists. b) Any associations. c) The hash of + * all associations. d) finally the ep itself. + */ + struct sctp_pcb *m; + struct sctp_tcb *asoc, *nasoc; + struct sctp_laddr *laddr, *nladdr; + struct inpcb *ip_pcb; + struct socket *so; + int being_refed = 0; + struct sctp_queued_to_read *sq; + + + int cnt; + sctp_sharedkey_t *shared_key; + + +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 0); +#endif + SCTP_ITERATOR_LOCK(); + /* mark any iterators on the list or being processed */ + sctp_iterator_inp_being_freed(inp); + SCTP_ITERATOR_UNLOCK(); + so = inp->sctp_socket; + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + /* been here before.. eeks.. get out of here */ + SCTP_PRINTF("This conflict in free SHOULD not be happening! from %d, imm %d\n", from, immediate); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 1); +#endif + return; + } + SCTP_ASOC_CREATE_LOCK(inp); + SCTP_INP_INFO_WLOCK(); + + SCTP_INP_WLOCK(inp); + if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP; + /* socket is gone, so no more wakeups allowed */ + inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE; + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; + + } + /* First time through we have the socket lock, after that no more. */ + sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL, + SCTP_FROM_SCTP_PCB + SCTP_LOC_1); + + if (inp->control) { + sctp_m_freem(inp->control); + inp->control = NULL; + } + if (inp->pkt) { + sctp_m_freem(inp->pkt); + inp->pkt = NULL; + } + m = &inp->sctp_ep; + ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer + * here but I will be nice :> (i.e. + * ip_pcb = ep;) */ + if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { + int cnt_in_sd; + + cnt_in_sd = 0; + for ((asoc = LIST_FIRST(&inp->sctp_asoc_list)); asoc != NULL; + asoc = nasoc) { + SCTP_TCB_LOCK(asoc); + nasoc = LIST_NEXT(asoc, sctp_tcblist); + if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* Skip guys being freed */ + cnt_in_sd++; + if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { + /* + * Special case - we did not start a + * kill timer on the asoc due to it + * was not closed. So go ahead and + * start it now. + */ + asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); + } + SCTP_TCB_UNLOCK(asoc); + continue; + } + if (((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_WAIT) || + (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_COOKIE_ECHOED)) && + (asoc->asoc.total_output_queue_size == 0)) { + /* + * If we have data in queue, we don't want + * to just free since the app may have done, + * send()/close or connect/send/close. And + * it wants the data to get across first. 
+ */ + /* Just abandon things in the front states */ + if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, + SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) { + cnt_in_sd++; + } + continue; + } + /* Disconnect the socket please */ + asoc->sctp_socket = NULL; + asoc->asoc.state |= SCTP_STATE_CLOSED_SOCKET; + if ((asoc->asoc.size_on_reasm_queue > 0) || + (asoc->asoc.control_pdapi) || + (asoc->asoc.size_on_all_streams > 0) || + (so && (so->so_rcv.sb_cc > 0)) + ) { + /* Left with Data unread */ + struct mbuf *op_err; + + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + /* Fill in the user initiated abort */ + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(op_err) = + sizeof(struct sctp_paramhdr) + sizeof(uint32_t); + ph = mtod(op_err, + struct sctp_paramhdr *); + ph->param_type = htons( + SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_3); + } + asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; +#if defined(SCTP_PANIC_ON_ABORT) + panic("inpcb_free does an abort"); +#endif + sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + if (sctp_free_assoc(inp, asoc, + SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) { + cnt_in_sd++; + } + continue; + } else if (TAILQ_EMPTY(&asoc->asoc.send_queue) && + TAILQ_EMPTY(&asoc->asoc.sent_queue) && + (asoc->asoc.stream_queue_cnt == 0) + ) { + if (asoc->asoc.locked_on_sending) { + goto abort_anyway; + } + if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* + * there is nothing queued to send, + * so I send shutdown + */ + sctp_send_shutdown(asoc, asoc->asoc.primary_destination); + if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(&asoc->asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, asoc->sctp_ep, asoc, + asoc->asoc.primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, + asoc->asoc.primary_destination); + sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED); + } + } else { + /* mark into shutdown pending */ + struct sctp_stream_queue_pending *sp; + + asoc->asoc.state |= SCTP_STATE_SHUTDOWN_PENDING; + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, + asoc->asoc.primary_destination); + if (asoc->asoc.locked_on_sending) { + sp = TAILQ_LAST(&((asoc->asoc.locked_on_sending)->outqueue), + sctp_streamhead); + if (sp == NULL) { + SCTP_PRINTF("Error, sp is NULL, locked on sending is %p strm:%d\n", + asoc->asoc.locked_on_sending, + asoc->asoc.locked_on_sending->stream_no); + } else { + if ((sp->length == 0) && (sp->msg_is_complete == 0)) + asoc->asoc.state |= SCTP_STATE_PARTIAL_MSG_LEFT; + } + } + if (TAILQ_EMPTY(&asoc->asoc.send_queue) && + TAILQ_EMPTY(&asoc->asoc.sent_queue) && + (asoc->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + struct mbuf *op_err; + + abort_anyway: + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + 
sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + /* + * Fill in the user + * initiated abort + */ + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(op_err) = + (sizeof(struct sctp_paramhdr) + + sizeof(uint32_t)); + ph = mtod(op_err, + struct sctp_paramhdr *); + ph->param_type = htons( + SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_5); + } + asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; +#if defined(SCTP_PANIC_ON_ABORT) + panic("inpcb_free does an abort"); +#endif + + sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + if (sctp_free_assoc(inp, asoc, + SCTP_PCBFREE_NOFORCE, + SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) { + cnt_in_sd++; + } + continue; + } else { + sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); + } + } + cnt_in_sd++; + SCTP_TCB_UNLOCK(asoc); + } + /* now is there some left in our SHUTDOWN state? */ + if (cnt_in_sd) { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 2); +#endif + inp->sctp_socket = NULL; + SCTP_INP_WUNLOCK(inp); + SCTP_ASOC_CREATE_UNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return; + } + } + inp->sctp_socket = NULL; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) != + SCTP_PCB_FLAGS_UNBOUND) { + /* + * ok, this guy has been bound. It's port is somewhere in + * the SCTP_BASE_INFO(hash table). Remove it! + */ + LIST_REMOVE(inp, sctp_hash); + inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; + } + /* + * If there is a timer running to kill us, forget it, since it may + * have a contest on the INP lock.. which would cause us to die ... 
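
The op_err blocks above all build the same wire image: an error-cause header carrying SCTP_CAUSE_USER_INITIATED_ABT followed by one 32-bit diagnostic word that records where in sctp_pcb.c the abort originated (SCTP_FROM_SCTP_PCB + SCTP_LOC_n). Flattened out of the mbuf machinery, the construction looks like this sketch (cause code 12, 0x000c, per RFC 4960; the kernel writes the same bytes into an mbuf instead):

#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

struct err_cause {	/* error-cause header, same shape as sctp_paramhdr */
	uint16_t code;		/* e.g. user-initiated abort, 0x000c */
	uint16_t length;	/* header + payload, network order */
};

/* Fills buf (>= 8 bytes) with the cause used above and returns its
 * length; "where" plays the role of SCTP_FROM_SCTP_PCB + SCTP_LOC_n. */
static size_t
build_user_abort_cause(uint8_t *buf, uint32_t where)
{
	struct err_cause ec;
	uint32_t diag = htonl(where);

	ec.code = htons(0x000c);
	ec.length = htons(sizeof(ec) + sizeof(diag));
	memcpy(buf, &ec, sizeof(ec));
	memcpy(buf + sizeof(ec), &diag, sizeof(diag));
	return (sizeof(ec) + sizeof(diag));
}
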
+ */ + cnt = 0; + for ((asoc = LIST_FIRST(&inp->sctp_asoc_list)); asoc != NULL; + asoc = nasoc) { + SCTP_TCB_LOCK(asoc); + nasoc = LIST_NEXT(asoc, sctp_tcblist); + if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { + asoc->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); + } + cnt++; + SCTP_TCB_UNLOCK(asoc); + continue; + } + /* Free associations that are NOT killing us */ + if ((SCTP_GET_STATE(&asoc->asoc) != SCTP_STATE_COOKIE_WAIT) && + ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { + struct mbuf *op_err; + uint32_t *ippp; + + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + /* Fill in the user initiated abort */ + struct sctp_paramhdr *ph; + + SCTP_BUF_LEN(op_err) = (sizeof(struct sctp_paramhdr) + + sizeof(uint32_t)); + ph = mtod(op_err, struct sctp_paramhdr *); + ph->param_type = htons( + SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_PCB + SCTP_LOC_7); + + } + asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7; +#if defined(SCTP_PANIC_ON_ABORT) + panic("inpcb_free does an abort"); +#endif + sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + } else if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + cnt++; + SCTP_TCB_UNLOCK(asoc); + continue; + } + if ((SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&asoc->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { + cnt++; + } + } + if (cnt) { + /* Ok we have someone out there that will kill us */ + (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 3); +#endif + SCTP_INP_WUNLOCK(inp); + SCTP_ASOC_CREATE_UNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return; + } +#ifndef __rtems__ + if (SCTP_INP_LOCK_CONTENDED(inp)) + being_refed++; + if (SCTP_INP_READ_CONTENDED(inp)) + being_refed++; + if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp)) + being_refed++; +#endif + + if ((inp->refcount) || + (being_refed) || + (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) { + (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 4); +#endif + sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL); + SCTP_INP_WUNLOCK(inp); + SCTP_ASOC_CREATE_UNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + return; + } + inp->sctp_ep.signature_change.type = 0; + inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE; + /* + * Remove it from the list .. last thing we need a lock for. + */ + LIST_REMOVE(inp, sctp_list); + SCTP_INP_WUNLOCK(inp); + SCTP_ASOC_CREATE_UNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + /* + * Now we release all locks. Since this INP cannot be found anymore + * except possibly by the kill timer that might be running. We call + * the drain function here. It should hit the case were it sees the + * ACTIVE flag cleared and exit out freeing us to proceed and + * destroy everything. 
+ */ + if (from != SCTP_CALLED_FROM_INPKILL_TIMER) { + (void)SCTP_OS_TIMER_STOP_DRAIN(&inp->sctp_ep.signature_change.timer); + } else { + /* Probably un-needed */ + (void)SCTP_OS_TIMER_STOP(&inp->sctp_ep.signature_change.timer); + } + +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 5); +#endif + + + if ((inp->sctp_asocidhash) != NULL) { + SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark); + inp->sctp_asocidhash = NULL; + } + /* sa_ignore FREED_MEMORY */ + while ((sq = TAILQ_FIRST(&inp->read_queue)) != NULL) { + /* Its only abandoned if it had data left */ + if (sq->length) + SCTP_STAT_INCR(sctps_left_abandon); + + TAILQ_REMOVE(&inp->read_queue, sq, next); + sctp_free_remote_addr(sq->whoFrom); + if (so) + so->so_rcv.sb_cc -= sq->length; + if (sq->data) { + sctp_m_freem(sq->data); + sq->data = NULL; + } + /* + * no need to free the net count, since at this point all + * assoc's are gone. + */ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq); + SCTP_DECR_READQ_COUNT(); + } + /* Now the sctp_pcb things */ + /* + * free each asoc if it is not already closed/free. we can't use the + * macro here since le_next will get freed as part of the + * sctp_free_assoc() call. + */ + cnt = 0; + if (so) { +#ifdef IPSEC + ipsec_delete_pcbpolicy(ip_pcb); +#endif /* IPSEC */ + + /* Unlocks not needed since the socket is gone now */ + } + if (ip_pcb->inp_options) { + (void)sctp_m_free(ip_pcb->inp_options); + ip_pcb->inp_options = 0; + } + if (ip_pcb->inp_moptions) { + inp_freemoptions(ip_pcb->inp_moptions); + ip_pcb->inp_moptions = 0; + } +#ifdef INET6 + if (ip_pcb->inp_vflag & INP_IPV6) { + struct in6pcb *in6p; + + in6p = (struct in6pcb *)inp; + ip6_freepcbopts(in6p->in6p_outputopts); + } +#endif /* INET6 */ + ip_pcb->inp_vflag = 0; + /* free up authentication fields */ + if (inp->sctp_ep.local_auth_chunks != NULL) + sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); + if (inp->sctp_ep.local_hmacs != NULL) + sctp_free_hmaclist(inp->sctp_ep.local_hmacs); + + shared_key = LIST_FIRST(&inp->sctp_ep.shared_keys); + while (shared_key) { + LIST_REMOVE(shared_key, next); + sctp_free_sharedkey(shared_key); + /* sa_ignore FREED_MEMORY */ + shared_key = LIST_FIRST(&inp->sctp_ep.shared_keys); + } + + /* + * if we have an address list the following will free the list of + * ifaddr's that are set into this ep. Again macro limitations here, + * since the LIST_FOREACH could be a bad idea. + */ + for ((laddr = LIST_FIRST(&inp->sctp_addr_list)); laddr != NULL; + laddr = nladdr) { + nladdr = LIST_NEXT(laddr, sctp_nxt_addr); + sctp_remove_laddr(laddr); + } + +#ifdef SCTP_TRACK_FREED_ASOCS + /* TEMP CODE */ + for ((asoc = LIST_FIRST(&inp->sctp_asoc_free_list)); asoc != NULL; + asoc = nasoc) { + nasoc = LIST_NEXT(asoc, sctp_tcblist); + LIST_REMOVE(asoc, sctp_tcblist); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), asoc); + SCTP_DECR_ASOC_COUNT(); + } + /* *** END TEMP CODE *** */ +#endif + /* Now lets see about freeing the EP hash table. 
*/ + if (inp->sctp_tcbhash != NULL) { + SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark); + inp->sctp_tcbhash = NULL; + } + /* Now we must put the ep memory back into the zone pool */ + INP_LOCK_DESTROY(&inp->ip_inp.inp); + SCTP_INP_LOCK_DESTROY(inp); + SCTP_INP_READ_DESTROY(inp); + SCTP_ASOC_CREATE_LOCK_DESTROY(inp); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); + SCTP_DECR_EP_COUNT(); +} + + +struct sctp_nets * +sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr) +{ + struct sctp_nets *net; + + /* locate the address */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr)) + return (net); + } + return (NULL); +} + + +int +sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) +{ + struct sctp_ifa *sctp_ifa; + + sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED); + if (sctp_ifa) { + return (1); + } else { + return (0); + } +} + +/* + * add's a remote endpoint address, done with the INIT/INIT-ACK as well as + * when a ASCONF arrives that adds it. It will also initialize all the cwnd + * stats of stuff. + */ +int +sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, + int set_scope, int from) +{ + /* + * The following is redundant to the same lines in the + * sctp_aloc_assoc() but is needed since others call the add address + * function + */ + struct sctp_nets *net, *netfirst; + int addr_inscope; + + SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ", + from); + SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr); + + netfirst = sctp_findnet(stcb, newaddr); + if (netfirst) { + /* + * Lie and return ok, we don't want to make the association + * go away for this behavior. It will happen in the TCP + * model in a connected socket. It does not reach the hash + * table until after the association is built so it can't be + * found. Mark as reachable, since the initial creation will + * have been cleared and the NOT_IN_ASSOC flag will have + * been added... and we don't want to end up removing it + * back out. 
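The dest_state update that follows preserves the UNCONFIRMED bit while forcing the already-known address back to REACHABLE. The same flag surgery in isolation; the flag values below are illustrative stand-ins, not the real ones from sctp_constants.h:

#include <assert.h>
#include <stdint.h>

#define ADDR_REACHABLE   0x001	/* illustrative value */
#define ADDR_UNCONFIRMED 0x200	/* illustrative value */

/*
 * Mark an address reachable without losing the fact that it
 * has not yet been confirmed by the peer.
 */
static uint32_t
mark_reachable(uint32_t dest_state)
{
	if (dest_state & ADDR_UNCONFIRMED)
		return (ADDR_REACHABLE | ADDR_UNCONFIRMED);
	return (ADDR_REACHABLE);
}

int
main(void)
{
	assert(mark_reachable(ADDR_UNCONFIRMED) ==
	    (ADDR_REACHABLE | ADDR_UNCONFIRMED));
	assert(mark_reachable(0) == ADDR_REACHABLE);
	return (0);
}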
+ */ + if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) { + netfirst->dest_state = (SCTP_ADDR_REACHABLE | + SCTP_ADDR_UNCONFIRMED); + } else { + netfirst->dest_state = SCTP_ADDR_REACHABLE; + } + + return (0); + } + addr_inscope = 1; + if (newaddr->sa_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)newaddr; + if (sin->sin_addr.s_addr == 0) { + /* Invalid address */ + return (-1); + } + /* zero out the bzero area */ + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + + /* assure len is set */ + sin->sin_len = sizeof(struct sockaddr_in); + if (set_scope) { +#ifdef SCTP_DONT_DO_PRIVADDR_SCOPE + stcb->ipv4_local_scope = 1; +#else + if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { + stcb->asoc.ipv4_local_scope = 1; + } +#endif /* SCTP_DONT_DO_PRIVADDR_SCOPE */ + } else { + /* Validate the address is in scope */ + if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && + (stcb->asoc.ipv4_local_scope == 0)) { + addr_inscope = 0; + } + } +#ifdef INET6 + } else if (newaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)newaddr; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + /* Invalid address */ + return (-1); + } + /* assure len is set */ + sin6->sin6_len = sizeof(struct sockaddr_in6); + if (set_scope) { + if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) { + stcb->asoc.loopback_scope = 1; + stcb->asoc.local_scope = 0; + stcb->asoc.ipv4_local_scope = 1; + stcb->asoc.site_scope = 1; + } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + /* + * If the new destination is a LINK_LOCAL we + * must have common site scope. Don't set + * the local scope since we may not share + * all links, only loopback can do this. + * Links on the local network would also be + * on our private network for v4 too. + */ + stcb->asoc.ipv4_local_scope = 1; + stcb->asoc.site_scope = 1; + } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { + /* + * If the new destination is SITE_LOCAL then + * we must have site scope in common. 
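IN4_ISPRIVATE_ADDRESS() used above tests for the RFC 1918 private ranges when deciding ipv4_local_scope. A freestanding userland equivalent (hypothetical helper name, address taken in host byte order):

#include <arpa/inet.h>
#include <sys/socket.h>
#include <stdint.h>
#include <stdio.h>

/* RFC 1918: 10/8, 172.16/12, 192.168/16. */
static int
is_private_ipv4(uint32_t a)
{
	return ((a & 0xff000000) == 0x0a000000 ||
	    (a & 0xfff00000) == 0xac100000 ||
	    (a & 0xffff0000) == 0xc0a80000);
}

int
main(void)
{
	struct in_addr ia;

	inet_pton(AF_INET, "192.168.1.5", &ia);
	printf("%d\n", is_private_ipv4(ntohl(ia.s_addr)));	/* 1 */
	inet_pton(AF_INET, "8.8.8.8", &ia);
	printf("%d\n", is_private_ipv4(ntohl(ia.s_addr)));	/* 0 */
	return (0);
}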
+ */ + stcb->asoc.site_scope = 1; + } + } else { + /* Validate the address is in scope */ + if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) && + (stcb->asoc.loopback_scope == 0)) { + addr_inscope = 0; + } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && + (stcb->asoc.local_scope == 0)) { + addr_inscope = 0; + } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && + (stcb->asoc.site_scope == 0)) { + addr_inscope = 0; + } + } +#endif + } else { + /* not supported family type */ + return (-1); + } + net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets); + if (net == NULL) { + return (-1); + } + SCTP_INCR_RADDR_COUNT(); + bzero(net, sizeof(*net)); + (void)SCTP_GETTIME_TIMEVAL(&net->start_time); + memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len); + if (newaddr->sa_family == AF_INET) { + ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport; + } else if (newaddr->sa_family == AF_INET6) { + ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport; + } + net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id); + if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) { + stcb->asoc.loopback_scope = 1; + stcb->asoc.ipv4_local_scope = 1; + stcb->asoc.local_scope = 0; + stcb->asoc.site_scope = 1; + addr_inscope = 1; + } + net->failure_threshold = stcb->asoc.def_net_failure; + if (addr_inscope == 0) { + net->dest_state = (SCTP_ADDR_REACHABLE | + SCTP_ADDR_OUT_OF_SCOPE); + } else { + if (from == SCTP_ADDR_IS_CONFIRMED) + /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */ + net->dest_state = SCTP_ADDR_REACHABLE; + else + net->dest_state = SCTP_ADDR_REACHABLE | + SCTP_ADDR_UNCONFIRMED; + } + /* + * We set this to 0, the timer code knows that this means its an + * initial value + */ + net->RTO = 0; + net->RTO_measured = 0; + stcb->asoc.numnets++; + *(&net->ref_count) = 1; + net->tos_flowlabel = 0; + if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable)) { + net->port = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); + } else { + net->port = 0; + } +#ifdef INET + if (newaddr->sa_family == AF_INET) + net->tos_flowlabel = stcb->asoc.default_tos; +#endif +#ifdef INET6 + if (newaddr->sa_family == AF_INET6) + net->tos_flowlabel = stcb->asoc.default_flowlabel; +#endif + /* Init the timer structure */ + SCTP_OS_TIMER_INIT(&net->rxt_timer.timer); + SCTP_OS_TIMER_INIT(&net->fr_timer.timer); + SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer); + + /* Now generate a route for this guy */ +#ifdef INET6 + /* KAME hack: embed scopeid */ + if (newaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); + sin6->sin6_scope_id = 0; + } +#endif + SCTP_RTALLOC((sctp_route_t *) & net->ro, stcb->asoc.vrf_id); + + if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { + /* Get source address */ + net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep, + stcb, + (sctp_route_t *) & net->ro, + net, + 0, + stcb->asoc.vrf_id); + /* Now get the interface MTU */ + if (net->ro._s_addr && net->ro._s_addr->ifn_p) { + net->mtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p); + } else { + net->mtu = 0; + } + if (net->mtu == 0) { + /* Huh ?? */ + net->mtu = SCTP_DEFAULT_MTU; + } else { + uint32_t rmtu; + + rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt); + if (rmtu == 0) { + /* + * Start things off to match mtu of + * interface please. 
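The MTU logic around this point prefers a route MTU over the interface MTU, since the route may leave through a different link (or the loopback), and falls back to a default when neither is known; the result is then folded into the association's smallest_mtu. Condensed into one hypothetical helper:

#include <assert.h>
#include <stdint.h>

#define DEFAULT_MTU 1500	/* stand-in for SCTP_DEFAULT_MTU */

static uint32_t
choose_mtu(uint32_t if_mtu, uint32_t route_mtu)
{
	if (route_mtu != 0)
		return (route_mtu);	/* route knows best */
	if (if_mtu != 0)
		return (if_mtu);	/* else trust the interface */
	return (DEFAULT_MTU);		/* last resort */
}

int
main(void)
{
	assert(choose_mtu(9000, 1500) == 1500);
	assert(choose_mtu(9000, 0) == 9000);
	assert(choose_mtu(0, 0) == DEFAULT_MTU);
	return (0);
}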
+ */ + SCTP_SET_MTU_OF_ROUTE(&net->ro._l_addr.sa, + net->ro.ro_rt, net->mtu); + } else { + /* + * we take the route mtu over the interface, + * since the route may be leading out the + * loopback, or a different interface. + */ + net->mtu = rmtu; + } + } + if (from == SCTP_ALLOC_ASOC) { + stcb->asoc.smallest_mtu = net->mtu; + } + } else { + net->mtu = stcb->asoc.smallest_mtu; + } +#ifdef INET6 + if (newaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; + (void)sa6_recoverscope(sin6); + } +#endif + if (net->port) { + net->mtu -= sizeof(struct udphdr); + } + if (stcb->asoc.smallest_mtu > net->mtu) { + stcb->asoc.smallest_mtu = net->mtu; + } + /* JRS - Use the congestion control given in the CC module */ + stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net); + + /* + * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning + * of assoc (2005/06/27, iyengar@cis.udel.edu) + */ + net->find_pseudo_cumack = 1; + net->find_rtx_pseudo_cumack = 1; + net->src_addr_selected = 0; + netfirst = TAILQ_FIRST(&stcb->asoc.nets); + if (net->ro.ro_rt == NULL) { + /* Since we have no route put it at the back */ + TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); + } else if (netfirst == NULL) { + /* We are the first one in the pool. */ + TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); + } else if (netfirst->ro.ro_rt == NULL) { + /* + * First one has NO route. Place this one ahead of the first + * one. + */ + TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); + } else if (net->ro.ro_rt->rt_ifp != netfirst->ro.ro_rt->rt_ifp) { + /* + * This one has a different interface than the one at the + * top of the list. Place it ahead. + */ + TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); + } else { + /* + * Ok we have the same interface as the first one. Move + * forward until we find either a) one with a NULL route... + * insert ahead of that b) one with a different ifp.. insert + * after that. c) end of the list.. insert at the tail. + */ + struct sctp_nets *netlook; + + do { + netlook = TAILQ_NEXT(netfirst, sctp_next); + if (netlook == NULL) { + /* End of the list */ + TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); + break; + } else if (netlook->ro.ro_rt == NULL) { + /* next one has NO route */ + TAILQ_INSERT_BEFORE(netfirst, net, sctp_next); + break; + } else if (netlook->ro.ro_rt->rt_ifp != net->ro.ro_rt->rt_ifp) { + TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook, + net, sctp_next); + break; + } + /* Shift forward */ + netfirst = netlook; + } while (netlook != NULL); + } + + /* got to have a primary set */ + if (stcb->asoc.primary_destination == 0) { + stcb->asoc.primary_destination = net; + } else if ((stcb->asoc.primary_destination->ro.ro_rt == NULL) && + (net->ro.ro_rt) && + ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) { + /* No route to current primary adopt new primary */ + stcb->asoc.primary_destination = net; + } + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, + net); + /* Validate primary is first */ + net = TAILQ_FIRST(&stcb->asoc.nets); + if ((net != stcb->asoc.primary_destination) && + (stcb->asoc.primary_destination)) { + /* + * first one on the list is NOT the primary sctp_cmpaddr() + * is much more efficient if the primary is the first on the + * list, make it so. 
+ */ + TAILQ_REMOVE(&stcb->asoc.nets, + stcb->asoc.primary_destination, sctp_next); + TAILQ_INSERT_HEAD(&stcb->asoc.nets, + stcb->asoc.primary_destination, sctp_next); + } + return (0); +} + + +static uint32_t +sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb) +{ + uint32_t id; + struct sctpasochead *head; + struct sctp_tcb *lstcb; + + SCTP_INP_WLOCK(inp); +try_again: + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + /* TSNH */ + SCTP_INP_WUNLOCK(inp); + return (0); + } + /* + * We don't allow assoc id to be 0, this is needed otherwise if the + * id were to wrap we would have issues with some socket options. + */ + if (inp->sctp_associd_counter == 0) { + inp->sctp_associd_counter++; + } + id = inp->sctp_associd_counter; + inp->sctp_associd_counter++; + lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t) id, 0); + if (lstcb) { + goto try_again; + } + head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; + LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash); + stcb->asoc.in_asocid_hash = 1; + SCTP_INP_WUNLOCK(inp); + return id; +} + +/* + * allocate an association and add it to the endpoint. The caller must be + * careful to add all additional addresses once they are know right away or + * else the assoc will be may experience a blackout scenario. + */ +struct sctp_tcb * +sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, + int *error, uint32_t override_tag, uint32_t vrf_id, + struct thread *p +) +{ + /* note the p argument is only valid in unbound sockets */ + + struct sctp_tcb *stcb; + struct sctp_association *asoc; + struct sctpasochead *head; + uint16_t rport; + int err; + + /* + * Assumption made here: Caller has done a + * sctp_findassociation_ep_addr(ep, addr's); to make sure the + * address does not exist already. + */ + if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) { + /* Hit max assoc, sorry no more */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); + *error = ENOBUFS; + return (NULL); + } + if (firstaddr == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + SCTP_INP_RLOCK(inp); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && + ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) || + (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { + /* + * If its in the TCP pool, its NOT allowed to create an + * association. The parent listener needs to call + * sctp_aloc_assoc.. or the one-2-many socket. If a peeled + * off, or connected one does this.. its an error. 
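sctp_aloc_a_assoc_id() above hands out a monotonically increasing identifier, skipping 0 (reserved so socket options can signal "no association") and probing for collisions once the counter wraps. The core of that scheme, with a stub in_use() standing in for sctp_findasoc_ep_asocid_locked():

#include <stdint.h>
#include <stdio.h>

static uint32_t counter;

/* Pretend lookup; the real code probes a hash table. */
static int
in_use(uint32_t id)
{
	return (0);
}

static uint32_t
alloc_id(void)
{
	uint32_t id;

	do {
		if (counter == 0)	/* 0 is reserved */
			counter++;
		id = counter++;
	} while (in_use(id));		/* retry after wrap collisions */
	return (id);
}

int
main(void)
{
	printf("%u %u\n", alloc_id(), alloc_id());	/* 1 2 */
	return (0);
}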
+ */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:"); +#ifdef SCTP_DEBUG + if (firstaddr) { + SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr); + SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", + ntohs(((struct sockaddr_in *)firstaddr)->sin_port)); + } else { + SCTPDBG(SCTP_DEBUG_PCB3, "None\n"); + } +#endif /* SCTP_DEBUG */ + if (firstaddr->sa_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)firstaddr; + if ((sin->sin_port == 0) || (sin->sin_addr.s_addr == 0)) { + /* Invalid address */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + rport = sin->sin_port; + } else if (firstaddr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)firstaddr; + if ((sin6->sin6_port == 0) || + (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))) { + /* Invalid address */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + rport = sin6->sin6_port; + } else { + /* not supported family type */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + SCTP_INP_RUNLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { + /* + * If you have not performed a bind, then we need to do the + * ephemeral bind for you. + */ + if ((err = sctp_inpcb_bind(inp->sctp_socket, + (struct sockaddr *)NULL, + (struct sctp_ifa *)NULL, + p + ))) { + /* bind error, probably perm */ + *error = err; + return (NULL); + } + } + stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb); + if (stcb == NULL) { + /* out of memory? 
*/ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); + *error = ENOMEM; + return (NULL); + } + SCTP_INCR_ASOC_COUNT(); + + bzero(stcb, sizeof(*stcb)); + asoc = &stcb->asoc; + + asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); + SCTP_TCB_LOCK_INIT(stcb); + SCTP_TCB_SEND_LOCK_INIT(stcb); + stcb->rport = rport; + /* setup back pointer's */ + stcb->sctp_ep = inp; + stcb->sctp_socket = inp->sctp_socket; + if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id))) { + /* failed */ + SCTP_TCB_LOCK_DESTROY(stcb); + SCTP_TCB_SEND_LOCK_DESTROY(stcb); + LIST_REMOVE(stcb, sctp_tcbasocidhash); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); + SCTP_DECR_ASOC_COUNT(); + *error = err; + return (NULL); + } + /* and the port */ + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(inp); + if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + /* inpcb freed while alloc going on */ + SCTP_TCB_LOCK_DESTROY(stcb); + SCTP_TCB_SEND_LOCK_DESTROY(stcb); + LIST_REMOVE(stcb, sctp_tcbasocidhash); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); + SCTP_INP_WUNLOCK(inp); + SCTP_INP_INFO_WUNLOCK(); + SCTP_DECR_ASOC_COUNT(); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + *error = EINVAL; + return (NULL); + } + SCTP_TCB_LOCK(stcb); + + /* now that my_vtag is set, add it to the hash */ + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; + /* put it in the bucket in the vtag hash of assoc's for the system */ + LIST_INSERT_HEAD(head, stcb, sctp_asocs); + SCTP_INP_INFO_WUNLOCK(); + + if ((err = sctp_add_remote_addr(stcb, firstaddr, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) { + /* failure.. memory error? */ + if (asoc->strmout) { + SCTP_FREE(asoc->strmout, SCTP_M_STRMO); + asoc->strmout = NULL; + } + if (asoc->mapping_array) { + SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); + asoc->mapping_array = NULL; + } + if (asoc->nr_mapping_array) { + SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); + asoc->nr_mapping_array = NULL; + } + SCTP_DECR_ASOC_COUNT(); + SCTP_TCB_LOCK_DESTROY(stcb); + SCTP_TCB_SEND_LOCK_DESTROY(stcb); + LIST_REMOVE(stcb, sctp_tcbasocidhash); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); + SCTP_INP_WUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); + *error = ENOBUFS; + return (NULL); + } + /* Init all the timers */ + SCTP_OS_TIMER_INIT(&asoc->hb_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->delayed_event_timer.timer); + SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer); + + LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist); + /* now file the port under the hash as well */ + if (inp->sctp_tcbhash != NULL) { + head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport, + inp->sctp_hashmark)]; + LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); + } + SCTP_INP_WUNLOCK(inp); + SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", stcb); + return (stcb); +} + + +void +sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net) +{ + struct sctp_association *asoc; + + asoc = &stcb->asoc; + asoc->numnets--; + TAILQ_REMOVE(&asoc->nets, net, sctp_next); + if (net == asoc->primary_destination) { + /* Reset primary */ + struct sctp_nets *lnet; + + lnet = TAILQ_FIRST(&asoc->nets); + /* + * Mobility 
adaptation Ideally, if deleted destination is + * the primary, it becomes a fast retransmission trigger by + * the subsequent SET PRIMARY. (by micchie) + */ + if (sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_BASE) || + sctp_is_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_FASTHANDOFF)) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n"); + if (asoc->deleted_primary != NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n"); + goto out; + } + asoc->deleted_primary = net; + atomic_add_int(&net->ref_count, 1); + memset(&net->lastsa, 0, sizeof(net->lastsa)); + memset(&net->lastsv, 0, sizeof(net->lastsv)); + sctp_mobility_feature_on(stcb->sctp_ep, + SCTP_MOBILITY_PRIM_DELETED); + sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED, + stcb->sctp_ep, stcb, NULL); + } +out: + /* Try to find a confirmed primary */ + asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0); + } + if (net == asoc->last_data_chunk_from) { + /* Reset primary */ + asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets); + } + if (net == asoc->last_control_chunk_from) { + /* Clear net */ + asoc->last_control_chunk_from = NULL; + } + sctp_free_remote_addr(net); +} + +/* + * remove a remote endpoint address from an association, it will fail if the + * address does not exist. + */ +int +sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr) +{ + /* + * Here we need to remove a remote address. This is quite simple, we + * first find it in the list of address for the association + * (tasoc->asoc.nets) and then if it is there, we do a LIST_REMOVE + * on that item. Note we do not allow it to be removed if there are + * no other addresses. + */ + struct sctp_association *asoc; + struct sctp_nets *net, *net_tmp; + + asoc = &stcb->asoc; + + /* locate the address */ + for (net = TAILQ_FIRST(&asoc->nets); net != NULL; net = net_tmp) { + net_tmp = TAILQ_NEXT(net, sctp_next); + if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) { + continue; + } + if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr, + remaddr)) { + /* we found the guy */ + if (asoc->numnets < 2) { + /* Must have at LEAST two remote addresses */ + return (-1); + } else { + sctp_remove_net(stcb, net); + return (0); + } + } + } + /* not found. 
*/ + return (-2); +} + +void +sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport) +{ + struct sctpvtaghead *chain; + struct sctp_tagblock *twait_block; + int found = 0; + int i; + + chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; + if (!LIST_EMPTY(chain)) { + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + found = 1; + break; + } + } + if (found) + break; + } + } +} + +int +sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport) +{ + struct sctpvtaghead *chain; + struct sctp_tagblock *twait_block; + int found = 0; + int i; + + SCTP_INP_INFO_WLOCK(); + chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; + if (!LIST_EMPTY(chain)) { + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + found = 1; + break; + } + } + if (found) + break; + } + } + SCTP_INP_INFO_WUNLOCK(); + return (found); +} + + +void +sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport) +{ + struct sctpvtaghead *chain; + struct sctp_tagblock *twait_block; + struct timeval now; + int set, i; + + if (time == 0) { + /* Its disabled */ + return; + } + (void)SCTP_GETTIME_TIMEVAL(&now); + chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; + set = 0; + if (!LIST_EMPTY(chain)) { + /* Block(s) present, lets find space, and expire on the fly */ + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if ((twait_block->vtag_block[i].v_tag == 0) && + !set) { + twait_block->vtag_block[i].tv_sec_at_expire = + now.tv_sec + time; + twait_block->vtag_block[i].v_tag = tag; + twait_block->vtag_block[i].lport = lport; + twait_block->vtag_block[i].rport = rport; + set = 1; + } else if ((twait_block->vtag_block[i].v_tag) && + ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { + /* Audit expires this guy */ + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + if (set == 0) { + /* Reuse it for my new tag */ + twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time; + twait_block->vtag_block[i].v_tag = tag; + twait_block->vtag_block[i].lport = lport; + twait_block->vtag_block[i].rport = rport; + set = 1; + } + } + } + if (set) { + /* + * We only do up to the block where we can + * place our tag for audits + */ + break; + } + } + } + /* Need to add a new block to chain */ + if (!set) { + SCTP_MALLOC(twait_block, struct sctp_tagblock *, + sizeof(struct sctp_tagblock), SCTP_M_TIMW); + if (twait_block == NULL) { +#ifdef INVARIANTS + panic("Can not alloc tagblock"); +#endif + return; + } + memset(twait_block, 0, sizeof(struct sctp_tagblock)); + LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock); + twait_block->vtag_block[0].tv_sec_at_expire = now.tv_sec + time; + twait_block->vtag_block[0].v_tag = tag; + twait_block->vtag_block[0].lport = lport; + 
twait_block->vtag_block[0].rport = rport; + } +} + + + +/*- + * Free the association after un-hashing the remote port. This + * function ALWAYS returns holding NO LOCK on the stcb. It DOES + * expect that the input to this function IS a locked TCB. + * It will return 0, if it did NOT destroy the association (instead + * it unlocks it. It will return NON-zero if it either destroyed the + * association OR the association is already destroyed. + */ +int +sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location) +{ + int i; + struct sctp_association *asoc; + struct sctp_nets *net, *prev; + struct sctp_laddr *laddr; + struct sctp_tmit_chunk *chk; + struct sctp_asconf_addr *aparam; + struct sctp_asconf_ack *aack; + struct sctp_stream_reset_list *liste; + struct sctp_queued_to_read *sq; + struct sctp_stream_queue_pending *sp; + sctp_sharedkey_t *shared_key; + struct socket *so; + int ccnt = 0; + int cnt = 0; + + /* first, lets purge the entry from the hash table. */ + +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, stcb, 6); +#endif + if (stcb->asoc.state == 0) { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 7); +#endif + /* there is no asoc, really TSNH :-0 */ + return (1); + } + /* TEMP CODE */ + if (stcb->freed_from_where == 0) { + /* Only record the first place free happened from */ + stcb->freed_from_where = from_location; + } + /* TEMP CODE */ + + asoc = &stcb->asoc; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) + /* nothing around */ + so = NULL; + else + so = inp->sctp_socket; + + /* + * We used timer based freeing if a reader or writer is in the way. + * So we first check if we are actually being called from a timer, + * if so we abort early if a reader or writer is still in the way. + */ + if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) && + (from_inpcbfree == SCTP_NORMAL_PROC)) { + /* + * is it the timer driving us? if so are the reader/writers + * gone? + */ + if (stcb->asoc.refcnt) { + /* nope, reader or writer in the way */ + sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); + /* no asoc destroyed */ + SCTP_TCB_UNLOCK(stcb); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, stcb, 8); +#endif + return (0); + } + } + /* now clean up any other timers */ + (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer); + asoc->hb_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer); + asoc->dack_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer); + /*- + * For stream reset we don't blast this unless + * it is a str-reset timer, it might be the + * free-asoc timer which we DON'T want to + * disturb. 
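A key property of sctp_free_assoc() above is that it never waits out readers: when asoc.refcnt shows someone still inside the TCB, it re-arms the ASOCKILL timer and returns 0, and the timer retries the free later. The shape of that deferred-destroy pattern, with stand-ins for the timer and lock primitives:

#include <stdio.h>

struct assoc {
	int refcnt;
};

static void start_kill_timer(struct assoc *a) { (void)a; }
static void unlock(struct assoc *a) { (void)a; }

/* Returns 1 if destroyed, 0 if the free was deferred. */
static int
try_free(struct assoc *a)
{
	if (a->refcnt != 0) {
		start_kill_timer(a);	/* timer calls try_free() again */
		unlock(a);
		return (0);
	}
	/* ... tear everything down; the lock dies with it ... */
	return (1);
}

int
main(void)
{
	struct assoc a = { .refcnt = 1 };

	printf("%d\n", try_free(&a));	/* 0: deferred */
	a.refcnt = 0;
	printf("%d\n", try_free(&a));	/* 1: destroyed */
	return (0);
}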
+ */ + if (asoc->strreset_timer.type == SCTP_TIMER_TYPE_STRRESET) + asoc->strreset_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer); + asoc->asconf_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer); + asoc->autoclose_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer); + asoc->shut_guard_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer); + asoc->delayed_event_timer.self = NULL; + /* Mobility adaptation */ + (void)SCTP_OS_TIMER_STOP(&asoc->delete_prim_timer.timer); + asoc->delete_prim_timer.self = NULL; + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer); + net->fr_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer); + net->rxt_timer.self = NULL; + (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer); + net->pmtu_timer.self = NULL; + } + /* Now the read queue needs to be cleaned up (only once) */ + cnt = 0; + if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) { + stcb->asoc.state |= SCTP_STATE_ABOUT_TO_BE_FREED; + SCTP_INP_READ_LOCK(inp); + TAILQ_FOREACH(sq, &inp->read_queue, next) { + if (sq->stcb == stcb) { + sq->do_not_ref_stcb = 1; + sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn; + /* + * If there is no end, there never will be + * now. + */ + if (sq->end_added == 0) { + /* Held for PD-API clear that. */ + sq->pdapi_aborted = 1; + sq->held_length = 0; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { + /* + * Need to add a PD-API + * aborted indication. + * Setting the control_pdapi + * assures that it will be + * added right after this + * msg. + */ + uint32_t strseq; + + stcb->asoc.control_pdapi = sq; + strseq = (sq->sinfo_stream << 16) | sq->sinfo_ssn; + sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, + stcb, + SCTP_PARTIAL_DELIVERY_ABORTED, + (void *)&strseq, + SCTP_SO_LOCKED); + stcb->asoc.control_pdapi = NULL; + } + } + /* Add an end to wake them */ + sq->end_added = 1; + cnt++; + } + } + SCTP_INP_READ_UNLOCK(inp); + if (stcb->block_entry) { + cnt++; + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET); + stcb->block_entry->error = ECONNRESET; + stcb->block_entry = NULL; + } + } + if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) { + /* + * Someone holds a reference OR the socket is unaccepted + * yet. + */ + if ((stcb->asoc.refcnt) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { + stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); + } + SCTP_TCB_UNLOCK(stcb); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) + /* nothing around */ + so = NULL; + if (so) { + /* Wake any reader/writers */ + sctp_sorwakeup(inp, so); + sctp_sowwakeup(inp, so); + } +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, stcb, 9); +#endif + /* no asoc destroyed */ + return (0); + } +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, stcb, 10); +#endif + /* + * When I reach here, no others want to kill the assoc yet.. and I + * own the lock. Now its possible an abort comes in when I do the + * lock exchange below to grab all the locks to do the final take + * out. to prevent this we increment the count, which will start a + * timer and blow out above thus assuring us that we hold exclusive + * killing of the asoc. 
Note that after getting back the TCB lock we + * will go ahead and increment the counter back up and stop any + * timer a passing stranger may have started :-S + */ + if (from_inpcbfree == SCTP_NORMAL_PROC) { + atomic_add_int(&stcb->asoc.refcnt, 1); + + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_INFO_WLOCK(); + SCTP_INP_WLOCK(inp); + SCTP_TCB_LOCK(stcb); + } + /* Double check the GONE flag */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) + /* nothing around */ + so = NULL; + + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* + * For TCP type we need special handling when we are + * connected. We also include the peel'ed off ones to. + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED; + inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED; + if (so) { + SOCK_LOCK(so); + if (so->so_rcv.sb_cc == 0) { + so->so_state &= ~(SS_ISCONNECTING | + SS_ISDISCONNECTING | + SS_ISCONFIRMING | + SS_ISCONNECTED); + } + socantrcvmore_locked(so); + sctp_sowwakeup(inp, so); + sctp_sorwakeup(inp, so); + SCTP_SOWAKEUP(so); + } + } + } + /* + * Make it invalid too, that way if its about to run it will abort + * and return. + */ + /* re-increment the lock */ + if (from_inpcbfree == SCTP_NORMAL_PROC) { + atomic_add_int(&stcb->asoc.refcnt, -1); + } + if (stcb->asoc.refcnt) { + stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE; + sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); + if (from_inpcbfree == SCTP_NORMAL_PROC) { + SCTP_INP_INFO_WUNLOCK(); + SCTP_INP_WUNLOCK(inp); + } + SCTP_TCB_UNLOCK(stcb); + return (0); + } + asoc->state = 0; + if (inp->sctp_tcbhash) { + LIST_REMOVE(stcb, sctp_tcbhash); + } + if (stcb->asoc.in_asocid_hash) { + LIST_REMOVE(stcb, sctp_tcbasocidhash); + } + /* Now lets remove it from the list of ALL associations in the EP */ + LIST_REMOVE(stcb, sctp_tcblist); + if (from_inpcbfree == SCTP_NORMAL_PROC) { + SCTP_INP_INCR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } + /* pull from vtag hash */ + LIST_REMOVE(stcb, sctp_asocs); + sctp_add_vtag_to_timewait(asoc->my_vtag, SCTP_BASE_SYSCTL(sctp_vtag_time_wait), + inp->sctp_lport, stcb->rport); + + /* + * Now restop the timers to be sure this is paranoia at is finest! + */ + (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->shut_guard_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer); + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer); + (void)SCTP_OS_TIMER_STOP(&net->rxt_timer.timer); + (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer); + } + + asoc->strreset_timer.type = SCTP_TIMER_TYPE_NONE; + prev = NULL; + /* + * The chunk lists and such SHOULD be empty but we check them just + * in case. 
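Every per-queue loop that follows uses the same drain idiom: take the head, unlink it, release the payload and address reference, and return the container to its zone. Reduced to a skeleton over a TAILQ, with plain free() standing in for sctp_m_freem() and SCTP_ZONE_FREE():

#include <stdlib.h>
#include <sys/queue.h>

struct chunk {
	void *data;
	TAILQ_ENTRY(chunk) link;
};
TAILQ_HEAD(chunk_queue, chunk);

static void
drain_queue(struct chunk_queue *q)
{
	struct chunk *chk;

	while ((chk = TAILQ_FIRST(q)) != NULL) {
		TAILQ_REMOVE(q, chk, link);
		free(chk->data);	/* sctp_m_freem() in the original */
		free(chk);		/* SCTP_ZONE_FREE() in the original */
	}
}

int
main(void)
{
	struct chunk_queue q = TAILQ_HEAD_INITIALIZER(q);
	struct chunk *c;

	if ((c = calloc(1, sizeof(*c))) == NULL)
		return (1);
	TAILQ_INSERT_TAIL(&q, c, link);
	drain_queue(&q);
	return (0);
}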
+ */ + /* anything on the wheel needs to be removed */ + for (i = 0; i < asoc->streamoutcnt; i++) { + struct sctp_stream_out *outs; + + outs = &asoc->strmout[i]; + /* now clean up any chunks here */ + sp = TAILQ_FIRST(&outs->outqueue); + while (sp) { + TAILQ_REMOVE(&outs->outqueue, sp, next); + if (sp->data) { + if (so) { + /* Still an open socket - report */ + sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_UNSENT, + (void *)sp, SCTP_SO_LOCKED); + } + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + sp->tail_mbuf = NULL; + } + } + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + sctp_free_spbufspace(stcb, asoc, sp); + if (sp->holds_key_ref) + sctp_auth_key_release(stcb, sp->auth_keyid); + /* Free the zone stuff */ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), sp); + SCTP_DECR_STRMOQ_COUNT(); + /* sa_ignore FREED_MEMORY */ + sp = TAILQ_FIRST(&outs->outqueue); + } + } + + /* sa_ignore FREED_MEMORY */ + while ((liste = TAILQ_FIRST(&asoc->resetHead)) != NULL) { + TAILQ_REMOVE(&asoc->resetHead, liste, next_resp); + SCTP_FREE(liste, SCTP_M_STRESET); + } + + sq = TAILQ_FIRST(&asoc->pending_reply_queue); + while (sq) { + TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next); + if (sq->data) { + sctp_m_freem(sq->data); + sq->data = NULL; + } + sctp_free_remote_addr(sq->whoFrom); + sq->whoFrom = NULL; + sq->stcb = NULL; + /* Free the ctl entry */ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), sq); + SCTP_DECR_READQ_COUNT(); + /* sa_ignore FREED_MEMORY */ + sq = TAILQ_FIRST(&asoc->pending_reply_queue); + } + + chk = TAILQ_FIRST(&asoc->free_chunks); + while (chk) { + TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + ccnt++; + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); + asoc->free_chunk_cnt--; + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->free_chunks); + } + /* pending send queue SHOULD be empty */ + if (!TAILQ_EMPTY(&asoc->send_queue)) { + chk = TAILQ_FIRST(&asoc->send_queue); + while (chk) { + TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); + if (chk->data) { + if (so) { + /* Still a socket? */ + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_UNSENT, chk, SCTP_SO_LOCKED); + } + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + ccnt++; + if (chk->whoTo) { + sctp_free_remote_addr(chk->whoTo); + chk->whoTo = NULL; + } + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->send_queue); + } + } +/* + if (ccnt) { + printf("Freed %d from send_queue\n", ccnt); + ccnt = 0; + } +*/ + /* sent queue SHOULD be empty */ + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + chk = TAILQ_FIRST(&asoc->sent_queue); + while (chk) { + TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); + if (chk->data) { + if (so) { + /* Still a socket? 
*/ + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_SENT, chk, SCTP_SO_LOCKED); + } + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + ccnt++; + sctp_free_remote_addr(chk->whoTo); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->sent_queue); + } + } +/* + if (ccnt) { + printf("Freed %d from sent_queue\n", ccnt); + ccnt = 0; + } +*/ + /* control queue MAY not be empty */ + if (!TAILQ_EMPTY(&asoc->control_send_queue)) { + chk = TAILQ_FIRST(&asoc->control_send_queue); + while (chk) { + TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + ccnt++; + sctp_free_remote_addr(chk->whoTo); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->control_send_queue); + } + } +/* + if (ccnt) { + printf("Freed %d from ctrl_queue\n", ccnt); + ccnt = 0; + } +*/ + + /* ASCONF queue MAY not be empty */ + if (!TAILQ_EMPTY(&asoc->asconf_send_queue)) { + chk = TAILQ_FIRST(&asoc->asconf_send_queue); + while (chk) { + TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + ccnt++; + sctp_free_remote_addr(chk->whoTo); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->asconf_send_queue); + } + } +/* + if (ccnt) { + printf("Freed %d from asconf_queue\n", ccnt); + ccnt = 0; + } +*/ + if (!TAILQ_EMPTY(&asoc->reasmqueue)) { + chk = TAILQ_FIRST(&asoc->reasmqueue); + while (chk) { + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + if (chk->holds_key_ref) + sctp_auth_key_release(stcb, chk->auth_keyid); + sctp_free_remote_addr(chk->whoTo); + ccnt++; + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); + SCTP_DECR_CHK_COUNT(); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + } + } +/* + if (ccnt) { + printf("Freed %d from reasm_queue\n", ccnt); + ccnt = 0; + } +*/ + if (asoc->mapping_array) { + SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); + asoc->mapping_array = NULL; + } + if (asoc->nr_mapping_array) { + SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); + asoc->nr_mapping_array = NULL; + } + /* the stream outs */ + if (asoc->strmout) { + SCTP_FREE(asoc->strmout, SCTP_M_STRMO); + asoc->strmout = NULL; + } + asoc->strm_realoutsize = asoc->streamoutcnt = 0; + if (asoc->strmin) { + struct sctp_queued_to_read *ctl; + + for (i = 0; i < asoc->streamincnt; i++) { + if (!TAILQ_EMPTY(&asoc->strmin[i].inqueue)) { + /* We have somethings on the streamin queue */ + ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue); + while (ctl) { + TAILQ_REMOVE(&asoc->strmin[i].inqueue, + ctl, next); + sctp_free_remote_addr(ctl->whoFrom); + if (ctl->data) { + sctp_m_freem(ctl->data); + ctl->data = NULL; + } + /* + * We don't free the address here + * since all the net's were freed + * above. 
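sctp_free_remote_addr(), called throughout these loops, is reference-counted: each queue entry pointing at a net holds one count, and the structure only goes back to its zone when the last reference drops. A minimal sketch, assuming a C11 atomic counter in place of atomic_add_int():

#include <stdatomic.h>
#include <stdlib.h>

struct net {
	atomic_int ref_count;
};

static void
net_free(struct net *n)
{
	/* fetch_sub returns the old value; 1 means we were last. */
	if (atomic_fetch_sub(&n->ref_count, 1) == 1)
		free(n);
}

int
main(void)
{
	struct net *n;

	if ((n = malloc(sizeof(*n))) == NULL)
		return (1);
	atomic_init(&n->ref_count, 2);
	net_free(n);	/* one holder still out there */
	net_free(n);	/* last reference: freed */
	return (0);
}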
+ */ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); + SCTP_DECR_READQ_COUNT(); + ctl = TAILQ_FIRST(&asoc->strmin[i].inqueue); + } + } + } + SCTP_FREE(asoc->strmin, SCTP_M_STRMI); + asoc->strmin = NULL; + } + asoc->streamincnt = 0; + while (!TAILQ_EMPTY(&asoc->nets)) { + /* sa_ignore FREED_MEMORY */ + net = TAILQ_FIRST(&asoc->nets); + /* pull from list */ + if ((SCTP_BASE_INFO(ipi_count_raddr) == 0) || (prev == net)) { +#ifdef INVARIANTS + panic("no net's left alloc'ed, or list points to itself"); +#endif + break; + } + prev = net; + TAILQ_REMOVE(&asoc->nets, net, sctp_next); + sctp_free_remote_addr(net); + } + + while (!LIST_EMPTY(&asoc->sctp_restricted_addrs)) { + /* sa_ignore FREED_MEMORY */ + laddr = LIST_FIRST(&asoc->sctp_restricted_addrs); + sctp_remove_laddr(laddr); + } + + /* pending asconf (address) parameters */ + while (!TAILQ_EMPTY(&asoc->asconf_queue)) { + /* sa_ignore FREED_MEMORY */ + aparam = TAILQ_FIRST(&asoc->asconf_queue); + TAILQ_REMOVE(&asoc->asconf_queue, aparam, next); + SCTP_FREE(aparam, SCTP_M_ASC_ADDR); + } + while (!TAILQ_EMPTY(&asoc->asconf_ack_sent)) { + /* sa_ignore FREED_MEMORY */ + aack = TAILQ_FIRST(&asoc->asconf_ack_sent); + TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next); + if (aack->data != NULL) { + sctp_m_freem(aack->data); + } + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack); + } + /* clean up auth stuff */ + if (asoc->local_hmacs) + sctp_free_hmaclist(asoc->local_hmacs); + if (asoc->peer_hmacs) + sctp_free_hmaclist(asoc->peer_hmacs); + + if (asoc->local_auth_chunks) + sctp_free_chunklist(asoc->local_auth_chunks); + if (asoc->peer_auth_chunks) + sctp_free_chunklist(asoc->peer_auth_chunks); + + sctp_free_authinfo(&asoc->authinfo); + + shared_key = LIST_FIRST(&asoc->shared_keys); + while (shared_key) { + LIST_REMOVE(shared_key, next); + sctp_free_sharedkey(shared_key); + /* sa_ignore FREED_MEMORY */ + shared_key = LIST_FIRST(&asoc->shared_keys); + } + + /* Insert new items here :> */ + + /* Get rid of LOCK */ + SCTP_TCB_LOCK_DESTROY(stcb); + SCTP_TCB_SEND_LOCK_DESTROY(stcb); + if (from_inpcbfree == SCTP_NORMAL_PROC) { + SCTP_INP_INFO_WUNLOCK(); + SCTP_INP_RLOCK(inp); + } +#ifdef SCTP_TRACK_FREED_ASOCS + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* now clean up the tasoc itself */ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); + SCTP_DECR_ASOC_COUNT(); + } else { + LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist); + } +#else + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); + SCTP_DECR_ASOC_COUNT(); +#endif + if (from_inpcbfree == SCTP_NORMAL_PROC) { + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + /* + * If its NOT the inp_free calling us AND sctp_close + * as been called, we call back... + */ + SCTP_INP_RUNLOCK(inp); + /* + * This will start the kill timer (if we are the + * last one) since we hold an increment yet. But + * this is the only safe way to do this since + * otherwise if the socket closes at the same time + * we are here we might collide in the cleanup. + */ + sctp_inpcb_free(inp, + SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, + SCTP_CALLED_DIRECTLY_NOCMPSET); + SCTP_INP_DECR_REF(inp); + goto out_of; + } else { + /* The socket is still open. 
*/ + SCTP_INP_DECR_REF(inp); + } + } + if (from_inpcbfree == SCTP_NORMAL_PROC) { + SCTP_INP_RUNLOCK(inp); + } +out_of: + /* destroyed the asoc */ +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 11); +#endif + return (1); +} + + + +/* + * determine if a destination is "reachable" based upon the addresses bound + * to the current endpoint (e.g. only v4 or v6 currently bound) + */ +/* + * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use + * assoc level v4/v6 flags, as the assoc *may* not have the same address + * types bound as its endpoint + */ +int +sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr) +{ + struct sctp_inpcb *inp; + int answer; + + /* + * No locks here, the TCB, in all cases is already locked and an + * assoc is up. There is either a INP lock by the caller applied (in + * asconf case when deleting an address) or NOT in the HB case, + * however if HB then the INP increment is up and the INP will not + * be removed (on top of the fact that we have a TCB lock). So we + * only want to read the sctp_flags, which is either bound-all or + * not.. no protection needed since once an assoc is up you can't be + * changing your binding. + */ + inp = stcb->sctp_ep; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* if bound all, destination is not restricted */ + /* + * RRS: Question during lock work: Is this correct? If you + * are bound-all you still might need to obey the V4--V6 + * flags??? IMO this bound-all stuff needs to be removed! + */ + return (1); + } + /* NOTE: all "scope" checks are done when local addresses are added */ + if (destaddr->sa_family == AF_INET6) { + answer = inp->ip_inp.inp.inp_vflag & INP_IPV6; + } else if (destaddr->sa_family == AF_INET) { + answer = inp->ip_inp.inp.inp_vflag & INP_IPV4; + } else { + /* invalid family, so it's unreachable */ + answer = 0; + } + return (answer); +} + +/* + * update the inp_vflags on an endpoint + */ +static void +sctp_update_ep_vflag(struct sctp_inpcb *inp) +{ + struct sctp_laddr *laddr; + + /* first clear the flag */ + inp->ip_inp.inp.inp_vflag = 0; + /* set the flag based on addresses on the ep list */ + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", + __FUNCTION__); + continue; + } + if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { + continue; + } + if (laddr->ifa->address.sa.sa_family == AF_INET6) { + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + } else if (laddr->ifa->address.sa.sa_family == AF_INET) { + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + } + } +} + +/* + * Add the address to the endpoint local address list There is nothing to be + * done if we are bound to all addresses + */ +void +sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) +{ + struct sctp_laddr *laddr; + int fnd, error = 0; + + fnd = 0; + + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* You are already bound to all. You have it already */ + return; + } + if (ifa->address.sa.sa_family == AF_INET6) { + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + /* Can't bind a non-useable addr. */ + return; + } + } + /* first, is it already present? 
*/ + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == ifa) { + fnd = 1; + break; + } + } + + if (fnd == 0) { + /* Not in the ep list */ + error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action); + if (error != 0) + return; + inp->laddr_count++; + /* update inp_vflag flags */ + if (ifa->address.sa.sa_family == AF_INET6) { + inp->ip_inp.inp.inp_vflag |= INP_IPV6; + } else if (ifa->address.sa.sa_family == AF_INET) { + inp->ip_inp.inp.inp_vflag |= INP_IPV4; + } + } + return; +} + + +/* + * select a new (hopefully reachable) destination net (should only be used + * when we deleted an ep addr that is the only usable source address to reach + * the destination net) + */ +static void +sctp_select_primary_destination(struct sctp_tcb *stcb) +{ + struct sctp_nets *net; + + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + /* for now, we'll just pick the first reachable one we find */ + if (net->dest_state & SCTP_ADDR_UNCONFIRMED) + continue; + if (sctp_destination_is_reachable(stcb, + (struct sockaddr *)&net->ro._l_addr)) { + /* found a reachable destination */ + stcb->asoc.primary_destination = net; + } + } + /* I can't there from here! ...we're gonna die shortly... */ +} + + +/* + * Delete the address from the endpoint local address list There is nothing + * to be done if we are bound to all addresses + */ +void +sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) +{ + struct sctp_laddr *laddr; + int fnd; + + fnd = 0; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* You are already bound to all. You have it already */ + return; + } + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == ifa) { + fnd = 1; + break; + } + } + if (fnd && (inp->laddr_count < 2)) { + /* can't delete unless there are at LEAST 2 addresses */ + return; + } + if (fnd) { + /* + * clean up any use of this address go through our + * associations and clear any last_used_address that match + * this one for each assoc, see if a new primary_destination + * is needed + */ + struct sctp_tcb *stcb; + + /* clean up "next_addr_touse" */ + if (inp->next_addr_touse == laddr) + /* delete this address */ + inp->next_addr_touse = NULL; + + /* clean up "last_used_address" */ + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + struct sctp_nets *net; + + SCTP_TCB_LOCK(stcb); + if (stcb->asoc.last_used_address == laddr) + /* delete this address */ + stcb->asoc.last_used_address = NULL; + /* + * Now spin through all the nets and purge any ref + * to laddr + */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (net->ro._s_addr && + (net->ro._s_addr->ifa == laddr->ifa)) { + /* Yep, purge src address selected */ + sctp_rtentry_t *rt; + + /* delete this address if cached */ + rt = net->ro.ro_rt; + if (rt != NULL) { + RTFREE(rt); + net->ro.ro_rt = NULL; + } + sctp_free_ifa(net->ro._s_addr); + net->ro._s_addr = NULL; + net->src_addr_selected = 0; + } + } + SCTP_TCB_UNLOCK(stcb); + } /* for each tcb */ + /* remove it from the ep list */ + sctp_remove_laddr(laddr); + inp->laddr_count--; + /* update inp_vflag flags */ + sctp_update_ep_vflag(inp); + } + return; +} + +/* + * Add the address to the TCB local address restricted list. + * This is a "pending" address list (eg. addresses waiting for an + * ASCONF-ACK response) and cannot be used as a valid source address. 
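sctp_add_local_addr_ep() above, like sctp_add_local_addr_restricted() about to be defined, follows an insert-if-absent shape: scan the list for the ifa and only allocate, reference, and link a new entry when no match exists. The same shape in miniature, with illustrative types:

#include <stdlib.h>
#include <sys/queue.h>

struct ifa { int refcount; };

struct laddr {
	struct ifa *ifa;
	LIST_ENTRY(laddr) link;
};
LIST_HEAD(laddr_list, laddr);

/* Returns 0 on success or if already present, -1 on ENOMEM. */
static int
add_if_absent(struct laddr_list *list, struct ifa *ifa)
{
	struct laddr *l;

	LIST_FOREACH(l, list, link) {
		if (l->ifa == ifa)
			return (0);	/* already bound */
	}
	if ((l = calloc(1, sizeof(*l))) == NULL)
		return (-1);
	l->ifa = ifa;
	ifa->refcount++;	/* the entry now holds a reference */
	LIST_INSERT_HEAD(list, l, link);
	return (0);
}

int
main(void)
{
	struct laddr_list list = LIST_HEAD_INITIALIZER(list);
	struct ifa a = { 0 };

	add_if_absent(&list, &a);
	add_if_absent(&list, &a);	/* no-op: already there */
	return (a.refcount == 1 ? 0 : 1);
}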
+ */ +void +sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) +{ + struct sctp_inpcb *inp; + struct sctp_laddr *laddr; + struct sctpladdr *list; + + /* + * Assumes TCB is locked.. and possibly the INP. May need to + * confirm/fix that if we need it and is not the case. + */ + list = &stcb->asoc.sctp_restricted_addrs; + + inp = stcb->sctp_ep; + if (ifa->address.sa.sa_family == AF_INET6) { + if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { + /* Can't bind a non-existent addr. */ + return; + } + } + /* does the address already exist? */ + LIST_FOREACH(laddr, list, sctp_nxt_addr) { + if (laddr->ifa == ifa) { + return; + } + } + + /* add to the list */ + (void)sctp_insert_laddr(list, ifa, 0); + return; +} + +/* + * insert an laddr entry with the given ifa for the desired list + */ +int +sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) +{ + struct sctp_laddr *laddr; + + laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (laddr == NULL) { + /* out of memory? */ + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); + return (EINVAL); + } + SCTP_INCR_LADDR_COUNT(); + bzero(laddr, sizeof(*laddr)); + (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); + laddr->ifa = ifa; + laddr->action = act; + atomic_add_int(&ifa->refcount, 1); + /* insert it */ + LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); + + return (0); +} + +/* + * Remove an laddr entry from the local address list (on an assoc) + */ +void +sctp_remove_laddr(struct sctp_laddr *laddr) +{ + + /* remove from the list */ + LIST_REMOVE(laddr, sctp_nxt_addr); + sctp_free_ifa(laddr->ifa); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); + SCTP_DECR_LADDR_COUNT(); +} + +/* + * Remove a local address from the TCB local address restricted list + */ +void +sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) +{ + struct sctp_inpcb *inp; + struct sctp_laddr *laddr; + + /* + * This is called by asconf work. It is assumed that a) The TCB is + * locked and b) The INP is locked. This is true in as much as I can + * trace through the entry asconf code where I did these locks. + * Again, the ASCONF code is a bit different in that it does lock + * the INP during its work often times. This must be since we don't + * want other proc's looking up things while what they are looking + * up is changing :-D + */ + + inp = stcb->sctp_ep; + /* if subset bound and don't allow ASCONF's, can't delete last */ + if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) && + sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { + if (stcb->sctp_ep->laddr_count < 2) { + /* can't delete last address */ + return; + } + } + LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { + /* remove the address if it exists */ + if (laddr->ifa == NULL) + continue; + if (laddr->ifa == ifa) { + sctp_remove_laddr(laddr); + return; + } + } + + /* address not found! */ + return; +} + +/* + * Temporarily remove for __APPLE__ until we use the Tiger equivalents + */ +/* sysctl */ +static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC; +static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR; + +void +sctp_pcb_init() +{ + /* + * SCTP initialization for the PCB structures should be called by + * the sctp_init() funciton. 
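sctp_pcb_init() just below guards itself with the sctp_pcb_initialized flag so the hash tables, zones, and locks are built exactly once even if initialization is requested twice. The idiom, stripped to its essentials (single-threaded sketch; the real flag lives in SCTP_BASE_VAR()):

#include <stdio.h>

static int initialized;

static void
pcb_init(void)
{
	if (initialized != 0)
		return;		/* called twice: nothing to do */
	initialized = 1;
	/* ... build hash tables, zones, locks ... */
}

int
main(void)
{
	pcb_init();
	pcb_init();	/* harmless second call */
	printf("%d\n", initialized);	/* 1 */
	return (0);
}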
+ */ + int i; + struct timeval tv; + + if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) { + /* error I was called twice */ + return; + } + SCTP_BASE_VAR(sctp_pcb_initialized) = 1; + +#if defined(SCTP_LOCAL_TRACE_BUF) + bzero(&SCTP_BASE_SYSCTL(sctp_log), sizeof(struct sctp_log)); +#endif + (void)SCTP_GETTIME_TIMEVAL(&tv); +#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) + SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t) tv.tv_sec; + SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t) tv.tv_usec; +#else + SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t) tv.tv_sec; + SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t) tv.tv_usec; +#endif + /* init the empty list of (All) Endpoints */ + LIST_INIT(&SCTP_BASE_INFO(listhead)); + + + /* init the hash table of endpoints */ + TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize)); + TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize)); + TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale)); + SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31), + &SCTP_BASE_INFO(hashasocmark)); + SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), + &SCTP_BASE_INFO(hashmark)); + SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), + &SCTP_BASE_INFO(hashtcpmark)); + SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize); + + + SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH, + &SCTP_BASE_INFO(hashvrfmark)); + + SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE, + &SCTP_BASE_INFO(vrf_ifn_hashmark)); + /* init the zones */ + /* + * FIX ME: Should check for NULL returns, but if it does fail we are + * doomed to panic anyways... add later maybe. 
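The FIX ME above concedes that the zone constructors are never checked for failure. A checked wrapper along the lines the comment suggests might look like this; zone_create() is a placeholder, not the real allocator, and the abort-on-failure policy just mirrors the comment's "doomed to panic anyways":

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a zone allocator handle and constructor. */
typedef void *zone_t;

static zone_t
zone_create(const char *name, size_t size, int max)
{
	(void)name; (void)size; (void)max;
	return (malloc(size));	/* pretend */
}

static zone_t
zone_create_checked(const char *name, size_t size, int max)
{
	zone_t z = zone_create(name, size, max);

	if (z == NULL) {
		fprintf(stderr, "zone %s: out of memory\n", name);
		abort();
	}
	return (z);
}

int
main(void)
{
	zone_t z = zone_create_checked("sctp_ep", 128, 1024);

	free(z);
	return (0);
}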
+ */ + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep", + sizeof(struct sctp_inpcb), maxsockets); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc", + sizeof(struct sctp_tcb), sctp_max_number_of_assoc); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr", + sizeof(struct sctp_laddr), + (sctp_max_number_of_assoc * sctp_scale_up_for_address)); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr", + sizeof(struct sctp_nets), + (sctp_max_number_of_assoc * sctp_scale_up_for_address)); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk", + sizeof(struct sctp_tmit_chunk), + (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq", + sizeof(struct sctp_queued_to_read), + (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out", + sizeof(struct sctp_stream_queue_pending), + (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf", + sizeof(struct sctp_asconf), + (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); + + SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack", + sizeof(struct sctp_asconf_ack), + (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); + + + /* Master Lock INIT for info structure */ + SCTP_INP_INFO_LOCK_INIT(); + SCTP_STATLOG_INIT_LOCK(); + + SCTP_IPI_COUNT_INIT(); + SCTP_IPI_ADDR_INIT(); +#ifdef SCTP_PACKET_LOGGING + SCTP_IP_PKTLOG_INIT(); +#endif + LIST_INIT(&SCTP_BASE_INFO(addr_wq)); + + SCTP_WQ_ADDR_INIT(); + /* not sure if we need all the counts */ + SCTP_BASE_INFO(ipi_count_ep) = 0; + /* assoc/tcb zone info */ + SCTP_BASE_INFO(ipi_count_asoc) = 0; + /* local addrlist zone info */ + SCTP_BASE_INFO(ipi_count_laddr) = 0; + /* remote addrlist zone info */ + SCTP_BASE_INFO(ipi_count_raddr) = 0; + /* chunk info */ + SCTP_BASE_INFO(ipi_count_chunk) = 0; + + /* socket queue zone info */ + SCTP_BASE_INFO(ipi_count_readq) = 0; + + /* stream out queue count */ + SCTP_BASE_INFO(ipi_count_strmoq) = 0; + + SCTP_BASE_INFO(ipi_free_strmoq) = 0; + SCTP_BASE_INFO(ipi_free_chunks) = 0; + + SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer)); + + /* Init the TIMEWAIT list */ + for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { + LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]); + } + + sctp_startup_iterator(); + + /* + * INIT the default VRF, which for BSD is the only one; other O/S's + * may have more, but initially they must start with one and then + * add VRF's as addresses are added. + */ + sctp_init_vrf_list(SCTP_DEFAULT_VRF); + +} + +/* + * Assumes that the SCTP_BASE_INFO() lock is NOT held. + */ +void +sctp_pcb_finish(void) +{ + struct sctp_vrflist *vrf_bucket; + struct sctp_vrf *vrf; + struct sctp_ifn *ifn; + struct sctp_ifa *ifa; + struct sctpvtaghead *chain; + struct sctp_tagblock *twait_block, *prev_twait_block; + struct sctp_laddr *wi; + int i; + + /* + * In FreeBSD the iterator thread never exits, but we do clean up + * here. The only way FreeBSD reaches this point is if we have + * VRF's, but we still add the ifdef to make it compile on old + * versions.
+ */ + { + struct sctp_iterator *it, *nit; + + SCTP_IPI_ITERATOR_WQ_LOCK(); + it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead); + while (it) { + nit = TAILQ_NEXT(it, sctp_nxt_itr); + if (it->vn != curvnet) { + it = nit; + continue; + } + TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, + it, sctp_nxt_itr); + if (it->function_atend != NULL) { + (*it->function_atend) (it->pointer, it->val); + } + SCTP_FREE(it, SCTP_M_ITER); + it = nit; + } + SCTP_IPI_ITERATOR_WQ_UNLOCK(); + SCTP_ITERATOR_LOCK(); + if ((sctp_it_ctl.cur_it) && + (sctp_it_ctl.cur_it->vn == curvnet)) { + sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; + } + SCTP_ITERATOR_UNLOCK(); + } + + SCTP_OS_TIMER_STOP(&SCTP_BASE_INFO(addr_wq_timer.timer)); + SCTP_WQ_ADDR_LOCK(); + while ((wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq))) != NULL) { + LIST_REMOVE(wi, sctp_nxt_addr); + SCTP_DECR_LADDR_COUNT(); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi); + } + SCTP_WQ_ADDR_UNLOCK(); + + /* + * free the vrf/ifn/ifa lists and hashes (be sure address monitor is + * destroyed first). + */ + vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))]; + while ((vrf = LIST_FIRST(vrf_bucket)) != NULL) { + while ((ifn = LIST_FIRST(&vrf->ifnlist)) != NULL) { + while ((ifa = LIST_FIRST(&ifn->ifalist)) != NULL) { + /* free the ifa */ + LIST_REMOVE(ifa, next_bucket); + LIST_REMOVE(ifa, next_ifa); + SCTP_FREE(ifa, SCTP_M_IFA); + } + /* free the ifn */ + LIST_REMOVE(ifn, next_bucket); + LIST_REMOVE(ifn, next_ifn); + SCTP_FREE(ifn, SCTP_M_IFN); + } + SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); + /* free the vrf */ + LIST_REMOVE(vrf, next_vrf); + SCTP_FREE(vrf, SCTP_M_VRF); + } + /* free the vrf hashes */ + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark)); + SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark)); + + /* + * free the TIMEWAIT list elements malloc'd in the function + * sctp_add_vtag_to_timewait()... 
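+ * The walk below frees the block *behind* the iteration cursor, so + * that LIST_FOREACH never touches memory that has already been + * freed; the final block is released after the loop ends.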
+ */ + for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { + chain = &SCTP_BASE_INFO(vtag_timewait)[i]; + if (!LIST_EMPTY(chain)) { + prev_twait_block = NULL; + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + if (prev_twait_block) { + SCTP_FREE(prev_twait_block, SCTP_M_TIMW); + } + prev_twait_block = twait_block; + } + SCTP_FREE(prev_twait_block, SCTP_M_TIMW); + } + } + + /* free the locks and mutexes */ +#ifdef SCTP_PACKET_LOGGING + SCTP_IP_PKTLOG_DESTROY(); +#endif + SCTP_IPI_ADDR_DESTROY(); + SCTP_STATLOG_DESTROY(); + SCTP_INP_INFO_LOCK_DESTROY(); + + SCTP_WQ_ADDR_DESTROY(); + + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); + SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); + /* Get rid of other stuff to */ + if (SCTP_BASE_INFO(sctp_asochash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); + if (SCTP_BASE_INFO(sctp_ephash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); + if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) + SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); + +} + + +int +sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, + int iphlen, int offset, int limit, struct sctphdr *sh, + struct sockaddr *altsa) +{ + /* + * grub through the INIT pulling addresses and loading them to the + * nets structure in the asoc. The from address in the mbuf should + * also be loaded (if it is not already). This routine can be called + * with either INIT or INIT-ACK's as long as the m points to the IP + * packet and the offset points to the beginning of the parameters. + */ + struct sctp_inpcb *inp, *l_inp; + struct sctp_nets *net, *net_tmp; + struct ip *iph; + struct sctp_paramhdr *phdr, parm_buf; + struct sctp_tcb *stcb_tmp; + uint16_t ptype, plen; + struct sockaddr *sa; + struct sockaddr_storage dest_store; + struct sockaddr *local_sa = (struct sockaddr *)&dest_store; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_random *p_random = NULL; + uint16_t random_len = 0; + uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_hmac_algo *hmacs = NULL; + uint16_t hmacs_len = 0; + uint8_t saw_asconf = 0; + uint8_t saw_asconf_ack = 0; + uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; + struct sctp_auth_chunk_list *chunks = NULL; + uint16_t num_chunks = 0; + sctp_key_t *new_key; + uint32_t keylen; + int got_random = 0, got_hmacs = 0, got_chklist = 0; + + /* First get the destination address setup too. 
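+ * Both a v4 and a v6 sockaddr are prepared up front; whichever one + * matches the IP version of the received packet (or the altsa + * override used for cookies) is then selected as the source address + * below.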
*/ + memset(&sin, 0, sizeof(sin)); + memset(&sin6, 0, sizeof(sin6)); + + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_port = stcb->rport; + + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_port = stcb->rport; + if (altsa == NULL) { + iph = mtod(m, struct ip *); + switch (iph->ip_v) { + case IPVERSION: + { + /* its IPv4 */ + struct sockaddr_in *sin_2; + + sin_2 = (struct sockaddr_in *)(local_sa); + memset(sin_2, 0, sizeof(sin)); + sin_2->sin_family = AF_INET; + sin_2->sin_len = sizeof(sin); + sin_2->sin_port = sh->dest_port; + sin_2->sin_addr.s_addr = iph->ip_dst.s_addr; + sin.sin_addr = iph->ip_src; + sa = (struct sockaddr *)&sin; + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + /* its IPv6 */ + struct ip6_hdr *ip6; + struct sockaddr_in6 *sin6_2; + + ip6 = mtod(m, struct ip6_hdr *); + sin6_2 = (struct sockaddr_in6 *)(local_sa); + memset(sin6_2, 0, sizeof(sin6)); + sin6_2->sin6_family = AF_INET6; + sin6_2->sin6_len = sizeof(struct sockaddr_in6); + sin6_2->sin6_port = sh->dest_port; + sin6.sin6_addr = ip6->ip6_src; + sa = (struct sockaddr *)&sin6; + break; + } +#endif + default: + return (-1); + break; + } + } else { + /* + * For cookies we use the src address NOT from the packet + * but from the original INIT + */ + sa = altsa; + } + /* Turn off ECN until we get through all params */ + stcb->asoc.ecn_allowed = 0; + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + /* mark all addresses that we have currently on the list */ + net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; + } + /* does the source address already exist? if so skip it */ + l_inp = inp = stcb->sctp_ep; + + atomic_add_int(&stcb->asoc.refcnt, 1); + stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, local_sa, stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + + if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { + /* we must add the source address */ + /* no scope set here since we have a tcb already. */ + if ((sa->sa_family == AF_INET) && + (stcb->asoc.ipv4_addr_legal)) { + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { + return (-1); + } + } else if ((sa->sa_family == AF_INET6) && + (stcb->asoc.ipv6_addr_legal)) { + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { + return (-2); + } + } + } else { + if (net_tmp != NULL && stcb_tmp == stcb) { + net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; + } else if (stcb_tmp != stcb) { + /* It belongs to another association? */ + if (stcb_tmp) + SCTP_TCB_UNLOCK(stcb_tmp); + return (-3); + } + } + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-4); + } + /* + * peer must explicitly turn this on. This may have been initialized + * to be "on" in order to allow local addr changes while INIT's are + * in flight. + */ + stcb->asoc.peer_supports_asconf = 0; + /* now we must go through each of the params. 
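+ * The parameters form a TLV stream: each begins with a + * sctp_paramhdr carrying a 16-bit type and a 16-bit length that + * includes the header itself, and each is padded to a 4-byte + * boundary, hence the SCTP_SIZE32(plen) advance at next_param. A + * zero length, or a length running past 'limit', ends the walk.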
*/ + phdr = sctp_get_next_param(m, offset, &parm_buf, sizeof(parm_buf)); + while (phdr) { + ptype = ntohs(phdr->param_type); + plen = ntohs(phdr->param_length); + /* + * printf("ptype => %0x, plen => %d\n", (uint32_t)ptype, + * (int)plen); + */ + if (offset + plen > limit) { + break; + } + if (plen == 0) { + break; + } + if (ptype == SCTP_IPV4_ADDRESS) { + if (stcb->asoc.ipv4_addr_legal) { + struct sctp_ipv4addr_param *p4, p4_buf; + + /* ok get the v4 address and check/add */ + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&p4_buf, + sizeof(p4_buf)); + if (plen != sizeof(struct sctp_ipv4addr_param) || + phdr == NULL) { + return (-5); + } + p4 = (struct sctp_ipv4addr_param *)phdr; + sin.sin_addr.s_addr = p4->addr; + if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { + /* Skip multi-cast addresses */ + goto next_param; + } + if ((sin.sin_addr.s_addr == INADDR_BROADCAST) || + (sin.sin_addr.s_addr == INADDR_ANY)) { + goto next_param; + } + sa = (struct sockaddr *)&sin; + inp = stcb->sctp_ep; + atomic_add_int(&stcb->asoc.refcnt, 1); + stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, + local_sa, stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + + if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || + inp == NULL) { + /* we must add the source address */ + /* + * no scope set since we have a tcb + * already + */ + + /* + * we must validate the state again + * here + */ + add_it_now: + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-7); + } + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { + return (-8); + } + } else if (stcb_tmp == stcb) { + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-10); + } + if (net != NULL) { + /* clear flag */ + net->dest_state &= + ~SCTP_ADDR_NOT_IN_ASSOC; + } + } else { + /* + * strange, address is in another + * assoc? straighten out locks. + */ + if (stcb_tmp) { + if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + /* + * in setup state we + * abort this guy + */ + sctp_abort_an_association(stcb_tmp->sctp_ep, + stcb_tmp, 1, NULL, 0); + goto add_it_now; + } + SCTP_TCB_UNLOCK(stcb_tmp); + } + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-12); + } + return (-13); + } + } + } else if (ptype == SCTP_IPV6_ADDRESS) { + if (stcb->asoc.ipv6_addr_legal) { + /* ok get the v6 address and check/add */ + struct sctp_ipv6addr_param *p6, p6_buf; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&p6_buf, + sizeof(p6_buf)); + if (plen != sizeof(struct sctp_ipv6addr_param) || + phdr == NULL) { + return (-14); + } + p6 = (struct sctp_ipv6addr_param *)phdr; + memcpy((caddr_t)&sin6.sin6_addr, p6->addr, + sizeof(p6->addr)); + if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { + /* Skip multi-cast addresses */ + goto next_param; + } + if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { + /* + * Link local make no sense without + * scope + */ + goto next_param; + } + sa = (struct sockaddr *)&sin6; + inp = stcb->sctp_ep; + atomic_add_int(&stcb->asoc.refcnt, 1); + stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, + local_sa, stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + if (stcb_tmp == NULL && + (inp == stcb->sctp_ep || inp == NULL)) { + /* + * we must validate the state again + * here + */ + add_it_now6: + if (stcb->asoc.state == 0) { + /* the assoc was freed? 
*/ + return (-16); + } + /* + * we must add the address, no scope + * set + */ + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { + return (-17); + } + } else if (stcb_tmp == stcb) { + /* + * we must validate the state again + * here + */ + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-19); + } + if (net != NULL) { + /* clear flag */ + net->dest_state &= + ~SCTP_ADDR_NOT_IN_ASSOC; + } + } else { + /* + * strange, address is in another + * assoc? straighten out locks. + */ + if (stcb_tmp) + if (SCTP_GET_STATE(&stcb_tmp->asoc) & SCTP_STATE_COOKIE_WAIT) { + /* + * in setup state we + * abort this guy + */ + sctp_abort_an_association(stcb_tmp->sctp_ep, + stcb_tmp, 1, NULL, 0); + goto add_it_now6; + } + SCTP_TCB_UNLOCK(stcb_tmp); + + if (stcb->asoc.state == 0) { + /* the assoc was freed? */ + return (-21); + } + return (-22); + } + } + } else if (ptype == SCTP_ECN_CAPABLE) { + stcb->asoc.ecn_allowed = 1; + } else if (ptype == SCTP_ULP_ADAPTATION) { + if (stcb->asoc.state != SCTP_STATE_OPEN) { + struct sctp_adaptation_layer_indication ai, + *aip; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&ai, sizeof(ai)); + aip = (struct sctp_adaptation_layer_indication *)phdr; + if (aip) { + stcb->asoc.peers_adaptation = ntohl(aip->indication); + stcb->asoc.adaptation_needed = 1; + } + } + } else if (ptype == SCTP_SET_PRIM_ADDR) { + struct sctp_asconf_addr_param lstore, *fee; + struct sctp_asconf_addrv4_param *fii; + int lptype; + struct sockaddr *lsa = NULL; + + stcb->asoc.peer_supports_asconf = 1; + if (plen > sizeof(lstore)) { + return (-23); + } + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&lstore, + min(plen, sizeof(lstore))); + if (phdr == NULL) { + return (-24); + } + fee = (struct sctp_asconf_addr_param *)phdr; + lptype = ntohs(fee->addrp.ph.param_type); + if (lptype == SCTP_IPV4_ADDRESS) { + if (plen != + sizeof(struct sctp_asconf_addrv4_param)) { + SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n", + (int)sizeof(struct sctp_asconf_addrv4_param), + plen); + } else { + fii = (struct sctp_asconf_addrv4_param *)fee; + sin.sin_addr.s_addr = fii->addrp.addr; + lsa = (struct sockaddr *)&sin; + } + } else if (lptype == SCTP_IPV6_ADDRESS) { + if (plen != + sizeof(struct sctp_asconf_addr_param)) { + SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n", + (int)sizeof(struct sctp_asconf_addr_param), + plen); + } else { + memcpy(sin6.sin6_addr.s6_addr, + fee->addrp.addr, + sizeof(fee->addrp.addr)); + lsa = (struct sockaddr *)&sin6; + } + } + if (lsa) { + (void)sctp_set_primary_addr(stcb, sa, NULL); + } + } else if (ptype == SCTP_HAS_NAT_SUPPORT) { + stcb->asoc.peer_supports_nat = 1; + } else if (ptype == SCTP_PRSCTP_SUPPORTED) { + /* Peer supports pr-sctp */ + stcb->asoc.peer_supports_prsctp = 1; + } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { + /* A supported extension chunk */ + struct sctp_supported_chunk_types_param *pr_supported; + uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; + int num_ent, i; + + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)&local_store, min(sizeof(local_store), plen)); + if (phdr == NULL) { + return (-25); + } + stcb->asoc.peer_supports_asconf = 0; + stcb->asoc.peer_supports_prsctp = 0; + stcb->asoc.peer_supports_pktdrop = 0; + stcb->asoc.peer_supports_strreset = 0; + stcb->asoc.peer_supports_nr_sack = 0; + stcb->asoc.peer_supports_auth = 0; + pr_supported = (struct sctp_supported_chunk_types_param *)phdr; + num_ent = plen - 
sizeof(struct sctp_paramhdr); + for (i = 0; i < num_ent; i++) { + switch (pr_supported->chunk_types[i]) { + case SCTP_ASCONF: + case SCTP_ASCONF_ACK: + stcb->asoc.peer_supports_asconf = 1; + break; + case SCTP_FORWARD_CUM_TSN: + stcb->asoc.peer_supports_prsctp = 1; + break; + case SCTP_PACKET_DROPPED: + stcb->asoc.peer_supports_pktdrop = 1; + break; + case SCTP_NR_SELECTIVE_ACK: + stcb->asoc.peer_supports_nr_sack = 1; + break; + case SCTP_STREAM_RESET: + stcb->asoc.peer_supports_strreset = 1; + break; + case SCTP_AUTHENTICATION: + stcb->asoc.peer_supports_auth = 1; + break; + default: + /* one I have not learned yet */ + break; + + } + } + } else if (ptype == SCTP_ECN_NONCE_SUPPORTED) { + /* Peer supports ECN-nonce */ + stcb->asoc.peer_supports_ecn_nonce = 1; + stcb->asoc.ecn_nonce_allowed = 1; + } else if (ptype == SCTP_RANDOM) { + if (plen > sizeof(random_store)) + break; + if (got_random) { + /* already processed a RANDOM */ + goto next_param; + } + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)random_store, + min(sizeof(random_store), plen)); + if (phdr == NULL) + return (-26); + p_random = (struct sctp_auth_random *)phdr; + random_len = plen - sizeof(*p_random); + /* enforce the random length */ + if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) { + SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); + return (-27); + } + got_random = 1; + } else if (ptype == SCTP_HMAC_LIST) { + int num_hmacs; + int i; + + if (plen > sizeof(hmacs_store)) + break; + if (got_hmacs) { + /* already processed a HMAC list */ + goto next_param; + } + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)hmacs_store, + min(plen, sizeof(hmacs_store))); + if (phdr == NULL) + return (-28); + hmacs = (struct sctp_auth_hmac_algo *)phdr; + hmacs_len = plen - sizeof(*hmacs); + num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); + /* validate the hmac list */ + if (sctp_verify_hmac_param(hmacs, num_hmacs)) { + return (-29); + } + if (stcb->asoc.peer_hmacs != NULL) + sctp_free_hmaclist(stcb->asoc.peer_hmacs); + stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs); + if (stcb->asoc.peer_hmacs != NULL) { + for (i = 0; i < num_hmacs; i++) { + (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs, + ntohs(hmacs->hmac_ids[i])); + } + } + got_hmacs = 1; + } else if (ptype == SCTP_CHUNK_LIST) { + int i; + + if (plen > sizeof(chunks_store)) + break; + if (got_chklist) { + /* already processed a Chunks list */ + goto next_param; + } + phdr = sctp_get_next_param(m, offset, + (struct sctp_paramhdr *)chunks_store, + min(plen, sizeof(chunks_store))); + if (phdr == NULL) + return (-30); + chunks = (struct sctp_auth_chunk_list *)phdr; + num_chunks = plen - sizeof(*chunks); + if (stcb->asoc.peer_auth_chunks != NULL) + sctp_clear_chunklist(stcb->asoc.peer_auth_chunks); + else + stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist(); + for (i = 0; i < num_chunks; i++) { + (void)sctp_auth_add_chunk(chunks->chunk_types[i], + stcb->asoc.peer_auth_chunks); + /* record asconf/asconf-ack if listed */ + if (chunks->chunk_types[i] == SCTP_ASCONF) + saw_asconf = 1; + if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) + saw_asconf_ack = 1; + + } + got_chklist = 1; + } else if ((ptype == SCTP_HEARTBEAT_INFO) || + (ptype == SCTP_STATE_COOKIE) || + (ptype == SCTP_UNRECOG_PARAM) || + (ptype == SCTP_COOKIE_PRESERVE) || + (ptype == SCTP_SUPPORTED_ADDRTYPE) || + (ptype == SCTP_ADD_IP_ADDRESS) || + (ptype == SCTP_DEL_IP_ADDRESS) || + (ptype == SCTP_ERROR_CAUSE_IND) || + (ptype == SCTP_SUCCESS_REPORT)) { + /* don't care */ ; + } 
else { + if ((ptype & 0x8000) == 0x0000) { + /* + * must stop processing the rest of the + * param's. Any report bits were handled + * with the call to + * sctp_arethere_unrecognized_parameters() + * when the INIT or INIT-ACK was first seen. + */ + break; + } + } + +next_param: + offset += SCTP_SIZE32(plen); + if (offset >= limit) { + break; + } + phdr = sctp_get_next_param(m, offset, &parm_buf, + sizeof(parm_buf)); + } + /* Now check to see if we need to purge any addresses */ + for (net = TAILQ_FIRST(&stcb->asoc.nets); net != NULL; net = net_tmp) { + net_tmp = TAILQ_NEXT(net, sctp_next); + if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) == + SCTP_ADDR_NOT_IN_ASSOC) { + /* This address has been removed from the asoc */ + /* remove and free it */ + stcb->asoc.numnets--; + TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); + sctp_free_remote_addr(net); + if (net == stcb->asoc.primary_destination) { + stcb->asoc.primary_destination = NULL; + sctp_select_primary_destination(stcb); + } + } + } + /* validate authentication required parameters */ + if (got_random && got_hmacs) { + stcb->asoc.peer_supports_auth = 1; + } else { + stcb->asoc.peer_supports_auth = 0; + } + if (!stcb->asoc.peer_supports_auth && got_chklist) { + /* peer does not support auth but sent a chunks list? */ + return (-31); + } + if (!SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) && stcb->asoc.peer_supports_asconf && + !stcb->asoc.peer_supports_auth) { + /* peer supports asconf but not auth? */ + return (-32); + } else if ((stcb->asoc.peer_supports_asconf) && (stcb->asoc.peer_supports_auth) && + ((saw_asconf == 0) || (saw_asconf_ack == 0))) { + return (-33); + } + /* concatenate the full random key */ + keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; + if (chunks != NULL) { + keylen += sizeof(*chunks) + num_chunks; + } + new_key = sctp_alloc_key(keylen); + if (new_key != NULL) { + /* copy in the RANDOM */ + if (p_random != NULL) { + keylen = sizeof(*p_random) + random_len; + bcopy(p_random, new_key->key, keylen); + } + /* append in the AUTH chunks */ + if (chunks != NULL) { + bcopy(chunks, new_key->key + keylen, + sizeof(*chunks) + num_chunks); + keylen += sizeof(*chunks) + num_chunks; + } + /* append in the HMACs */ + if (hmacs != NULL) { + bcopy(hmacs, new_key->key + keylen, + sizeof(*hmacs) + hmacs_len); + } + } else { + /* failed to get memory for the key */ + return (-34); + } + if (stcb->asoc.authinfo.peer_random != NULL) + sctp_free_key(stcb->asoc.authinfo.peer_random); + stcb->asoc.authinfo.peer_random = new_key; + sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); + sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); + + return (0); +} + +int +sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa, + struct sctp_nets *net) +{ + /* make sure the requested primary address exists in the assoc */ + if (net == NULL && sa) + net = sctp_findnet(stcb, sa); + + if (net == NULL) { + /* didn't find the requested primary address! */ + return (-1); + } else { + /* set the primary address */ + if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { + /* Must be confirmed, so queue to set */ + net->dest_state |= SCTP_ADDR_REQ_PRIMARY; + return (0); + } + stcb->asoc.primary_destination = net; + net->dest_state &= ~SCTP_ADDR_WAS_PRIMARY; + net = TAILQ_FIRST(&stcb->asoc.nets); + if (net != stcb->asoc.primary_destination) { + /* + * first one on the list is NOT the primary + * sctp_cmpaddr() is much more efficient if the + * primary is the first on the list, make it so. 
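+ * (A simple move-to-front: the new primary is unlinked and + * re-inserted at the head of asoc.nets.)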
+ */ + TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); + TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); + } + return (0); + } +} + +int +sctp_is_vtag_good(struct sctp_inpcb *inp, uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now, int save_in_twait) +{ + /* + * This function serves two purposes. It will see if a TAG can be + * re-used and return 1 for yes it is ok and 0 for don't use that + * tag. A secondary function it will do is purge out old tags that + * can be removed. + */ + struct sctpvtaghead *chain; + struct sctp_tagblock *twait_block; + struct sctpasochead *head; + struct sctp_tcb *stcb; + int i; + + SCTP_INP_INFO_RLOCK(); + head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, + SCTP_BASE_INFO(hashasocmark))]; + if (head == NULL) { + /* invalid vtag */ + goto skip_vtag_check; + } + LIST_FOREACH(stcb, head, sctp_asocs) { + /* + * We choose not to lock anything here. TCB's can't be + * removed since we have the read lock, so they can't be + * freed on us, same thing for the INP. I may be wrong with + * this assumption, but we will go with it for now :-) + */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + continue; + } + if (stcb->asoc.my_vtag == tag) { + /* candidate */ + if (stcb->rport != rport) { + continue; + } + if (stcb->sctp_ep->sctp_lport != lport) { + continue; + } + /* Its a used tag set */ + SCTP_INP_INFO_RUNLOCK(); + return (0); + } + } +skip_vtag_check: + + chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; + /* Now what about timed wait ? */ + if (!LIST_EMPTY(chain)) { + /* + * Block(s) are present, lets see if we have this tag in the + * list + */ + LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { + for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { + if (twait_block->vtag_block[i].v_tag == 0) { + /* not used */ + continue; + } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire < + now->tv_sec) { + /* Audit expires this guy */ + twait_block->vtag_block[i].tv_sec_at_expire = 0; + twait_block->vtag_block[i].v_tag = 0; + twait_block->vtag_block[i].lport = 0; + twait_block->vtag_block[i].rport = 0; + } else if ((twait_block->vtag_block[i].v_tag == tag) && + (twait_block->vtag_block[i].lport == lport) && + (twait_block->vtag_block[i].rport == rport)) { + /* Bad tag, sorry :< */ + SCTP_INP_INFO_RUNLOCK(); + return (0); + } + } + } + } + SCTP_INP_INFO_RUNLOCK(); + return (1); +} + + +static sctp_assoc_t reneged_asoc_ids[256]; +static uint8_t reneged_at = 0; + + +static void +sctp_drain_mbufs(struct sctp_inpcb *inp, struct sctp_tcb *stcb) +{ + /* + * We must hunt this association for MBUF's past the cumack (i.e. + * out of order data that we can renege on). + */ + struct sctp_association *asoc; + struct sctp_tmit_chunk *chk, *nchk; + uint32_t cumulative_tsn_p1; + struct sctp_queued_to_read *ctl, *nctl; + int cnt, strmat; + uint32_t gap, i; + int fnd = 0; + + /* We look for anything larger than the cum-ack + 1 */ + + asoc = &stcb->asoc; + if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) { + /* none we can reneg on. 
*/ + return; + } + SCTP_STAT_INCR(sctps_protocol_drains_done); + cumulative_tsn_p1 = asoc->cumulative_tsn + 1; + cnt = 0; + /* First look in the re-assembly queue */ + chk = TAILQ_FIRST(&asoc->reasmqueue); + while (chk) { + /* Get the next one */ + nchk = TAILQ_NEXT(chk, sctp_next); + if (compare_with_wrap(chk->rec.data.TSN_seq, + cumulative_tsn_p1, MAX_TSN)) { + /* Yep it is above cum-ack */ + cnt++; + SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.TSN_seq, asoc->mapping_array_base_tsn); + asoc->size_on_reasm_queue = sctp_sbspace_sub(asoc->size_on_reasm_queue, chk->send_size); + sctp_ucount_decr(asoc->cnt_on_reasm_queue); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + TAILQ_REMOVE(&asoc->reasmqueue, chk, sctp_next); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + sctp_free_a_chunk(stcb, chk); + } + chk = nchk; + } + /* Ok that was fun, now we will drain all the inbound streams? */ + for (strmat = 0; strmat < asoc->streamincnt; strmat++) { + ctl = TAILQ_FIRST(&asoc->strmin[strmat].inqueue); + while (ctl) { + nctl = TAILQ_NEXT(ctl, next); + if (compare_with_wrap(ctl->sinfo_tsn, + cumulative_tsn_p1, MAX_TSN)) { + /* Yep it is above cum-ack */ + cnt++; + SCTP_CALC_TSN_TO_GAP(gap, ctl->sinfo_tsn, asoc->mapping_array_base_tsn); + asoc->size_on_all_streams = sctp_sbspace_sub(asoc->size_on_all_streams, ctl->length); + sctp_ucount_decr(asoc->cnt_on_all_streams); + SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); + TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, ctl, next); + if (ctl->data) { + sctp_m_freem(ctl->data); + ctl->data = NULL; + } + sctp_free_remote_addr(ctl->whoFrom); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), ctl); + SCTP_DECR_READQ_COUNT(); + } + ctl = nctl; + } + } + if (cnt) { + /* We must back down to see what the new highest is */ + for (i = asoc->highest_tsn_inside_map; + (compare_with_wrap(i, asoc->mapping_array_base_tsn, MAX_TSN) || (i == asoc->mapping_array_base_tsn)); + i--) { + SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); + if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { + asoc->highest_tsn_inside_map = i; + fnd = 1; + break; + } + } + if (!fnd) { + asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; + } + /* + * Question, should we go through the delivery queue? The + * only reason things are on here is the app not reading OR + * a p-d-api up. An attacker COULD send enough in to + * initiate the PD-API and then send a bunch of stuff to + * other streams... these would wind up on the delivery + * queue.. and then we would not get to them. But in order + * to do this I then have to back-track and un-deliver + * sequence numbers in streams.. el-yucko. I think for now + * we will NOT look at the delivery queue and leave it to be + * something to consider later. An alternative would be to + * abort the P-D-API with a notification and then deliver + * the data.... Or another method might be to keep track of + * how many times the situation occurs and if we see a + * possible attack underway just abort the association. + */ +#ifdef SCTP_DEBUG + SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt); +#endif + /* + * Now do we need to find a new + * asoc->highest_tsn_inside_map? 
+ */ + asoc->last_revoke_count = cnt; + (void)SCTP_OS_TIMER_STOP(&stcb->asoc.dack_timer.timer); + /* sa_ignore NO_NULL_CHK */ + sctp_send_sack(stcb); + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED); + reneged_asoc_ids[reneged_at] = sctp_get_associd(stcb); + reneged_at++; + } + /* + * Another issue, in un-setting the TSN's in the mapping array we + * DID NOT adjust the highest_tsn marker. This will cause one of + * two things to occur. It may cause us to do extra work in checking + * for our mapping array movement. More importantly it may cause us + * to SACK every datagram. This may not be a bad thing though since + * we will recover once we get our cum-ack above and all this stuff + * we dumped recovered. + */ +} + +void +sctp_drain() +{ + /* + * We must walk the PCB lists for ALL associations here. The system + * is LOW on MBUF's and needs help. This is where reneging will + * occur. We really hope this does NOT happen! + */ + VNET_ITERATOR_DECL(vnet_iter); + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + + SCTP_STAT_INCR(sctps_protocol_drain_calls); + if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { +#ifdef VIMAGE + continue; +#else + return; +#endif + } + SCTP_INP_INFO_RLOCK(); + LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { + /* For each endpoint */ + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + /* For each association */ + SCTP_TCB_LOCK(stcb); + sctp_drain_mbufs(inp, stcb); + SCTP_TCB_UNLOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + } + SCTP_INP_INFO_RUNLOCK(); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * start a new iterator + * iterates through all endpoints and associations based on the pcb_state + * flags and asoc_state. "af" (mandatory) is executed for all matching + * assocs and "ef" (optional) is executed when the iterator completes. + * "inpf" (optional) is executed for each new endpoint as it is being + * iterated through. inpe (optional) is called when the inp completes + * its way through all the stcbs. 
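+ * + * A hypothetical caller (my_asoc_func, my_done_func and my_arg are + * illustrative names only, not part of this API) could run a work + * function over every association of a single endpoint roughly like + * so: + * + * (void)sctp_initiate_iterator(NULL, my_asoc_func, NULL, 0, 0, 0, + * my_arg, 0, my_done_func, inp, 0); + * + * Passing NULL instead of inp would instead walk the associations + * of every endpoint in the system.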
+ */ +int +sctp_initiate_iterator(inp_func inpf, + asoc_func af, + inp_func inpe, + uint32_t pcb_state, + uint32_t pcb_features, + uint32_t asoc_state, + void *argp, + uint32_t argi, + end_func ef, + struct sctp_inpcb *s_inp, + uint8_t chunk_output_off) +{ + struct sctp_iterator *it = NULL; + + if (af == NULL) { + return (-1); + } + SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator), + SCTP_M_ITER); + if (it == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); + return (ENOMEM); + } + memset(it, 0, sizeof(*it)); + it->function_assoc = af; + it->function_inp = inpf; + if (inpf) + it->done_current_ep = 0; + else + it->done_current_ep = 1; + it->function_atend = ef; + it->pointer = argp; + it->val = argi; + it->pcb_flags = pcb_state; + it->pcb_features = pcb_features; + it->asoc_state = asoc_state; + it->function_inp_end = inpe; + it->no_chunk_output = chunk_output_off; + it->vn = curvnet; + if (s_inp) { + /* Assume lock is held here */ + it->inp = s_inp; + SCTP_INP_INCR_REF(it->inp); + it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP; + } else { + SCTP_INP_INFO_RLOCK(); + it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead)); + if (it->inp) { + SCTP_INP_INCR_REF(it->inp); + } + SCTP_INP_INFO_RUNLOCK(); + it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP; + + } + SCTP_IPI_ITERATOR_WQ_LOCK(); + + TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); + if (sctp_it_ctl.iterator_running == 0) { + sctp_wakeup_iterator(); + } + SCTP_IPI_ITERATOR_WQ_UNLOCK(); + /* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */ + return (0); +} diff --git a/freebsd/sys/netinet/sctp_pcb.h b/freebsd/sys/netinet/sctp_pcb.h new file mode 100644 index 00000000..a4f4d30c --- /dev/null +++ b/freebsd/sys/netinet/sctp_pcb.h @@ -0,0 +1,632 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_pcb.h,v 1.21 2005/07/16 01:18:47 suz Exp $ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_pcb_h__ +#define __sctp_pcb_h__ + +#include <netinet/sctp_os.h> +#include <netinet/sctp.h> +#include <netinet/sctp_constants.h> +#include <netinet/sctp_sysctl.h> + +LIST_HEAD(sctppcbhead, sctp_inpcb); +LIST_HEAD(sctpasochead, sctp_tcb); +LIST_HEAD(sctpladdr, sctp_laddr); +LIST_HEAD(sctpvtaghead, sctp_tagblock); +LIST_HEAD(sctp_vrflist, sctp_vrf); +LIST_HEAD(sctp_ifnlist, sctp_ifn); +LIST_HEAD(sctp_ifalist, sctp_ifa); +TAILQ_HEAD(sctp_readhead, sctp_queued_to_read); +TAILQ_HEAD(sctp_streamhead, sctp_stream_queue_pending); + +#include <netinet/sctp_structs.h> +#include <netinet/sctp_auth.h> + +#define SCTP_PCBHASH_ALLADDR(port, mask) (port & mask) +#define SCTP_PCBHASH_ASOC(tag, mask) (tag & mask) + +struct sctp_vrf { + LIST_ENTRY(sctp_vrf) next_vrf; + struct sctp_ifalist *vrf_addr_hash; + struct sctp_ifnlist ifnlist; + uint32_t vrf_id; + uint32_t tbl_id_v4; /* default v4 table id */ + uint32_t tbl_id_v6; /* default v6 table id */ + uint32_t total_ifa_count; + u_long vrf_addr_hashmark; + uint32_t refcount; +}; + +struct sctp_ifn { + struct sctp_ifalist ifalist; + struct sctp_vrf *vrf; + LIST_ENTRY(sctp_ifn) next_ifn; + LIST_ENTRY(sctp_ifn) next_bucket; + void *ifn_p; /* never access without appropriate lock */ + uint32_t ifn_mtu; + uint32_t ifn_type; + uint32_t ifn_index; /* shorthand way to look at ifn for reference */ + uint32_t refcount; /* number of references held; should be >= + * ifa_count */ + uint32_t ifa_count; /* IFA's we hold (in our list - ifalist) */ + uint32_t num_v6; /* number of v6 addresses */ + uint32_t num_v4; /* number of v4 addresses */ + uint32_t registered_af; /* registered address family for i/f events */ + char ifn_name[SCTP_IFNAMSIZ]; +}; + +/* SCTP local IFA flags */ +#define SCTP_ADDR_VALID 0x00000001 /* it's up and active */ +#define SCTP_BEING_DELETED 0x00000002 /* being deleted, when + * refcount = 0. Note that it + * is pulled from the ifn list + * and ifa_p is nulled right + * away but it cannot be freed + * until the last *net + * pointing to it is deleted. */ +#define SCTP_ADDR_DEFER_USE 0x00000004 /* Hold off using this one */ +#define SCTP_ADDR_IFA_UNUSEABLE 0x00000008 + +struct sctp_ifa { + LIST_ENTRY(sctp_ifa) next_ifa; + LIST_ENTRY(sctp_ifa) next_bucket; + struct sctp_ifn *ifn_p; /* back pointer to parent ifn */ + void *ifa; /* pointer to ifa, needed for flag updates, for + * which we MUST lock the appropriate locks. + * This is for V6. */ + union sctp_sockstore address; + uint32_t refcount; /* number of folks referring to this */ + uint32_t flags; + uint32_t localifa_flags; + uint32_t vrf_id; /* vrf_id of this addr (for deleting) */ + uint8_t src_is_loop; + uint8_t src_is_priv; + uint8_t src_is_glob; + uint8_t resv; +}; + +struct sctp_laddr { + LIST_ENTRY(sctp_laddr) sctp_nxt_addr; /* next in list */ + struct sctp_ifa *ifa; + uint32_t action; /* Used during asconf and adding; if non-zero, + * src-addr selection will not consider this + * address.
*/ + struct timeval start_time; /* time when this address was created */ +}; + +struct sctp_block_entry { + int error; +}; + +struct sctp_timewait { + uint32_t tv_sec_at_expire; /* the seconds from boot to expire */ + uint32_t v_tag; /* the vtag that can not be reused */ + uint16_t lport; /* the local port used in vtag */ + uint16_t rport; /* the remote port used in vtag */ +}; + +struct sctp_tagblock { + LIST_ENTRY(sctp_tagblock) sctp_nxt_tagblock; + struct sctp_timewait vtag_block[SCTP_NUMBER_IN_VTAG_BLOCK]; +}; + + +struct sctp_epinfo { + struct socket *udp_tun_socket; + struct sctpasochead *sctp_asochash; + u_long hashasocmark; + + struct sctppcbhead *sctp_ephash; + u_long hashmark; + + /*- + * The TCP model represents a substantial overhead in that we get an + * additional hash table to keep explicit connections in. The + * listening TCP endpoint will exist in the usual ephash above and + * accept only INIT's. It will be incapable of sending off an INIT. + * When a dg arrives we must look in the normal ephash. If we find a + * TCP endpoint that will tell us to go to the specific endpoint + * hash and re-hash to find the right assoc/socket. If we find a UDP + * model socket we then must complete the lookup. If this fails, + * i.e. no association can be found then we must continue to see if + * a sctp_peeloff()'d socket is in the tcpephash (a spun off socket + * acts like a TCP model connected socket). + */ + struct sctppcbhead *sctp_tcpephash; + u_long hashtcpmark; + uint32_t hashtblsize; + + struct sctp_vrflist *sctp_vrfhash; + u_long hashvrfmark; + + struct sctp_ifnlist *vrf_ifn_hash; + u_long vrf_ifn_hashmark; + + struct sctppcbhead listhead; + struct sctpladdr addr_wq; + + /* ep zone info */ + sctp_zone_t ipi_zone_ep; + sctp_zone_t ipi_zone_asoc; + sctp_zone_t ipi_zone_laddr; + sctp_zone_t ipi_zone_net; + sctp_zone_t ipi_zone_chunk; + sctp_zone_t ipi_zone_readq; + sctp_zone_t ipi_zone_strmoq; + sctp_zone_t ipi_zone_asconf; + sctp_zone_t ipi_zone_asconf_ack; + + struct rwlock ipi_ep_mtx; + struct mtx ipi_iterator_wq_mtx; + struct rwlock ipi_addr_mtx; + struct mtx ipi_pktlog_mtx; + struct mtx wq_addr_mtx; + uint32_t ipi_count_ep; + + /* assoc/tcb zone info */ + uint32_t ipi_count_asoc; + + /* local addrlist zone info */ + uint32_t ipi_count_laddr; + + /* remote addrlist zone info */ + uint32_t ipi_count_raddr; + + /* chunk structure list for output */ + uint32_t ipi_count_chunk; + + /* socket queue zone info */ + uint32_t ipi_count_readq; + + /* socket queue zone info */ + uint32_t ipi_count_strmoq; + + /* Number of vrfs */ + uint32_t ipi_count_vrfs; + + /* Number of ifns */ + uint32_t ipi_count_ifns; + + /* Number of ifas */ + uint32_t ipi_count_ifas; + + /* system wide number of free chunks hanging around */ + uint32_t ipi_free_chunks; + uint32_t ipi_free_strmoq; + + struct sctpvtaghead vtag_timewait[SCTP_STACK_VTAG_HASH_SIZE]; + + /* address work queue handling */ + struct sctp_timer addr_wq_timer; + +}; + + +struct sctp_base_info { + /* + * All static structures that anchor the system must be here. 
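+ * One instance of this structure anchors the stack; note that the + * statistics member below becomes a per-CPU array when + * SCTP_USE_PERCPU_STAT is configured on SMP FreeBSD.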
+ */ + struct sctp_epinfo sctppcbinfo; +#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) + struct sctpstat sctpstat[MAXCPU]; +#else + struct sctpstat sctpstat; +#endif + struct sctp_sysctl sctpsysctl; + uint8_t first_time; + char sctp_pcb_initialized; +#if defined(SCTP_PACKET_LOGGING) + int packet_log_writers; + int packet_log_end; + uint8_t packet_log_buffer[SCTP_PACKET_LOG_SIZE]; +#endif +}; + +/*- + * Here we have all the relevant information for each SCTP entity created. We + * will need to modify this as appropriate. We also need to figure out how to + * access /dev/random. + */ +struct sctp_pcb { + unsigned int time_of_secret_change; /* number of seconds from + * timeval.tv_sec */ + uint32_t secret_key[SCTP_HOW_MANY_SECRETS][SCTP_NUMBER_OF_SECRETS]; + unsigned int size_of_a_cookie; + + unsigned int sctp_timeoutticks[SCTP_NUM_TMRS]; + unsigned int sctp_minrto; + unsigned int sctp_maxrto; + unsigned int initial_rto; + int initial_init_rto_max; + + unsigned int sctp_sack_freq; + uint32_t sctp_sws_sender; + uint32_t sctp_sws_receiver; + + uint32_t sctp_default_cc_module; + /* authentication related fields */ + struct sctp_keyhead shared_keys; + sctp_auth_chklist_t *local_auth_chunks; + sctp_hmaclist_t *local_hmacs; + uint16_t default_keyid; + + /* various thresholds */ + /* Max times I will init at a guy */ + uint16_t max_init_times; + + /* Max times I will send before we consider someone dead */ + uint16_t max_send_times; + + uint16_t def_net_failure; + + /* number of streams to pre-open on an association */ + uint16_t pre_open_stream_count; + uint16_t max_open_streams_intome; + + /* random number generator */ + uint32_t random_counter; + uint8_t random_numbers[SCTP_SIGNATURE_ALOC_SIZE]; + uint8_t random_store[SCTP_SIGNATURE_ALOC_SIZE]; + + /* + * This timer is kept running per endpoint. When it fires it will + * change the secret key. The default is once an hour. + */ + struct sctp_timer signature_change; + + /* Zero copy full buffer timer */ + struct sctp_timer zero_copy_timer; + /* Zero copy app to transport (sendq) read repulse timer */ + struct sctp_timer zero_copy_sendq_timer; + uint32_t def_cookie_life; + /* defaults to 0 */ + int auto_close_time; + uint32_t initial_sequence_debug; + uint32_t adaptation_layer_indicator; + uint32_t store_at; + uint8_t max_burst; + char current_secret_number; + char last_secret_number; +}; + +#ifndef SCTP_ALIGNMENT +#define SCTP_ALIGNMENT 32 +#endif + +#ifndef SCTP_ALIGNM1 +#define SCTP_ALIGNM1 (SCTP_ALIGNMENT-1) +#endif + +#define sctp_lport ip_inp.inp.inp_lport + +struct sctp_pcbtsn_rlog { + uint32_t vtag; + uint16_t strm; + uint16_t seq; + uint16_t sz; + uint16_t flgs; +}; + +#define SCTP_READ_LOG_SIZE 135 /* we choose the number to make a pcb a page */ + + +struct sctp_inpcb { + /*- + * put an inpcb in front of it all, kind of a waste but we need to + * for compatibility with all the other stuff.
+ */ + union { + struct inpcb inp; + char align[(sizeof(struct in6pcb) + SCTP_ALIGNM1) & + ~SCTP_ALIGNM1]; + } ip_inp; + + + /* Socket buffer lock protects read_queue and of course sb_cc */ + struct sctp_readhead read_queue; + + LIST_ENTRY(sctp_inpcb) sctp_list; /* lists all endpoints */ + /* hash of all endpoints for model */ + LIST_ENTRY(sctp_inpcb) sctp_hash; + /* count of local addresses bound, 0 if bound all */ + int laddr_count; + + /* list of addrs in use by the EP, NULL if bound-all */ + struct sctpladdr sctp_addr_list; + /* + * used for source address selection rotation when we are subset + * bound + */ + struct sctp_laddr *next_addr_touse; + + /* back pointer to our socket */ + struct socket *sctp_socket; + uint32_t sctp_flags; /* INP state flag set */ + uint32_t sctp_features; /* Feature flags */ + uint32_t sctp_mobility_features; /* Mobility Feature flags */ + struct sctp_pcb sctp_ep;/* SCTP ep data */ + /* head of the hash of all associations */ + struct sctpasochead *sctp_tcbhash; + u_long sctp_hashmark; + /* head of the list of all associations */ + struct sctpasochead sctp_asoc_list; +#ifdef SCTP_TRACK_FREED_ASOCS + struct sctpasochead sctp_asoc_free_list; +#endif + struct sctp_iterator *inp_starting_point_for_iterator; + uint32_t sctp_frag_point; + uint32_t partial_delivery_point; + uint32_t sctp_context; + uint32_t sctp_cmt_on_off; + struct sctp_nonpad_sndrcvinfo def_send; + /*- + * These three are here for the sosend_dgram + * (pkt, pkt_last and control). + * routine. However, I don't think anyone in + * the current FreeBSD kernel calls this. So + * they are candidates with sctp_sendm for + * de-supporting. + */ + struct mbuf *pkt, *pkt_last; + struct mbuf *control; + struct mtx inp_mtx; + struct mtx inp_create_mtx; + struct mtx inp_rdata_mtx; + int32_t refcount; + uint32_t def_vrf_id; + uint32_t total_sends; + uint32_t total_recvs; + uint32_t last_abort_code; + uint32_t total_nospaces; + struct sctpasochead *sctp_asocidhash; + u_long hashasocidmark; + uint32_t sctp_associd_counter; + +#ifdef SCTP_ASOCLOG_OF_TSNS + struct sctp_pcbtsn_rlog readlog[SCTP_READ_LOG_SIZE]; + uint32_t readlog_index; +#endif +}; + +struct sctp_tcb { + struct socket *sctp_socket; /* back pointer to socket */ + struct sctp_inpcb *sctp_ep; /* back pointer to ep */ + LIST_ENTRY(sctp_tcb) sctp_tcbhash; /* next link in hash + * table */ + LIST_ENTRY(sctp_tcb) sctp_tcblist; /* list of all of the + * TCB's */ + LIST_ENTRY(sctp_tcb) sctp_tcbasocidhash; /* next link in asocid + * hash table */ + LIST_ENTRY(sctp_tcb) sctp_asocs; /* vtag hash list */ + struct sctp_block_entry *block_entry; /* pointer locked by socket + * send buffer */ + struct sctp_association asoc; + /*- + * freed_by_sorcv_sincelast is protected by the sockbuf_lock NOT the + * tcb_lock. Its special in this way to help avoid extra mutex calls + * in the reading of data. + */ + uint32_t freed_by_sorcv_sincelast; + uint32_t total_sends; + uint32_t total_recvs; + int freed_from_where; + uint16_t rport; /* remote port in network format */ + uint16_t resv; + struct mtx tcb_mtx; + struct mtx tcb_send_mtx; +}; + + + +#include + + +/* TODO where to put non-_KERNEL things for __Userspace__? */ +#if defined(_KERNEL) || defined(__Userspace__) + +/* Attention Julian, this is the extern that + * goes with the base info. sctp_pcb.c has + * the real definition. 
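+ * (The VNET_DECLARE below is expected to pair with a VNET_DEFINE + * of system_base_info in sctp_pcb.c, giving each virtual network + * stack its own copy.)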
+ */ +VNET_DECLARE(struct sctp_base_info, system_base_info); + +#ifdef INET6 +int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b); + +#endif + +void sctp_fill_pcbinfo(struct sctp_pcbinfo *); + +struct sctp_ifn * + sctp_find_ifn(void *ifn, uint32_t ifn_index); + +struct sctp_vrf *sctp_allocate_vrf(int vrfid); +struct sctp_vrf *sctp_find_vrf(uint32_t vrfid); +void sctp_free_vrf(struct sctp_vrf *vrf); + +/*- + * Change address state; can be used if the + * O/S supports telling transports about + * changes to IFA/IFN's (link layer triggers). + * If an ifn goes down, we will do src-addr-selection + * and NOT use that as a source address. This does + * not stop the routing system from routing out + * that interface, but we won't put it as a source. + */ +void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); +void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index); + +struct sctp_ifa * +sctp_add_addr_to_vrf(uint32_t vrfid, + void *ifn, uint32_t ifn_index, uint32_t ifn_type, + const char *if_name, + void *ifa, struct sockaddr *addr, uint32_t ifa_flags, + int dynamic_add); + +void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu); + +void sctp_free_ifn(struct sctp_ifn *sctp_ifnp); +void sctp_free_ifa(struct sctp_ifa *sctp_ifap); + + +void +sctp_del_addr_from_vrf(uint32_t vrfid, struct sockaddr *addr, + uint32_t ifn_index, const char *if_name); + + + +struct sctp_nets *sctp_findnet(struct sctp_tcb *, struct sockaddr *); + +struct sctp_inpcb *sctp_pcb_findep(struct sockaddr *, int, int, uint32_t); + +int +sctp_inpcb_bind(struct socket *, struct sockaddr *, + struct sctp_ifa *, struct thread *); + +struct sctp_tcb * +sctp_findassociation_addr(struct mbuf *, int, int, + struct sctphdr *, struct sctp_chunkhdr *, struct sctp_inpcb **, + struct sctp_nets **, uint32_t vrf_id); + +struct sctp_tcb * +sctp_findassociation_addr_sa(struct sockaddr *, + struct sockaddr *, struct sctp_inpcb **, struct sctp_nets **, int, uint32_t); + +void +sctp_move_pcb_and_assoc(struct sctp_inpcb *, struct sctp_inpcb *, + struct sctp_tcb *); + +/*- + * For this call ep_addr, the to is the destination endpoint address of the + * peer (relative to outbound). The from field is only used if the TCP model + * is enabled and helps distinguish amongst the subset bound (non-boundall). + * The TCP model MAY change the actual ep field; this is why it is passed.
+ */ +struct sctp_tcb * +sctp_findassociation_ep_addr(struct sctp_inpcb **, + struct sockaddr *, struct sctp_nets **, struct sockaddr *, + struct sctp_tcb *); + +struct sctp_tcb * + sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock); + +struct sctp_tcb * +sctp_findassociation_ep_asocid(struct sctp_inpcb *, + sctp_assoc_t, int); + +struct sctp_tcb * +sctp_findassociation_ep_asconf(struct mbuf *, int, int, + struct sctphdr *, struct sctp_inpcb **, struct sctp_nets **, uint32_t vrf_id); + +int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id); + +int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id); + +void sctp_inpcb_free(struct sctp_inpcb *, int, int); + +struct sctp_tcb * +sctp_aloc_assoc(struct sctp_inpcb *, struct sockaddr *, + int *, uint32_t, uint32_t, struct thread *); + +int sctp_free_assoc(struct sctp_inpcb *, struct sctp_tcb *, int, int); + + +void sctp_delete_from_timewait(uint32_t, uint16_t, uint16_t); + +int sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport); + +void + sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport); + +void sctp_add_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *, uint32_t); + +int sctp_insert_laddr(struct sctpladdr *, struct sctp_ifa *, uint32_t); + +void sctp_remove_laddr(struct sctp_laddr *); + +void sctp_del_local_addr_ep(struct sctp_inpcb *, struct sctp_ifa *); + +int sctp_add_remote_addr(struct sctp_tcb *, struct sockaddr *, int, int); + +void sctp_remove_net(struct sctp_tcb *, struct sctp_nets *); + +int sctp_del_remote_addr(struct sctp_tcb *, struct sockaddr *); + +void sctp_pcb_init(void); + +void sctp_pcb_finish(void); + +void sctp_add_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); +void sctp_del_local_addr_restricted(struct sctp_tcb *, struct sctp_ifa *); + +int +sctp_load_addresses_from_init(struct sctp_tcb *, struct mbuf *, int, int, + int, struct sctphdr *, struct sockaddr *); + +int +sctp_set_primary_addr(struct sctp_tcb *, struct sockaddr *, + struct sctp_nets *); + +int sctp_is_vtag_good(struct sctp_inpcb *, uint32_t, uint16_t lport, uint16_t rport, struct timeval *, int); + +/* void sctp_drain(void); */ + +int sctp_destination_is_reachable(struct sctp_tcb *, struct sockaddr *); + +int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp); + +/*- + * Null in last arg inpcb indicate run on ALL ep's. Specific inp in last arg + * indicates run on ONLY assoc's of the specified endpoint. + */ +int +sctp_initiate_iterator(inp_func inpf, + asoc_func af, + inp_func inpe, + uint32_t, uint32_t, + uint32_t, void *, + uint32_t, + end_func ef, + struct sctp_inpcb *, + uint8_t co_off); + +#ifdef INVARIANTS +void + sctp_validate_no_locks(struct sctp_inpcb *inp); + +#endif + +#endif /* _KERNEL */ +#endif /* !__sctp_pcb_h__ */ diff --git a/freebsd/sys/netinet/sctp_peeloff.c b/freebsd/sys/netinet/sctp_peeloff.c new file mode 100644 index 00000000..7b859bba --- /dev/null +++ b/freebsd/sys/netinet/sctp_peeloff.c @@ -0,0 +1,240 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* $KAME: sctp_peeloff.c,v 1.13 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int +sctp_can_peel_off(struct socket *head, sctp_assoc_t assoc_id) +{ + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + uint32_t state; + + inp = (struct sctp_inpcb *)head->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT); + return (EFAULT); + } + stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOENT); + return (ENOENT); + } + state = SCTP_GET_STATE((&stcb->asoc)); + if ((state == SCTP_STATE_EMPTY) || + (state == SCTP_STATE_INUSE) || + (state == SCTP_STATE_COOKIE_WAIT) || + (state == SCTP_STATE_COOKIE_ECHOED)) { + SCTP_TCB_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); + return (ENOTCONN); + } + SCTP_TCB_UNLOCK(stcb); + /* We are clear to peel this one off */ + return (0); +} + +int +sctp_do_peeloff(struct socket *head, struct socket *so, sctp_assoc_t assoc_id) +{ + struct sctp_inpcb *inp, *n_inp; + struct sctp_tcb *stcb; + uint32_t state; + + inp = (struct sctp_inpcb *)head->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT); + return (EFAULT); + } + stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); + return (ENOTCONN); + } + state = SCTP_GET_STATE((&stcb->asoc)); + if ((state == SCTP_STATE_EMPTY) || + (state == SCTP_STATE_INUSE) || + (state == SCTP_STATE_COOKIE_WAIT) || + (state == SCTP_STATE_COOKIE_ECHOED)) { + SCTP_TCB_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); + return (ENOTCONN); + } + n_inp = (struct sctp_inpcb *)so->so_pcb; + n_inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | + SCTP_PCB_FLAGS_CONNECTED | + SCTP_PCB_FLAGS_IN_TCPPOOL | /* Turn on Blocking IO */ + (SCTP_PCB_COPY_FLAGS & inp->sctp_flags)); + n_inp->sctp_socket = so; + n_inp->sctp_features = inp->sctp_features; + n_inp->sctp_mobility_features = inp->sctp_mobility_features; + n_inp->sctp_frag_point = 
inp->sctp_frag_point; + n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off; + n_inp->partial_delivery_point = inp->partial_delivery_point; + n_inp->sctp_context = inp->sctp_context; + n_inp->inp_starting_point_for_iterator = NULL; + /* copy in the authentication parameters from the original endpoint */ + if (n_inp->sctp_ep.local_hmacs) + sctp_free_hmaclist(n_inp->sctp_ep.local_hmacs); + n_inp->sctp_ep.local_hmacs = + sctp_copy_hmaclist(inp->sctp_ep.local_hmacs); + if (n_inp->sctp_ep.local_auth_chunks) + sctp_free_chunklist(n_inp->sctp_ep.local_auth_chunks); + n_inp->sctp_ep.local_auth_chunks = + sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks); + (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys, + &n_inp->sctp_ep.shared_keys); + /* + * Now we must move it from one hash table to another and get the + * stcb in the right place. + */ + sctp_move_pcb_and_assoc(inp, n_inp, stcb); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + + sctp_pull_off_control_to_new_inp(inp, n_inp, stcb, SBL_WAIT); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + + return (0); +} + + +struct socket * +sctp_get_peeloff(struct socket *head, sctp_assoc_t assoc_id, int *error) +{ + struct socket *newso; + struct sctp_inpcb *inp, *n_inp; + struct sctp_tcb *stcb; + + SCTPDBG(SCTP_DEBUG_PEEL1, "SCTP peel-off called\n"); + inp = (struct sctp_inpcb *)head->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, EFAULT); + *error = EFAULT; + return (NULL); + } + stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PEELOFF, ENOTCONN); + *error = ENOTCONN; + return (NULL); + } + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + newso = sonewconn(head, SS_ISCONNECTED + ); + if (newso == NULL) { + SCTPDBG(SCTP_DEBUG_PEEL1, "sctp_peeloff:sonewconn failed\n"); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_PEELOFF, ENOMEM); + *error = ENOMEM; + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (NULL); + + } + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + n_inp = (struct sctp_inpcb *)newso->so_pcb; + SOCK_LOCK(head); + n_inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | + SCTP_PCB_FLAGS_CONNECTED | + SCTP_PCB_FLAGS_IN_TCPPOOL | /* Turn on Blocking IO */ + (SCTP_PCB_COPY_FLAGS & inp->sctp_flags)); + n_inp->sctp_features = inp->sctp_features; + n_inp->sctp_frag_point = inp->sctp_frag_point; + n_inp->sctp_cmt_on_off = inp->sctp_cmt_on_off; + n_inp->partial_delivery_point = inp->partial_delivery_point; + n_inp->sctp_context = inp->sctp_context; + n_inp->inp_starting_point_for_iterator = NULL; + + /* copy in the authentication parameters from the original endpoint */ + if (n_inp->sctp_ep.local_hmacs) + sctp_free_hmaclist(n_inp->sctp_ep.local_hmacs); + n_inp->sctp_ep.local_hmacs = + sctp_copy_hmaclist(inp->sctp_ep.local_hmacs); + if (n_inp->sctp_ep.local_auth_chunks) + sctp_free_chunklist(n_inp->sctp_ep.local_auth_chunks); + n_inp->sctp_ep.local_auth_chunks = + sctp_copy_chunklist(inp->sctp_ep.local_auth_chunks); + (void)sctp_copy_skeylist(&inp->sctp_ep.shared_keys, + &n_inp->sctp_ep.shared_keys); + + n_inp->sctp_socket = newso; + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { + sctp_feature_off(n_inp, SCTP_PCB_FLAGS_AUTOCLOSE); + n_inp->sctp_ep.auto_close_time = 0; + sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, n_inp, stcb, NULL, + SCTP_FROM_SCTP_PEELOFF + SCTP_LOC_1); + } + /* Turn off any non-blocking semantic. 
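The peeled-off socket is forced into blocking mode on this path no matter how the listening socket was configured; an application that wants non-blocking I/O on the new socket has to re-enable it itself afterwards.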
*/ + SCTP_CLEAR_SO_NBIO(newso); + newso->so_state |= SS_ISCONNECTED; + /* We remove it right away */ + +#ifdef SCTP_LOCK_LOGGING + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) { + sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK); + } +#endif + TAILQ_REMOVE(&head->so_comp, newso, so_list); + head->so_qlen--; + SOCK_UNLOCK(head); + /* + * Now we must move it from one hash table to another and get the + * stcb in the right place. + */ + sctp_move_pcb_and_assoc(inp, n_inp, stcb); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + /* + * And now the final hack. We move data in the pending side i.e. + * head to the new socket buffer. Let the GRUBBING begin :-0 + */ + sctp_pull_off_control_to_new_inp(inp, n_inp, stcb, SBL_WAIT); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return (newso); +} diff --git a/freebsd/sys/netinet/sctp_peeloff.h b/freebsd/sys/netinet/sctp_peeloff.h new file mode 100644 index 00000000..57fd5fef --- /dev/null +++ b/freebsd/sys/netinet/sctp_peeloff.h @@ -0,0 +1,52 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_peeloff.h,v 1.6 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_peeloff_h__ +#define __sctp_peeloff_h__ + + + + +#if defined(_KERNEL) + +int sctp_can_peel_off(struct socket *, sctp_assoc_t); +int sctp_do_peeloff(struct socket *, struct socket *, sctp_assoc_t); +struct socket *sctp_get_peeloff(struct socket *, sctp_assoc_t, int *); + + + +#endif /* _KERNEL */ + +#endif diff --git a/freebsd/sys/netinet/sctp_structs.h b/freebsd/sys/netinet/sctp_structs.h new file mode 100644 index 00000000..2050c581 --- /dev/null +++ b/freebsd/sys/netinet/sctp_structs.h @@ -0,0 +1,1094 @@ +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_structs.h,v 1.13 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_structs_h__ +#define __sctp_structs_h__ + +#include +#include +#include + +struct sctp_timer { + sctp_os_timer_t timer; + + int type; + /* + * Depending on the timer type these will be setup and cast with the + * appropriate entity. + */ + void *ep; + void *tcb; + void *net; + void *vnet; + + /* for sanity checking */ + void *self; + uint32_t ticks; + uint32_t stopped_from; +}; + + +struct sctp_foo_stuff { + struct sctp_inpcb *inp; + uint32_t lineno; + uint32_t ticks; + int updown; +}; + + +/* + * This is the information we track on each interface that we know about from + * the distant end. + */ +TAILQ_HEAD(sctpnetlisthead, sctp_nets); + +struct sctp_stream_reset_list { + TAILQ_ENTRY(sctp_stream_reset_list) next_resp; + uint32_t tsn; + int number_entries; + struct sctp_stream_reset_out_request req; +}; + +TAILQ_HEAD(sctp_resethead, sctp_stream_reset_list); + +/* + * Users of the iterator need to malloc a iterator with a call to + * sctp_initiate_iterator(inp_func, assoc_func, inp_func, pcb_flags, pcb_features, + * asoc_state, void-ptr-arg, uint32-arg, end_func, inp); + * + * Use the following two defines if you don't care what pcb flags are on the EP + * and/or you don't care what state the association is in. + * + * Note that if you specify an INP as the last argument then ONLY each + * association of that single INP will be executed upon. Note that the pcb + * flags STILL apply so if the inp you specify has different pcb_flags then + * what you put in pcb_flags nothing will happen. use SCTP_PCB_ANY_FLAGS to + * assure the inp you specify gets treated. 
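 *
 * As an illustrative sketch only (the callback names and the two argument
 * placeholders below are hypothetical, not part of this header): to run
 * my_asoc_cb once per association on every endpoint and my_done_cb once
 * when the walk finishes, with no per-endpoint functions and chunk output
 * left enabled, a caller could issue
 *
 *	(void)sctp_initiate_iterator(NULL, my_asoc_cb, NULL,
 *	    SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES,
 *	    SCTP_ASOC_ANY_STATE, arg_ptr, arg_val,
 *	    my_done_cb, NULL, 0);
 *
 * where my_asoc_cb matches the asoc_func typedef below, my_done_cb matches
 * end_func, and the NULL sctp_inpcb pointer in the last pointer argument
 * selects ALL endpoints as described above.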
+ */ +#define SCTP_PCB_ANY_FLAGS 0x00000000 +#define SCTP_PCB_ANY_FEATURES 0x00000000 +#define SCTP_ASOC_ANY_STATE 0x00000000 + +typedef void (*asoc_func) (struct sctp_inpcb *, struct sctp_tcb *, void *ptr, + uint32_t val); +typedef int (*inp_func) (struct sctp_inpcb *, void *ptr, uint32_t val); +typedef void (*end_func) (void *ptr, uint32_t val); + +struct sctp_iterator { + TAILQ_ENTRY(sctp_iterator) sctp_nxt_itr; + struct vnet *vn; + struct sctp_timer tmr; + struct sctp_inpcb *inp; /* current endpoint */ + struct sctp_tcb *stcb; /* current* assoc */ + struct sctp_inpcb *next_inp; /* special hook to skip to */ + asoc_func function_assoc; /* per assoc function */ + inp_func function_inp; /* per endpoint function */ + inp_func function_inp_end; /* end INP function */ + end_func function_atend;/* iterator completion function */ + void *pointer; /* pointer for apply func to use */ + uint32_t val; /* value for apply func to use */ + uint32_t pcb_flags; /* endpoint flags being checked */ + uint32_t pcb_features; /* endpoint features being checked */ + uint32_t asoc_state; /* assoc state being checked */ + uint32_t iterator_flags; + uint8_t no_chunk_output; + uint8_t done_current_ep; +}; + +/* iterator_flags values */ +#define SCTP_ITERATOR_DO_ALL_INP 0x00000001 +#define SCTP_ITERATOR_DO_SINGLE_INP 0x00000002 + + +TAILQ_HEAD(sctpiterators, sctp_iterator); + +struct sctp_copy_all { + struct sctp_inpcb *inp; /* ep */ + struct mbuf *m; + struct sctp_sndrcvinfo sndrcv; + int sndlen; + int cnt_sent; + int cnt_failed; +}; + +struct sctp_asconf_iterator { + struct sctpladdr list_of_work; + int cnt; +}; + +struct iterator_control { + struct mtx ipi_iterator_wq_mtx; + struct mtx it_mtx; + SCTP_PROCESS_STRUCT thread_proc; + struct sctpiterators iteratorhead; + struct sctp_iterator *cur_it; + uint32_t iterator_running; + uint32_t iterator_flags; +}; + +#define SCTP_ITERATOR_MUST_EXIT 0x00000001 +#define SCTP_ITERATOR_STOP_CUR_IT 0x00000002 +#define SCTP_ITERATOR_STOP_CUR_INP 0x00000004 + +struct sctp_net_route { + sctp_rtentry_t *ro_rt; + void *ro_lle; + union sctp_sockstore _l_addr; /* remote peer addr */ + struct sctp_ifa *_s_addr; /* our selected src addr */ +}; + +struct htcp { + uint16_t alpha; /* Fixed point arith, << 7 */ + uint8_t beta; /* Fixed point arith, << 7 */ + uint8_t modeswitch; /* Delay modeswitch until we had at least one + * congestion event */ + uint32_t last_cong; /* Time since last congestion event end */ + uint32_t undo_last_cong; + uint16_t bytes_acked; + uint32_t bytecount; + uint32_t minRTT; + uint32_t maxRTT; + + uint32_t undo_maxRTT; + uint32_t undo_old_maxB; + + /* Bandwidth estimation */ + uint32_t minB; + uint32_t maxB; + uint32_t old_maxB; + uint32_t Bi; + uint32_t lasttime; +}; + + +struct sctp_nets { + TAILQ_ENTRY(sctp_nets) sctp_next; /* next link */ + + /* + * Things on the top half may be able to be split into a common + * structure shared by all. + */ + struct sctp_timer pmtu_timer; + + /* + * The following two in combination equate to a route entry for v6 + * or v4. 
+ */ + struct sctp_net_route ro; + + /* mtu discovered so far */ + uint32_t mtu; + uint32_t ssthresh; /* not sure about this one for split */ + + /* smoothed average things for RTT and RTO itself */ + int lastsa; + int lastsv; + int rtt; /* last measured rtt value in ms */ + unsigned int RTO; + + /* This is used for SHUTDOWN/SHUTDOWN-ACK/SEND or INIT timers */ + struct sctp_timer rxt_timer; + struct sctp_timer fr_timer; /* for early fr */ + + /* last time in seconds I sent to it */ + struct timeval last_sent_time; + int ref_count; + + /* Congestion stats per destination */ + /* + * flight size variables and such, sorry Vern, I could not avoid + * this if I wanted performance :> + */ + uint32_t flight_size; + uint32_t cwnd; /* actual cwnd */ + uint32_t prev_cwnd; /* cwnd before any processing */ + uint32_t partial_bytes_acked; /* in CA tracks when to incr a MTU */ + uint32_t prev_rtt; + /* tracking variables to avoid the aloc/free in sack processing */ + unsigned int net_ack; + unsigned int net_ack2; + + /* + * JRS - 5/8/07 - Variable to track last time a destination was + * active for CMT PF + */ + uint32_t last_active; + + /* + * CMT variables (iyengar@cis.udel.edu) + */ + uint32_t this_sack_highest_newack; /* tracks highest TSN newly + * acked for a given dest in + * the current SACK. Used in + * SFR and HTNA algos */ + uint32_t pseudo_cumack; /* CMT CUC algorithm. Maintains next expected + * pseudo-cumack for this destination */ + uint32_t rtx_pseudo_cumack; /* CMT CUC algorithm. Maintains next + * expected pseudo-cumack for this + * destination */ + + /* CMT fast recovery variables */ + uint32_t fast_recovery_tsn; + uint32_t heartbeat_random1; + uint32_t heartbeat_random2; + uint32_t tos_flowlabel; + + struct timeval start_time; /* time when this net was created */ + + uint32_t marked_retrans;/* number or DATA chunks marked for timer + * based retransmissions */ + uint32_t marked_fastretrans; + + /* if this guy is ok or not ... status */ + uint16_t dest_state; + /* number of transmit failures to down this guy */ + uint16_t failure_threshold; + /* error stats on destination */ + uint16_t error_count; + /* UDP port number in case of UDP tunneling */ + uint16_t port; + + uint8_t fast_retran_loss_recovery; + uint8_t will_exit_fast_recovery; + /* Flags that probably can be combined into dest_state */ + uint8_t fast_retran_ip; /* fast retransmit in progress */ + uint8_t hb_responded; + uint8_t saw_newack; /* CMT's SFR algorithm flag */ + uint8_t src_addr_selected; /* if we split we move */ + uint8_t indx_of_eligible_next_to_use; + uint8_t addr_is_local; /* its a local address (if known) could move + * in split */ + + /* + * CMT variables (iyengar@cis.udel.edu) + */ + uint8_t find_pseudo_cumack; /* CMT CUC algorithm. Flag used to + * find a new pseudocumack. This flag + * is set after a new pseudo-cumack + * has been received and indicates + * that the sender should find the + * next pseudo-cumack expected for + * this destination */ + uint8_t find_rtx_pseudo_cumack; /* CMT CUCv2 algorithm. Flag used to + * find a new rtx-pseudocumack. This + * flag is set after a new + * rtx-pseudo-cumack has been received + * and indicates that the sender + * should find the next + * rtx-pseudo-cumack expected for this + * destination */ + uint8_t new_pseudo_cumack; /* CMT CUC algorithm. Flag used to + * indicate if a new pseudo-cumack or + * rtx-pseudo-cumack has been received */ + uint8_t window_probe; /* Doing a window probe? 
*/ + uint8_t RTO_measured; /* Have we done the first measure */ + uint8_t last_hs_used; /* index into the last HS table entry we used */ + /* JRS - struct used in HTCP algorithm */ + struct htcp htcp_ca; +}; + + +struct sctp_data_chunkrec { + uint32_t TSN_seq; /* the TSN of this transmit */ + uint16_t stream_seq; /* the stream sequence number of this transmit */ + uint16_t stream_number; /* the stream number of this guy */ + uint32_t payloadtype; + uint32_t context; /* from send */ + + /* ECN Nonce: Nonce Value for this chunk */ + uint8_t ect_nonce; + uint8_t fwd_tsn_cnt; + /* + * part of the Highest sacked algorithm to be able to stroke counts + * on ones that are FR'd. + */ + uint32_t fast_retran_tsn; /* sending_seq at the time of FR */ + struct timeval timetodrop; /* time we drop it from queue */ + uint8_t doing_fast_retransmit; + uint8_t rcv_flags; /* flags pulled from data chunk on inbound for + * outbound holds sending flags for PR-SCTP. */ + uint8_t state_flags; + uint8_t chunk_was_revoked; +}; + +TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk); + +/* The lower byte is used to enumerate PR_SCTP policies */ +#define CHUNK_FLAGS_PR_SCTP_TTL SCTP_PR_SCTP_TTL +#define CHUNK_FLAGS_PR_SCTP_BUF SCTP_PR_SCTP_BUF +#define CHUNK_FLAGS_PR_SCTP_RTX SCTP_PR_SCTP_RTX + +/* The upper byte is used as a bit mask */ +#define CHUNK_FLAGS_FRAGMENT_OK 0x0100 + +struct chk_id { + uint16_t id; + uint16_t can_take_data; +}; + + +struct sctp_tmit_chunk { + union { + struct sctp_data_chunkrec data; + struct chk_id chunk_id; + } rec; + struct sctp_association *asoc; /* bp to asoc this belongs to */ + struct timeval sent_rcv_time; /* filled in if RTT being calculated */ + struct mbuf *data; /* pointer to mbuf chain of data */ + struct mbuf *last_mbuf; /* pointer to last mbuf in chain */ + struct sctp_nets *whoTo; + TAILQ_ENTRY(sctp_tmit_chunk) sctp_next; /* next link */ + int32_t sent; /* the send status */ + uint16_t snd_count; /* number of times I sent */ + uint16_t flags; /* flags, such as FRAGMENT_OK */ + uint16_t send_size; + uint16_t book_size; + uint16_t mbcnt; + uint16_t auth_keyid; + uint8_t holds_key_ref; /* flag if auth keyid refcount is held */ + uint8_t pad_inplace; + uint8_t do_rtt; + uint8_t book_size_scale; + uint8_t no_fr_allowed; + uint8_t pr_sctp_on; + uint8_t copy_by_ref; + uint8_t window_probe; +}; + +/* + * The first part of this structure MUST be the entire sinfo structure. Maybe + * I should have made it a sub structure... we can circle back later and do + * that if we want. + */ +struct sctp_queued_to_read { /* sinfo structure plus more */ + uint16_t sinfo_stream; /* off the wire */ + uint16_t sinfo_ssn; /* off the wire */ + uint16_t sinfo_flags; /* SCTP_UNORDERED from wire use SCTP_EOF for + * EOR */ + uint32_t sinfo_ppid; /* off the wire */ + uint32_t sinfo_context; /* pick this up from assoc def context?
*/ + uint32_t sinfo_timetolive; /* not used by kernel */ + uint32_t sinfo_tsn; /* Use this in reassembly as first TSN */ + uint32_t sinfo_cumtsn; /* Use this in reassembly as last TSN */ + sctp_assoc_t sinfo_assoc_id; /* our assoc id */ + /* Non sinfo stuff */ + uint32_t length; /* length of data */ + uint32_t held_length; /* length held in sb */ + struct sctp_nets *whoFrom; /* where it came from */ + struct mbuf *data; /* front of the mbuf chain of data with + * PKT_HDR */ + struct mbuf *tail_mbuf; /* used for multi-part data */ + struct mbuf *aux_data; /* used to hold/cache control if o/s does not + * take it from us */ + struct sctp_tcb *stcb; /* assoc, used for window update */ + TAILQ_ENTRY(sctp_queued_to_read) next; + uint16_t port_from; + uint16_t spec_flags; /* Flags to hold the notification field */ + uint8_t do_not_ref_stcb; + uint8_t end_added; + uint8_t pdapi_aborted; + uint8_t some_taken; +}; + +/* This data structure will be on the outbound + * stream queues. Data will be pulled off from + * the front of the mbuf data and chunk-ified + * by the output routines. We will custom + * fit every chunk we pull to the send/sent + * queue to make up the next full packet + * if we can. An entry cannot be removed + * from the stream_out queue until + * the msg_is_complete flag is set. This + * means at times data/tail_mbuf MIGHT + * be NULL.. If that occurs it happens + * for one of two reasons. Either the user + * is blocked on a send() call and has not + * awoken to copy more data down... OR + * the user is in the explict MSG_EOR mode + * and wrote some data, but has not completed + * sending. + */ +struct sctp_stream_queue_pending { + struct mbuf *data; + struct mbuf *tail_mbuf; + struct timeval ts; + struct sctp_nets *net; + TAILQ_ENTRY(sctp_stream_queue_pending) next; + uint32_t length; + uint32_t timetolive; + uint32_t ppid; + uint32_t context; + uint16_t sinfo_flags; + uint16_t stream; + uint16_t strseq; + uint16_t act_flags; + uint16_t auth_keyid; + uint8_t holds_key_ref; + uint8_t msg_is_complete; + uint8_t some_taken; + uint8_t pr_sctp_on; + uint8_t sender_all_done; + uint8_t put_last_out; + uint8_t discard_rest; +}; + +/* + * this struct contains info that is used to track inbound stream data and + * help with ordering. + */ +TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in); +struct sctp_stream_in { + struct sctp_readhead inqueue; + uint16_t stream_no; + uint16_t last_sequence_delivered; /* used for re-order */ + uint8_t delivery_started; +}; + +/* This struct is used to track the traffic on outbound streams */ +TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out); +struct sctp_stream_out { + struct sctp_streamhead outqueue; + TAILQ_ENTRY(sctp_stream_out) next_spoke; /* next link in wheel */ + uint16_t stream_no; + uint16_t next_sequence_sent; /* next one I expect to send out */ + uint8_t last_msg_incomplete; +}; + +/* used to keep track of the addresses yet to try to add/delete */ +TAILQ_HEAD(sctp_asconf_addrhead, sctp_asconf_addr); +struct sctp_asconf_addr { + TAILQ_ENTRY(sctp_asconf_addr) next; + struct sctp_asconf_addr_param ap; + struct sctp_ifa *ifa; /* save the ifa for add/del ip */ + uint8_t sent; /* has this been sent yet? 
*/ + uint8_t special_del; /* not to be used in lookup */ +}; + +struct sctp_scoping { + uint8_t ipv4_addr_legal; + uint8_t ipv6_addr_legal; + uint8_t loopback_scope; + uint8_t ipv4_local_scope; + uint8_t local_scope; + uint8_t site_scope; +}; + +#define SCTP_TSN_LOG_SIZE 40 + +struct sctp_tsn_log { + void *stcb; + uint32_t tsn; + uint16_t strm; + uint16_t seq; + uint16_t sz; + uint16_t flgs; + uint16_t in_pos; + uint16_t in_out; +}; + +#define SCTP_FS_SPEC_LOG_SIZE 200 +struct sctp_fs_spec_log { + uint32_t sent; + uint32_t total_flight; + uint32_t tsn; + uint16_t book; + uint8_t incr; + uint8_t decr; +}; + +/* This struct is here to cut out the compatiabilty + * pad that bulks up both the inp and stcb. The non + * pad portion MUST stay in complete sync with + * sctp_sndrcvinfo... i.e. if sinfo_xxxx is added + * this must be done here too. + */ +struct sctp_nonpad_sndrcvinfo { + uint16_t sinfo_stream; + uint16_t sinfo_ssn; + uint16_t sinfo_flags; + uint32_t sinfo_ppid; + uint32_t sinfo_context; + uint32_t sinfo_timetolive; + uint32_t sinfo_tsn; + uint32_t sinfo_cumtsn; + sctp_assoc_t sinfo_assoc_id; +}; + +/* + * JRS - Structure to hold function pointers to the functions responsible + * for congestion control. + */ + +struct sctp_cc_functions { + void (*sctp_set_initial_cc_param) (struct sctp_tcb *stcb, struct sctp_nets *net); + void (*sctp_cwnd_update_after_sack) (struct sctp_tcb *stcb, + struct sctp_association *asoc, + int accum_moved, int reneged_all, int will_exit); + void (*sctp_cwnd_update_after_fr) (struct sctp_tcb *stcb, + struct sctp_association *asoc); + void (*sctp_cwnd_update_after_timeout) (struct sctp_tcb *stcb, + struct sctp_nets *net); + void (*sctp_cwnd_update_after_ecn_echo) (struct sctp_tcb *stcb, + struct sctp_nets *net); + void (*sctp_cwnd_update_after_packet_dropped) (struct sctp_tcb *stcb, + struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, + uint32_t * bottle_bw, uint32_t * on_queue); + void (*sctp_cwnd_update_after_output) (struct sctp_tcb *stcb, + struct sctp_nets *net, int burst_limit); + void (*sctp_cwnd_update_after_fr_timer) (struct sctp_inpcb *inp, + struct sctp_tcb *stcb, struct sctp_nets *net); +}; + +/* used to save ASCONF chunks for retransmission */ +TAILQ_HEAD(sctp_asconf_head, sctp_asconf); +struct sctp_asconf { + TAILQ_ENTRY(sctp_asconf) next; + uint32_t serial_number; + uint16_t snd_count; + struct mbuf *data; + uint16_t len; +}; + +/* used to save ASCONF-ACK chunks for retransmission */ +TAILQ_HEAD(sctp_asconf_ackhead, sctp_asconf_ack); +struct sctp_asconf_ack { + TAILQ_ENTRY(sctp_asconf_ack) next; + uint32_t serial_number; + struct sctp_nets *last_sent_to; + struct mbuf *data; + uint16_t len; +}; + +/* + * Here we have information about each individual association that we track. + * We probably in production would be more dynamic. But for ease of + * implementation we will have a fixed array that we hunt for in a linear + * fashion. 
+ */ +struct sctp_association { + /* association state */ + int state; + + /* queue of pending addrs to add/delete */ + struct sctp_asconf_addrhead asconf_queue; + + struct timeval time_entered; /* time we entered state */ + struct timeval time_last_rcvd; + struct timeval time_last_sent; + struct timeval time_last_sat_advance; + struct sctp_nonpad_sndrcvinfo def_send; + + /* timers and such */ + struct sctp_timer hb_timer; /* hb timer */ + struct sctp_timer dack_timer; /* Delayed ack timer */ + struct sctp_timer asconf_timer; /* asconf */ + struct sctp_timer strreset_timer; /* stream reset */ + struct sctp_timer shut_guard_timer; /* shutdown guard */ + struct sctp_timer autoclose_timer; /* automatic close timer */ + struct sctp_timer delayed_event_timer; /* timer for delayed events */ + struct sctp_timer delete_prim_timer; /* deleting primary dst */ + + /* list of restricted local addresses */ + struct sctpladdr sctp_restricted_addrs; + + /* last local address pending deletion (waiting for an address add) */ + struct sctp_ifa *asconf_addr_del_pending; + /* Deleted primary destination (used to stop timer) */ + struct sctp_nets *deleted_primary; + + struct sctpnetlisthead nets; /* remote address list */ + + /* Free chunk list */ + struct sctpchunk_listhead free_chunks; + + /* Control chunk queue */ + struct sctpchunk_listhead control_send_queue; + + /* ASCONF chunk queue */ + struct sctpchunk_listhead asconf_send_queue; + + /* + * Once a TSN hits the wire it is moved to the sent_queue. We + * maintain two counts here (don't know if any but retran_cnt is + * needed). The idea is that the sent_queue_retran_cnt reflects how + * many chunks have been marked for retranmission by either T3-rxt + * or FR. + */ + struct sctpchunk_listhead sent_queue; + struct sctpchunk_listhead send_queue; + + /* re-assembly queue for fragmented chunks on the inbound path */ + struct sctpchunk_listhead reasmqueue; + + /* + * this queue is used when we reach a condition that we can NOT put + * data into the socket buffer. We track the size of this queue and + * set our rwnd to the space in the socket minus also the + * size_on_delivery_queue. + */ + struct sctpwheel_listhead out_wheel; + + /* + * This pointer will be set to NULL most of the time. But when we + * have a fragmented message, where we could not get out all of the + * message at the last send then this will point to the stream to go + * get data from. + */ + struct sctp_stream_out *locked_on_sending; + + /* If an iterator is looking at me, this is it */ + struct sctp_iterator *stcb_starting_point_for_iterator; + + /* ASCONF save the last ASCONF-ACK so we can resend it if necessary */ + struct sctp_asconf_ackhead asconf_ack_sent; + + /* + * pointer to last stream reset queued to control queue by us with + * requests. + */ + struct sctp_tmit_chunk *str_reset; + /* + * if Source Address Selection happening, this will rotate through + * the link list. 
+ */ + struct sctp_laddr *last_used_address; + + /* stream arrays */ + struct sctp_stream_in *strmin; + struct sctp_stream_out *strmout; + uint8_t *mapping_array; + /* primary destination to use */ + struct sctp_nets *primary_destination; + /* For CMT */ + struct sctp_nets *last_net_cmt_send_started; + /* last place I got a data chunk from */ + struct sctp_nets *last_data_chunk_from; + /* last place I got a control from */ + struct sctp_nets *last_control_chunk_from; + + /* circular looking for output selection */ + struct sctp_stream_out *last_out_stream; + + /* + * wait to the point the cum-ack passes req->send_reset_at_tsn for + * any req on the list. + */ + struct sctp_resethead resetHead; + + /* queue of chunks waiting to be sent into the local stack */ + struct sctp_readhead pending_reply_queue; + + /* JRS - the congestion control functions are in this struct */ + struct sctp_cc_functions cc_functions; + /* + * JRS - value to store the currently loaded congestion control + * module + */ + uint32_t congestion_control_module; + + uint32_t vrf_id; + + uint32_t cookie_preserve_req; + /* ASCONF next seq I am sending out, inits at init-tsn */ + uint32_t asconf_seq_out; + uint32_t asconf_seq_out_acked; + /* ASCONF last received ASCONF from peer, starts at peer's TSN-1 */ + uint32_t asconf_seq_in; + + /* next seq I am sending in str reset messages */ + uint32_t str_reset_seq_out; + /* next seq I am expecting in str reset messages */ + uint32_t str_reset_seq_in; + + /* various verification tag information */ + uint32_t my_vtag; /* The tag to be used. if assoc is re-initited + * by remote end, and I have unlocked this + * will be regenerated to a new random value. */ + uint32_t peer_vtag; /* The peers last tag */ + + uint32_t my_vtag_nonce; + uint32_t peer_vtag_nonce; + + uint32_t assoc_id; + + /* This is the SCTP fragmentation threshold */ + uint32_t smallest_mtu; + + /* + * Special hook for Fast retransmit, allows us to track the highest + * TSN that is NEW in this SACK if gap ack blocks are present. + */ + uint32_t this_sack_highest_gap; + + /* + * The highest consecutive TSN that has been acked by peer on my + * sends + */ + uint32_t last_acked_seq; + + /* The next TSN that I will use in sending. */ + uint32_t sending_seq; + + /* Original seq number I used ??questionable to keep?? */ + uint32_t init_seq_number; + + + /* The Advanced Peer Ack Point, as required by the PR-SCTP */ + /* (A1 in Section 4.2) */ + uint32_t advanced_peer_ack_point; + + /* + * The highest consequetive TSN at the bottom of the mapping array + * (for his sends). + */ + uint32_t cumulative_tsn; + /* + * Used to track the mapping array and its offset bits. This MAY be + * lower then cumulative_tsn. + */ + uint32_t mapping_array_base_tsn; + /* + * used to track highest TSN we have received and is listed in the + * mapping array. + */ + uint32_t highest_tsn_inside_map; + + /* EY - new NR variables used for nr_sack based on mapping_array */ + uint8_t *nr_mapping_array; + uint32_t highest_tsn_inside_nr_map; + + uint32_t last_echo_tsn; + uint32_t last_cwr_tsn; + uint32_t fast_recovery_tsn; + uint32_t sat_t3_recovery_tsn; + uint32_t tsn_last_delivered; + /* + * For the pd-api we should re-write this a bit more efficent. We + * could have multiple sctp_queued_to_read's that we are building at + * once. Now we only do this when we get ready to deliver to the + * socket buffer. Note that we depend on the fact that the struct is + * "stuck" on the read queue until we finish all the pd-api. 
+ */ + struct sctp_queued_to_read *control_pdapi; + + uint32_t tsn_of_pdapi_last_delivered; + uint32_t pdapi_ppid; + uint32_t context; + uint32_t last_reset_action[SCTP_MAX_RESET_PARAMS]; + uint32_t last_sending_seq[SCTP_MAX_RESET_PARAMS]; + uint32_t last_base_tsnsent[SCTP_MAX_RESET_PARAMS]; +#ifdef SCTP_ASOCLOG_OF_TSNS + /* + * special log - This adds considerable size to the asoc, but + * provides a log that you can use to detect problems via kgdb. + */ + struct sctp_tsn_log in_tsnlog[SCTP_TSN_LOG_SIZE]; + struct sctp_tsn_log out_tsnlog[SCTP_TSN_LOG_SIZE]; + uint32_t cumack_log[SCTP_TSN_LOG_SIZE]; + uint32_t cumack_logsnt[SCTP_TSN_LOG_SIZE]; + uint16_t tsn_in_at; + uint16_t tsn_out_at; + uint16_t tsn_in_wrapped; + uint16_t tsn_out_wrapped; + uint16_t cumack_log_at; + uint16_t cumack_log_atsnt; +#endif /* SCTP_ASOCLOG_OF_TSNS */ +#ifdef SCTP_FS_SPEC_LOG + struct sctp_fs_spec_log fslog[SCTP_FS_SPEC_LOG_SIZE]; + uint16_t fs_index; +#endif + + /* + * window state information and smallest MTU that I use to bound + * segmentation + */ + uint32_t peers_rwnd; + uint32_t my_rwnd; + uint32_t my_last_reported_rwnd; + uint32_t sctp_frag_point; + + uint32_t total_output_queue_size; + + uint32_t sb_cc; /* shadow of sb_cc */ + uint32_t sb_send_resv; /* amount reserved on a send */ + uint32_t my_rwnd_control_len; /* shadow of sb_mbcnt used for rwnd + * control */ + /* 32 bit nonce stuff */ + uint32_t nonce_resync_tsn; + uint32_t nonce_wait_tsn; + uint32_t default_flowlabel; + uint32_t pr_sctp_cnt; + int ctrl_queue_cnt; /* could be removed REM */ + /* + * All outbound datagrams queue into this list from the individual + * stream queue. Here they get assigned a TSN and then await + * sending. The stream seq comes when it is first put in the + * individual str queue + */ + unsigned int stream_queue_cnt; + unsigned int send_queue_cnt; + unsigned int sent_queue_cnt; + unsigned int sent_queue_cnt_removeable; + /* + * Number on sent queue that are marked for retran until this value + * is 0 we only send one packet of retran'ed data. 
+ */ + unsigned int sent_queue_retran_cnt; + + unsigned int size_on_reasm_queue; + unsigned int cnt_on_reasm_queue; + unsigned int fwd_tsn_cnt; + /* amount of data (bytes) currently in flight (on all destinations) */ + unsigned int total_flight; + /* Total book size in flight */ + unsigned int total_flight_count; /* count of chunks used with + * book total */ + /* count of destinaton nets and list of destination nets */ + unsigned int numnets; + + /* Total error count on this association */ + unsigned int overall_error_count; + + unsigned int cnt_msg_on_sb; + + /* All stream count of chunks for delivery */ + unsigned int size_on_all_streams; + unsigned int cnt_on_all_streams; + + /* Heart Beat delay in ticks */ + unsigned int heart_beat_delay; + + /* autoclose */ + unsigned int sctp_autoclose_ticks; + + /* how many preopen streams we have */ + unsigned int pre_open_streams; + + /* How many streams I support coming into me */ + unsigned int max_inbound_streams; + + /* the cookie life I award for any cookie, in seconds */ + unsigned int cookie_life; + /* time to delay acks for */ + unsigned int delayed_ack; + unsigned int old_delayed_ack; + unsigned int sack_freq; + unsigned int data_pkts_seen; + + unsigned int numduptsns; + int dup_tsns[SCTP_MAX_DUP_TSNS]; + unsigned int initial_init_rto_max; /* initial RTO for INIT's */ + unsigned int initial_rto; /* initial send RTO */ + unsigned int minrto; /* per assoc RTO-MIN */ + unsigned int maxrto; /* per assoc RTO-MAX */ + + /* authentication fields */ + sctp_auth_chklist_t *local_auth_chunks; + sctp_auth_chklist_t *peer_auth_chunks; + sctp_hmaclist_t *local_hmacs; /* local HMACs supported */ + sctp_hmaclist_t *peer_hmacs; /* peer HMACs supported */ + struct sctp_keyhead shared_keys; /* assoc's shared keys */ + sctp_authinfo_t authinfo; /* randoms, cached keys */ + /* + * refcnt to block freeing when a sender or receiver is off coping + * user data in. + */ + uint32_t refcnt; + uint32_t chunks_on_out_queue; /* total chunks floating around, + * locked by send socket buffer */ + uint32_t peers_adaptation; + uint16_t peer_hmac_id; /* peer HMAC id to send */ + + /* + * Being that we have no bag to collect stale cookies, and that we + * really would not want to anyway.. we will count them in this + * counter. We of course feed them to the pigeons right away (I have + * always thought of pigeons as flying rats). + */ + uint16_t stale_cookie_count; + + /* + * For the partial delivery API, if up, invoked this is what last + * TSN I delivered + */ + uint16_t str_of_pdapi; + uint16_t ssn_of_pdapi; + + /* counts of actual built streams. Allocation may be more however */ + /* could re-arrange to optimize space here. */ + uint16_t streamincnt; + uint16_t streamoutcnt; + uint16_t strm_realoutsize; + /* my maximum number of retrans of INIT and SEND */ + /* copied from SCTP but should be individually setable */ + uint16_t max_init_times; + uint16_t max_send_times; + + uint16_t def_net_failure; + + /* + * lock flag: 0 is ok to send, 1+ (duals as a retran count) is + * awaiting ACK + */ + uint16_t mapping_array_size; + + uint16_t last_strm_seq_delivered; + uint16_t last_strm_no_delivered; + + uint16_t last_revoke_count; + int16_t num_send_timers_up; + + uint16_t stream_locked_on; + uint16_t ecn_echo_cnt_onq; + + uint16_t free_chunk_cnt; + + uint8_t stream_locked; + uint8_t authenticated; /* packet authenticated ok */ + /* + * This flag indicates that a SACK need to be sent. Initially this + * is 1 to send the first sACK immediately. 
+ */ + uint8_t send_sack; + + /* max burst after fast retransmit completes */ + uint8_t max_burst; + + uint8_t sat_network; /* RTT is in range of sat net or greater */ + uint8_t sat_network_lockout; /* lockout code */ + uint8_t burst_limit_applied; /* Burst limit in effect at last send? */ + /* flag goes on when we are doing a partial delivery api */ + uint8_t hb_random_values[4]; + uint8_t fragmented_delivery_inprogress; + uint8_t fragment_flags; + uint8_t last_flags_delivered; + uint8_t hb_ect_randombit; + uint8_t hb_random_idx; + uint8_t hb_is_disabled; /* is the hb disabled? */ + uint8_t default_tos; + uint8_t asconf_del_pending; /* asconf delete last addr pending */ + + /* ECN Nonce stuff */ + uint8_t receiver_nonce_sum; /* nonce I sum and put in my sack */ + uint8_t ecn_nonce_allowed; /* Tells us if ECN nonce is on */ + uint8_t nonce_sum_check;/* On off switch used during re-sync */ + uint8_t nonce_wait_for_ecne; /* flag when we expect a ECN */ + uint8_t peer_supports_ecn_nonce; + + /* + * This value, plus all other ack'd but above cum-ack is added + * together to cross check against the bit that we have yet to + * define (probably in the SACK). When the cum-ack is updated, this + * sum is updated as well. + */ + uint8_t nonce_sum_expect_base; + /* Flag to tell if ECN is allowed */ + uint8_t ecn_allowed; + + /* flag to indicate if peer can do asconf */ + uint8_t peer_supports_asconf; + /* EY - flag to indicate if peer can do nr_sack */ + uint8_t peer_supports_nr_sack; + /* pr-sctp support flag */ + uint8_t peer_supports_prsctp; + /* peer authentication support flag */ + uint8_t peer_supports_auth; + /* stream resets are supported by the peer */ + uint8_t peer_supports_strreset; + + uint8_t peer_supports_nat; + /* + * packet drop's are supported by the peer, we don't really care + * about this but we bookkeep it anyway. + */ + uint8_t peer_supports_pktdrop; + + /* Do we allow V6/V4? */ + uint8_t ipv4_addr_legal; + uint8_t ipv6_addr_legal; + /* Address scoping flags */ + /* scope value for IPv4 */ + uint8_t ipv4_local_scope; + /* scope values for IPv6 */ + uint8_t local_scope; + uint8_t site_scope; + /* loopback scope */ + uint8_t loopback_scope; + /* flags to handle send alternate net tracking */ + uint8_t used_alt_onsack; + uint8_t used_alt_asconfack; + uint8_t fast_retran_loss_recovery; + uint8_t sat_t3_loss_recovery; + uint8_t dropped_special_cnt; + uint8_t seen_a_sack_this_pkt; + uint8_t stream_reset_outstanding; + uint8_t stream_reset_out_is_outstanding; + uint8_t delayed_connection; + uint8_t ifp_had_enobuf; + uint8_t saw_sack_with_frags; + uint8_t saw_sack_with_nr_frags; + uint8_t in_asocid_hash; + uint8_t assoc_up_sent; + uint8_t adaptation_needed; + uint8_t adaptation_sent; + /* CMT variables */ + uint8_t cmt_dac_pkts_rcvd; + uint8_t sctp_cmt_on_off; + uint8_t iam_blocking; + uint8_t cookie_how[8]; + /* EY 05/05/08 - NR_SACK variable */ + uint8_t sctp_nr_sack_on_off; + /* JRS 5/21/07 - CMT PF variable */ + uint8_t sctp_cmt_pf; + /* + * The mapping array is used to track out of order sequences above + * last_acked_seq. 0 indicates packet missing 1 indicates packet + * rec'd. We slide it up every time we raise last_acked_seq and 0 + * trailing locactions out. If I get a TSN above the array + * mappingArraySz, I discard the datagram and let retransmit happen. 
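 *
 * A small worked illustration (numbers invented for this example): with
 * mapping_array_base_tsn = 101, a received TSN of 102 sets the bit at
 * offset 1 while offset 0 stays 0 for the still-missing TSN 101; once
 * 101 arrives the cum-ack can advance past it, the array is slid up by
 * the distance advanced, and the vacated trailing positions are zeroed.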
+ */ + uint32_t marked_retrans; + uint32_t timoinit; + uint32_t timodata; + uint32_t timosack; + uint32_t timoshutdown; + uint32_t timoheartbeat; + uint32_t timocookie; + uint32_t timoshutdownack; + struct timeval start_time; + struct timeval discontinuity_time; +}; + +#endif diff --git a/freebsd/sys/netinet/sctp_sysctl.c b/freebsd/sys/netinet/sctp_sysctl.c new file mode 100644 index 00000000..b5700e4e --- /dev/null +++ b/freebsd/sys/netinet/sctp_sysctl.c @@ -0,0 +1,1108 @@ +#include + +/*- + * Copyright (c) 2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * sysctl tunable variables + */ + +void +sctp_init_sysctls() +{ + SCTP_BASE_SYSCTL(sctp_sendspace) = SCTPCTL_MAXDGRAM_DEFAULT; + SCTP_BASE_SYSCTL(sctp_recvspace) = SCTPCTL_RECVSPACE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_auto_asconf) = SCTPCTL_AUTOASCONF_DEFAULT; + SCTP_BASE_SYSCTL(sctp_multiple_asconfs) = SCTPCTL_MULTIPLEASCONFS_DEFAULT; + SCTP_BASE_SYSCTL(sctp_ecn_enable) = SCTPCTL_ECN_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_ecn_nonce) = SCTPCTL_ECN_NONCE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_strict_sacks) = SCTPCTL_STRICT_SACKS_DEFAULT; +#if !defined(SCTP_WITH_NO_CSUM) + SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback) = SCTPCTL_LOOPBACK_NOCSUM_DEFAULT; +#endif + SCTP_BASE_SYSCTL(sctp_strict_init) = SCTPCTL_STRICT_INIT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh) = SCTPCTL_PEER_CHKOH_DEFAULT; + SCTP_BASE_SYSCTL(sctp_max_burst_default) = SCTPCTL_MAXBURST_DEFAULT; + SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = SCTPCTL_MAXCHUNKS_DEFAULT; + SCTP_BASE_SYSCTL(sctp_hashtblsize) = SCTPCTL_TCBHASHSIZE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_pcbtblsize) = SCTPCTL_PCBHASHSIZE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_min_split_point) = SCTPCTL_MIN_SPLIT_POINT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_chunkscale) = SCTPCTL_CHUNKSCALE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default) = SCTPCTL_DELAYED_SACK_TIME_DEFAULT; + SCTP_BASE_SYSCTL(sctp_sack_freq_default) = SCTPCTL_SACK_FREQ_DEFAULT; + SCTP_BASE_SYSCTL(sctp_system_free_resc_limit) = SCTPCTL_SYS_RESOURCE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit) = SCTPCTL_ASOC_RESOURCE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default) = SCTPCTL_HEARTBEAT_INTERVAL_DEFAULT; + SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default) = SCTPCTL_PMTU_RAISE_TIME_DEFAULT; + SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default) = SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT; + SCTP_BASE_SYSCTL(sctp_secret_lifetime_default) = SCTPCTL_SECRET_LIFETIME_DEFAULT; + SCTP_BASE_SYSCTL(sctp_rto_max_default) = SCTPCTL_RTO_MAX_DEFAULT; + SCTP_BASE_SYSCTL(sctp_rto_min_default) = SCTPCTL_RTO_MIN_DEFAULT; + SCTP_BASE_SYSCTL(sctp_rto_initial_default) = SCTPCTL_RTO_INITIAL_DEFAULT; + SCTP_BASE_SYSCTL(sctp_init_rto_max_default) = SCTPCTL_INIT_RTO_MAX_DEFAULT; + SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default) = SCTPCTL_VALID_COOKIE_LIFE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_init_rtx_max_default) = SCTPCTL_INIT_RTX_MAX_DEFAULT; + SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default) = SCTPCTL_ASSOC_RTX_MAX_DEFAULT; + SCTP_BASE_SYSCTL(sctp_path_rtx_max_default) = SCTPCTL_PATH_RTX_MAX_DEFAULT; + SCTP_BASE_SYSCTL(sctp_add_more_threshold) = SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default) = SCTPCTL_OUTGOING_STREAMS_DEFAULT; + SCTP_BASE_SYSCTL(sctp_cmt_on_off) = SCTPCTL_CMT_ON_OFF_DEFAULT; + /* EY */ + SCTP_BASE_SYSCTL(sctp_nr_sack_on_off) = SCTPCTL_NR_SACK_ON_OFF_DEFAULT; + SCTP_BASE_SYSCTL(sctp_cmt_use_dac) = SCTPCTL_CMT_USE_DAC_DEFAULT; + SCTP_BASE_SYSCTL(sctp_cmt_pf) = SCTPCTL_CMT_PF_DEFAULT; + SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) = SCTPCTL_CWND_MAXBURST_DEFAULT; + SCTP_BASE_SYSCTL(sctp_early_fr) = SCTPCTL_EARLY_FAST_RETRAN_DEFAULT; + SCTP_BASE_SYSCTL(sctp_early_fr_msec) = SCTPCTL_EARLY_FAST_RETRAN_MSEC_DEFAULT; + SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk) = SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT; + SCTP_BASE_SYSCTL(sctp_auth_disable) = SCTPCTL_AUTH_DISABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_nat_friendly) = SCTPCTL_NAT_FRIENDLY_DEFAULT; + 
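	/*
	 * Note: values written later through the sysctl handlers are
	 * clamped to the matching SCTPCTL_*_MIN/MAX bounds by RANGECHK()
	 * in sysctl_sctp_check() below; the defaults assigned in this
	 * function are taken to lie within those bounds already.
	 */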
SCTP_BASE_SYSCTL(sctp_L2_abc_variable) = SCTPCTL_ABC_L_VAR_DEFAULT; + SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) = SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT; + SCTP_BASE_SYSCTL(sctp_do_drain) = SCTPCTL_DO_SCTP_DRAIN_DEFAULT; + SCTP_BASE_SYSCTL(sctp_hb_maxburst) = SCTPCTL_HB_MAX_BURST_DEFAULT; + SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit) = SCTPCTL_ABORT_AT_LIMIT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_strict_data_order) = SCTPCTL_STRICT_DATA_ORDER_DEFAULT; + SCTP_BASE_SYSCTL(sctp_min_residual) = SCTPCTL_MIN_RESIDUAL_DEFAULT; + SCTP_BASE_SYSCTL(sctp_max_retran_chunk) = SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT; + SCTP_BASE_SYSCTL(sctp_logging_level) = SCTPCTL_LOGGING_LEVEL_DEFAULT; + /* JRS - Variable for default congestion control module */ + SCTP_BASE_SYSCTL(sctp_default_cc_module) = SCTPCTL_DEFAULT_CC_MODULE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_default_frag_interleave) = SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_mobility_base) = SCTPCTL_MOBILITY_BASE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) = SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT; + SCTP_BASE_SYSCTL(sctp_vtag_time_wait) = SCTPCTL_TIME_WAIT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_buffer_splitting) = SCTPCTL_BUFFER_SPLITTING_DEFAULT; + SCTP_BASE_SYSCTL(sctp_initial_cwnd) = SCTPCTL_INITIAL_CWND_DEFAULT; +#if defined(SCTP_LOCAL_TRACE_BUF) + memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); +#endif + SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable) = SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = SCTPCTL_UDP_TUNNELING_PORT_DEFAULT; + SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) = SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT; + SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly) = SCTPCTL_NAT_FRIENDLY_INITS_DEFAULT; +#if defined(SCTP_DEBUG) + SCTP_BASE_SYSCTL(sctp_debug_on) = SCTPCTL_DEBUG_DEFAULT; +#endif +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_BASE_SYSCTL(sctp_output_unlocked) = SCTPCTL_OUTPUT_UNLOCKED_DEFAULT; +#endif +} + + +/* It returns an upper limit. 
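(Every AF_INET or AF_INET6 address visible on the endpoint's VRF, or in its bound-address list, is counted, including addresses that the scope checks in copy_out_local_addresses() would later skip.)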
No filtering is done here */ +static unsigned int +number_of_addresses(struct sctp_inpcb *inp) +{ + int cnt; + struct sctp_vrf *vrf; + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + struct sctp_laddr *laddr; + + cnt = 0; + /* neither Mac OS X nor FreeBSD support mulitple routing functions */ + if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) { + return (0); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if ((sctp_ifa->address.sa.sa_family == AF_INET) || + (sctp_ifa->address.sa.sa_family == AF_INET6)) { + cnt++; + } + } + } + } else { + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if ((laddr->ifa->address.sa.sa_family == AF_INET) || + (laddr->ifa->address.sa.sa_family == AF_INET6)) { + cnt++; + } + } + } + return (cnt); +} + +static int +copy_out_local_addresses(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sysctl_req *req) +{ + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + int loopback_scope, ipv4_local_scope, local_scope, site_scope; + int ipv4_addr_legal, ipv6_addr_legal; + struct sctp_vrf *vrf; + struct xsctp_laddr xladdr; + struct sctp_laddr *laddr; + int error; + + /* Turn on all the appropriate scope */ + if (stcb) { + /* use association specific values */ + loopback_scope = stcb->asoc.loopback_scope; + ipv4_local_scope = stcb->asoc.ipv4_local_scope; + local_scope = stcb->asoc.local_scope; + site_scope = stcb->asoc.site_scope; + } else { + /* use generic values for endpoints */ + loopback_scope = 1; + ipv4_local_scope = 1; + local_scope = 1; + site_scope = 1; + } + + /* use only address families of interest */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ipv6_addr_legal = 1; + if (SCTP_IPV6_V6ONLY(inp)) { + ipv4_addr_legal = 0; + } else { + ipv4_addr_legal = 1; + } + } else { + ipv4_addr_legal = 1; + ipv6_addr_legal = 0; + } + + /* neither Mac OS X nor FreeBSD support mulitple routing functions */ + if ((vrf = sctp_find_vrf(inp->def_vrf_id)) == NULL) { + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + return (-1); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) + /* Skip loopback if loopback_scope not set */ + continue; + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if (stcb) { + /* + * ignore if blacklisted at + * association level + */ + if (sctp_is_addr_restricted(stcb, sctp_ifa)) + continue; + } + switch (sctp_ifa->address.sa.sa_family) { + case AF_INET: + if (ipv4_addr_legal) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&sctp_ifa->address.sa; + if (sin->sin_addr.s_addr == 0) + continue; + if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) + continue; + } else { + continue; + } + break; +#ifdef INET6 + case AF_INET6: + if (ipv6_addr_legal) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa; + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + continue; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { + if (local_scope == 0) + continue; + if (sin6->sin6_scope_id == 0) { + /* + * bad link + * local + * address + */ + if (sa6_recoverscope(sin6) != 0) + continue; + } + } + if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) + continue; + } else { + continue; + } + break; +#endif + default: + continue; + } + memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); + memcpy((void 
*)&xladdr.address, (const void *)&sctp_ifa->address, sizeof(union sctp_sockstore)); + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); + if (error) { + return (error); + } else { + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + } + } + } + } else { + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + /* ignore if blacklisted at association level */ + if (stcb && sctp_is_addr_restricted(stcb, laddr->ifa)) + continue; + memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); + memcpy((void *)&xladdr.address, (const void *)&laddr->ifa->address, sizeof(union sctp_sockstore)); + xladdr.start_time.tv_sec = (uint32_t) laddr->start_time.tv_sec; + xladdr.start_time.tv_usec = (uint32_t) laddr->start_time.tv_usec; + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); + if (error) { + return (error); + } else { + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + } + } + } + memset((void *)&xladdr, 0, sizeof(struct xsctp_laddr)); + xladdr.last = 1; + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xladdr, sizeof(struct xsctp_laddr)); + + if (error) { + return (error); + } else { + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + return (0); + } +} + +/* + * sysctl functions + */ +static int +sctp_assoclist(SYSCTL_HANDLER_ARGS) +{ + unsigned int number_of_endpoints; + unsigned int number_of_local_addresses; + unsigned int number_of_associations; + unsigned int number_of_remote_addresses; + unsigned int n; + int error; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + struct xsctp_inpcb xinpcb; + struct xsctp_tcb xstcb; + struct xsctp_raddr xraddr; + struct socket *so; + + number_of_endpoints = 0; + number_of_local_addresses = 0; + number_of_associations = 0; + number_of_remote_addresses = 0; + + SCTP_INP_INFO_RLOCK(); + if (req->oldptr == USER_ADDR_NULL) { + LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { + SCTP_INP_RLOCK(inp); + number_of_endpoints++; + number_of_local_addresses += number_of_addresses(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + number_of_associations++; + number_of_local_addresses += number_of_addresses(inp); + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + number_of_remote_addresses++; + } + } + SCTP_INP_RUNLOCK(inp); + } + SCTP_INP_INFO_RUNLOCK(); + n = (number_of_endpoints + 1) * sizeof(struct xsctp_inpcb) + + (number_of_local_addresses + number_of_endpoints + number_of_associations) * sizeof(struct xsctp_laddr) + + (number_of_associations + number_of_endpoints) * sizeof(struct xsctp_tcb) + + (number_of_remote_addresses + number_of_associations) * sizeof(struct xsctp_raddr); + + /* request some more memory than needed */ + req->oldidx = (n + n / 8); + return 0; + } + if (req->newptr != USER_ADDR_NULL) { + SCTP_INP_INFO_RUNLOCK(); + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_SYSCTL, EPERM); + return EPERM; + } + LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { + /* if its allgone it is being freed - skip it */ + goto skip; + } + xinpcb.last = 0; + xinpcb.local_port = ntohs(inp->sctp_lport); + xinpcb.flags = inp->sctp_flags; + xinpcb.features = inp->sctp_features; + xinpcb.total_sends = inp->total_sends; + xinpcb.total_recvs = inp->total_recvs; + xinpcb.total_nospaces = inp->total_nospaces; + xinpcb.fragmentation_point = inp->sctp_frag_point; + so = 
inp->sctp_socket; + if ((so == NULL) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { + xinpcb.qlen = 0; + xinpcb.maxqlen = 0; + } else { + xinpcb.qlen = so->so_qlen; + xinpcb.maxqlen = so->so_qlimit; + } + SCTP_INP_INCR_REF(inp); + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb)); + if (error) { + SCTP_INP_DECR_REF(inp); + return error; + } + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + error = copy_out_local_addresses(inp, NULL, req); + if (error) { + SCTP_INP_DECR_REF(inp); + return error; + } + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + xstcb.last = 0; + xstcb.local_port = ntohs(inp->sctp_lport); + xstcb.remote_port = ntohs(stcb->rport); + if (stcb->asoc.primary_destination != NULL) + xstcb.primary_addr = stcb->asoc.primary_destination->ro._l_addr; + xstcb.heartbeat_interval = stcb->asoc.heart_beat_delay; + xstcb.state = SCTP_GET_STATE(&stcb->asoc); /* FIXME */ + /* 7.0 does not support these */ + xstcb.assoc_id = sctp_get_associd(stcb); + xstcb.peers_rwnd = stcb->asoc.peers_rwnd; + xstcb.in_streams = stcb->asoc.streamincnt; + xstcb.out_streams = stcb->asoc.streamoutcnt; + xstcb.max_nr_retrans = stcb->asoc.overall_error_count; + xstcb.primary_process = 0; /* not really supported + * yet */ + xstcb.T1_expireries = stcb->asoc.timoinit + stcb->asoc.timocookie; + xstcb.T2_expireries = stcb->asoc.timoshutdown + stcb->asoc.timoshutdownack; + xstcb.retransmitted_tsns = stcb->asoc.marked_retrans; + xstcb.start_time.tv_sec = (uint32_t) stcb->asoc.start_time.tv_sec; + xstcb.start_time.tv_usec = (uint32_t) stcb->asoc.start_time.tv_usec; + xstcb.discontinuity_time.tv_sec = (uint32_t) stcb->asoc.discontinuity_time.tv_sec; + xstcb.discontinuity_time.tv_usec = (uint32_t) stcb->asoc.discontinuity_time.tv_usec; + xstcb.total_sends = stcb->total_sends; + xstcb.total_recvs = stcb->total_recvs; + xstcb.local_tag = stcb->asoc.my_vtag; + xstcb.remote_tag = stcb->asoc.peer_vtag; + xstcb.initial_tsn = stcb->asoc.init_seq_number; + xstcb.highest_tsn = stcb->asoc.sending_seq - 1; + xstcb.cumulative_tsn = stcb->asoc.last_acked_seq; + xstcb.cumulative_tsn_ack = stcb->asoc.cumulative_tsn; + xstcb.mtu = stcb->asoc.smallest_mtu; + xstcb.refcnt = stcb->asoc.refcnt; + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb)); + if (error) { + SCTP_INP_DECR_REF(inp); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return error; + } + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + error = copy_out_local_addresses(inp, stcb, req); + if (error) { + SCTP_INP_DECR_REF(inp); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return error; + } + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + xraddr.last = 0; + xraddr.address = net->ro._l_addr; + xraddr.active = ((net->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE); + xraddr.confirmed = ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0); + xraddr.heartbeat_enabled = ((net->dest_state & SCTP_ADDR_NOHB) == 0); + xraddr.rto = net->RTO; + xraddr.max_path_rtx = net->failure_threshold; + xraddr.rtx = net->marked_retrans; + xraddr.error_counter = net->error_count; + xraddr.cwnd = net->cwnd; + xraddr.flight_size = net->flight_size; + xraddr.mtu = net->mtu; + xraddr.rtt = net->rtt; + xraddr.start_time.tv_sec = (uint32_t) net->start_time.tv_sec; + xraddr.start_time.tv_usec = (uint32_t) net->start_time.tv_usec; + SCTP_INP_RUNLOCK(inp); + 
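				/*
				 * As at every other SYSCTL_OUT() call in
				 * this handler, both the endpoint lock and
				 * the global INP_INFO lock are dropped
				 * first because the copyout may sleep;
				 * they are retaken before the traversal
				 * continues.
				 */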
SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr)); + if (error) { + SCTP_INP_DECR_REF(inp); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + return error; + } + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + } + atomic_subtract_int(&stcb->asoc.refcnt, 1); + memset((void *)&xraddr, 0, sizeof(struct xsctp_raddr)); + xraddr.last = 1; + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + error = SYSCTL_OUT(req, &xraddr, sizeof(struct xsctp_raddr)); + if (error) { + SCTP_INP_DECR_REF(inp); + return error; + } + SCTP_INP_INFO_RLOCK(); + SCTP_INP_RLOCK(inp); + } + SCTP_INP_DECR_REF(inp); + SCTP_INP_RUNLOCK(inp); + SCTP_INP_INFO_RUNLOCK(); + memset((void *)&xstcb, 0, sizeof(struct xsctp_tcb)); + xstcb.last = 1; + error = SYSCTL_OUT(req, &xstcb, sizeof(struct xsctp_tcb)); + if (error) { + return error; + } +skip: + SCTP_INP_INFO_RLOCK(); + } + SCTP_INP_INFO_RUNLOCK(); + + memset((void *)&xinpcb, 0, sizeof(struct xsctp_inpcb)); + xinpcb.last = 1; + error = SYSCTL_OUT(req, &xinpcb, sizeof(struct xsctp_inpcb)); + return error; +} + + +#define RANGECHK(var, min, max) \ + if ((var) < (min)) { (var) = (min); } \ + else if ((var) > (max)) { (var) = (max); } + +static int +sysctl_sctp_udp_tunneling_check(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t old_sctp_udp_tunneling_port; + + SCTP_INP_INFO_RLOCK(); + old_sctp_udp_tunneling_port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port); + SCTP_INP_INFO_RUNLOCK(); + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (error == 0) { + RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), SCTPCTL_UDP_TUNNELING_PORT_MIN, SCTPCTL_UDP_TUNNELING_PORT_MAX); + if (old_sctp_udp_tunneling_port == SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) { + error = 0; + goto out; + } + SCTP_INP_INFO_WLOCK(); + if (old_sctp_udp_tunneling_port) { + sctp_over_udp_stop(); + } + if (SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) { + if (sctp_over_udp_start()) { + SCTP_BASE_SYSCTL(sctp_udp_tunneling_port) = 0; + } + } + SCTP_INP_INFO_WUNLOCK(); + } +out: + return (error); +} + + +static int +sysctl_sctp_check(SYSCTL_HANDLER_ARGS) +{ + int error; + + error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); + if (error == 0) { + RANGECHK(SCTP_BASE_SYSCTL(sctp_sendspace), SCTPCTL_MAXDGRAM_MIN, SCTPCTL_MAXDGRAM_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_recvspace), SCTPCTL_RECVSPACE_MIN, SCTPCTL_RECVSPACE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_auto_asconf), SCTPCTL_AUTOASCONF_MIN, SCTPCTL_AUTOASCONF_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_enable), SCTPCTL_ECN_ENABLE_MIN, SCTPCTL_ECN_ENABLE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_ecn_nonce), SCTPCTL_ECN_NONCE_MIN, SCTPCTL_ECN_NONCE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_sacks), SCTPCTL_STRICT_SACKS_MIN, SCTPCTL_STRICT_SACKS_MAX); +#if !defined(SCTP_WITH_NO_CSUM) + RANGECHK(SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback), SCTPCTL_LOOPBACK_NOCSUM_MIN, SCTPCTL_LOOPBACK_NOCSUM_MAX); +#endif + RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_init), SCTPCTL_STRICT_INIT_MIN, SCTPCTL_STRICT_INIT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), SCTPCTL_PEER_CHKOH_MIN, SCTPCTL_PEER_CHKOH_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_max_burst_default), SCTPCTL_MAXBURST_MIN, SCTPCTL_MAXBURST_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), SCTPCTL_MAXCHUNKS_MIN, SCTPCTL_MAXCHUNKS_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_hashtblsize), SCTPCTL_TCBHASHSIZE_MIN, SCTPCTL_TCBHASHSIZE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_pcbtblsize), SCTPCTL_PCBHASHSIZE_MIN, SCTPCTL_PCBHASHSIZE_MAX); + 
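+ /* RANGECHK() silently clamps each tunable into its [MIN, MAX] window from sctp_sysctl.h rather than rejecting the write; sctp_udp_tunneling_port is absent from this list because sysctl_sctp_udp_tunneling_check() above clamps it itself. */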
RANGECHK(SCTP_BASE_SYSCTL(sctp_min_split_point), SCTPCTL_MIN_SPLIT_POINT_MIN, SCTPCTL_MIN_SPLIT_POINT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_chunkscale), SCTPCTL_CHUNKSCALE_MIN, SCTPCTL_CHUNKSCALE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), SCTPCTL_DELAYED_SACK_TIME_MIN, SCTPCTL_DELAYED_SACK_TIME_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_sack_freq_default), SCTPCTL_SACK_FREQ_MIN, SCTPCTL_SACK_FREQ_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), SCTPCTL_SYS_RESOURCE_MIN, SCTPCTL_SYS_RESOURCE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), SCTPCTL_ASOC_RESOURCE_MIN, SCTPCTL_ASOC_RESOURCE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), SCTPCTL_HEARTBEAT_INTERVAL_MIN, SCTPCTL_HEARTBEAT_INTERVAL_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), SCTPCTL_PMTU_RAISE_TIME_MIN, SCTPCTL_PMTU_RAISE_TIME_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), SCTPCTL_SHUTDOWN_GUARD_TIME_MIN, SCTPCTL_SHUTDOWN_GUARD_TIME_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), SCTPCTL_SECRET_LIFETIME_MIN, SCTPCTL_SECRET_LIFETIME_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_max_default), SCTPCTL_RTO_MAX_MIN, SCTPCTL_RTO_MAX_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_min_default), SCTPCTL_RTO_MIN_MIN, SCTPCTL_RTO_MIN_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_rto_initial_default), SCTPCTL_RTO_INITIAL_MIN, SCTPCTL_RTO_INITIAL_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rto_max_default), SCTPCTL_INIT_RTO_MAX_MIN, SCTPCTL_INIT_RTO_MAX_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), SCTPCTL_VALID_COOKIE_LIFE_MIN, SCTPCTL_VALID_COOKIE_LIFE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), SCTPCTL_INIT_RTX_MAX_MIN, SCTPCTL_INIT_RTX_MAX_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), SCTPCTL_ASSOC_RTX_MAX_MIN, SCTPCTL_ASSOC_RTX_MAX_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), SCTPCTL_PATH_RTX_MAX_MIN, SCTPCTL_PATH_RTX_MAX_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTPCTL_ADD_MORE_ON_OUTPUT_MIN, SCTPCTL_ADD_MORE_ON_OUTPUT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), SCTPCTL_OUTGOING_STREAMS_MIN, SCTPCTL_OUTGOING_STREAMS_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_on_off), SCTPCTL_CMT_ON_OFF_MIN, SCTPCTL_CMT_ON_OFF_MAX); + /* EY */ + RANGECHK(SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), SCTPCTL_NR_SACK_ON_OFF_MIN, SCTPCTL_NR_SACK_ON_OFF_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_use_dac), SCTPCTL_CMT_USE_DAC_MIN, SCTPCTL_CMT_USE_DAC_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_cmt_pf), SCTPCTL_CMT_PF_MIN, SCTPCTL_CMT_PF_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), SCTPCTL_CWND_MAXBURST_MIN, SCTPCTL_CWND_MAXBURST_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_early_fr), SCTPCTL_EARLY_FAST_RETRAN_MIN, SCTPCTL_EARLY_FAST_RETRAN_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_early_fr_msec), SCTPCTL_EARLY_FAST_RETRAN_MSEC_MIN, SCTPCTL_EARLY_FAST_RETRAN_MSEC_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), SCTPCTL_ASCONF_AUTH_NOCHK_MIN, SCTPCTL_ASCONF_AUTH_NOCHK_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_auth_disable), SCTPCTL_AUTH_DISABLE_MIN, SCTPCTL_AUTH_DISABLE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_nat_friendly), SCTPCTL_NAT_FRIENDLY_MIN, SCTPCTL_NAT_FRIENDLY_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_L2_abc_variable), SCTPCTL_ABC_L_VAR_MIN, SCTPCTL_ABC_L_VAR_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), SCTPCTL_MAX_CHAINED_MBUFS_MIN, SCTPCTL_MAX_CHAINED_MBUFS_MAX); + 
RANGECHK(SCTP_BASE_SYSCTL(sctp_do_drain), SCTPCTL_DO_SCTP_DRAIN_MIN, SCTPCTL_DO_SCTP_DRAIN_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_hb_maxburst), SCTPCTL_HB_MAX_BURST_MIN, SCTPCTL_HB_MAX_BURST_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), SCTPCTL_ABORT_AT_LIMIT_MIN, SCTPCTL_ABORT_AT_LIMIT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_strict_data_order), SCTPCTL_STRICT_DATA_ORDER_MIN, SCTPCTL_STRICT_DATA_ORDER_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_min_residual), SCTPCTL_MIN_RESIDUAL_MIN, SCTPCTL_MIN_RESIDUAL_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_max_retran_chunk), SCTPCTL_MAX_RETRAN_CHUNK_MIN, SCTPCTL_MAX_RETRAN_CHUNK_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_logging_level), SCTPCTL_LOGGING_LEVEL_MIN, SCTPCTL_LOGGING_LEVEL_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_default_cc_module), SCTPCTL_DEFAULT_CC_MODULE_MIN, SCTPCTL_DEFAULT_CC_MODULE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_default_frag_interleave), SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN, SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_vtag_time_wait), SCTPCTL_TIME_WAIT_MIN, SCTPCTL_TIME_WAIT_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_buffer_splitting), SCTPCTL_BUFFER_SPLITTING_MIN, SCTPCTL_BUFFER_SPLITTING_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_initial_cwnd), SCTPCTL_INITIAL_CWND_MIN, SCTPCTL_INITIAL_CWND_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_base), SCTPCTL_MOBILITY_BASE_MIN, SCTPCTL_MOBILITY_BASE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), SCTPCTL_MOBILITY_FASTHANDOFF_MIN, SCTPCTL_MOBILITY_FASTHANDOFF_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable), SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN, SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN, SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX); + RANGECHK(SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), SCTPCTL_NAT_FRIENDLY_INITS_MIN, SCTPCTL_NAT_FRIENDLY_INITS_MAX); + +#ifdef SCTP_DEBUG + RANGECHK(SCTP_BASE_SYSCTL(sctp_debug_on), SCTPCTL_DEBUG_MIN, SCTPCTL_DEBUG_MAX); +#endif +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + RANGECHK(SCTP_BASE_SYSCTL(sctp_output_unlocked), SCTPCTL_OUTPUT_UNLOCKED_MIN, SCTPCTL_OUTPUT_UNLOCKED_MAX); +#endif + } + return (error); +} + +#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) +static int +sysctl_stat_get(SYSCTL_HANDLER_ARGS) +{ + int cpu, error; + struct sctpstat sb, *sarry; + + memset(&sb, 0, sizeof(sb)); + for (cpu = 0; cpu < mp_ncpus; cpu++) { + sarry = &SCTP_BASE_STATS[cpu]; + if (sarry->sctps_discontinuitytime.tv_sec > sb.sctps_discontinuitytime.tv_sec) { + sb.sctps_discontinuitytime.tv_sec = sarry->sctps_discontinuitytime.tv_sec; + sb.sctps_discontinuitytime.tv_usec = sarry->sctps_discontinuitytime.tv_usec; + } + sb.sctps_currestab += sarry->sctps_currestab; + sb.sctps_activeestab += sarry->sctps_activeestab; + sb.sctps_restartestab += sarry->sctps_restartestab; + sb.sctps_collisionestab += sarry->sctps_collisionestab; + sb.sctps_passiveestab += sarry->sctps_passiveestab; + sb.sctps_aborted += sarry->sctps_aborted; + sb.sctps_shutdown += sarry->sctps_shutdown; + sb.sctps_outoftheblue += sarry->sctps_outoftheblue; + sb.sctps_checksumerrors += sarry->sctps_checksumerrors; + sb.sctps_outcontrolchunks += sarry->sctps_outcontrolchunks; + sb.sctps_outorderchunks += sarry->sctps_outorderchunks; + sb.sctps_outunorderchunks += sarry->sctps_outunorderchunks; + sb.sctps_incontrolchunks += sarry->sctps_incontrolchunks; + sb.sctps_inorderchunks += 
sarry->sctps_inorderchunks; + sb.sctps_inunorderchunks += sarry->sctps_inunorderchunks; + sb.sctps_fragusrmsgs += sarry->sctps_fragusrmsgs; + sb.sctps_reasmusrmsgs += sarry->sctps_reasmusrmsgs; + sb.sctps_outpackets += sarry->sctps_outpackets; + sb.sctps_inpackets += sarry->sctps_inpackets; + sb.sctps_recvpackets += sarry->sctps_recvpackets; + sb.sctps_recvdatagrams += sarry->sctps_recvdatagrams; + sb.sctps_recvpktwithdata += sarry->sctps_recvpktwithdata; + sb.sctps_recvsacks += sarry->sctps_recvsacks; + sb.sctps_recvdata += sarry->sctps_recvdata; + sb.sctps_recvdupdata += sarry->sctps_recvdupdata; + sb.sctps_recvheartbeat += sarry->sctps_recvheartbeat; + sb.sctps_recvheartbeatack += sarry->sctps_recvheartbeatack; + sb.sctps_recvecne += sarry->sctps_recvecne; + sb.sctps_recvauth += sarry->sctps_recvauth; + sb.sctps_recvauthmissing += sarry->sctps_recvauthmissing; + sb.sctps_recvivalhmacid += sarry->sctps_recvivalhmacid; + sb.sctps_recvivalkeyid += sarry->sctps_recvivalkeyid; + sb.sctps_recvauthfailed += sarry->sctps_recvauthfailed; + sb.sctps_recvexpress += sarry->sctps_recvexpress; + sb.sctps_recvexpressm += sarry->sctps_recvexpressm; + sb.sctps_recvnocrc += sarry->sctps_recvnocrc; + sb.sctps_recvswcrc += sarry->sctps_recvswcrc; + sb.sctps_recvhwcrc += sarry->sctps_recvhwcrc; + sb.sctps_sendpackets += sarry->sctps_sendpackets; + sb.sctps_sendsacks += sarry->sctps_sendsacks; + sb.sctps_senddata += sarry->sctps_senddata; + sb.sctps_sendretransdata += sarry->sctps_sendretransdata; + sb.sctps_sendfastretrans += sarry->sctps_sendfastretrans; + sb.sctps_sendmultfastretrans += sarry->sctps_sendmultfastretrans; + sb.sctps_sendheartbeat += sarry->sctps_sendheartbeat; + sb.sctps_sendecne += sarry->sctps_sendecne; + sb.sctps_sendauth += sarry->sctps_sendauth; + sb.sctps_senderrors += sarry->sctps_senderrors; + sb.sctps_sendnocrc += sarry->sctps_sendnocrc; + sb.sctps_sendswcrc += sarry->sctps_sendswcrc; + sb.sctps_sendhwcrc += sarry->sctps_sendhwcrc; + sb.sctps_pdrpfmbox += sarry->sctps_pdrpfmbox; + sb.sctps_pdrpfehos += sarry->sctps_pdrpfehos; + sb.sctps_pdrpmbda += sarry->sctps_pdrpmbda; + sb.sctps_pdrpmbct += sarry->sctps_pdrpmbct; + sb.sctps_pdrpbwrpt += sarry->sctps_pdrpbwrpt; + sb.sctps_pdrpcrupt += sarry->sctps_pdrpcrupt; + sb.sctps_pdrpnedat += sarry->sctps_pdrpnedat; + sb.sctps_pdrppdbrk += sarry->sctps_pdrppdbrk; + sb.sctps_pdrptsnnf += sarry->sctps_pdrptsnnf; + sb.sctps_pdrpdnfnd += sarry->sctps_pdrpdnfnd; + sb.sctps_pdrpdiwnp += sarry->sctps_pdrpdiwnp; + sb.sctps_pdrpdizrw += sarry->sctps_pdrpdizrw; + sb.sctps_pdrpbadd += sarry->sctps_pdrpbadd; + sb.sctps_pdrpmark += sarry->sctps_pdrpmark; + sb.sctps_timoiterator += sarry->sctps_timoiterator; + sb.sctps_timodata += sarry->sctps_timodata; + sb.sctps_timowindowprobe += sarry->sctps_timowindowprobe; + sb.sctps_timoinit += sarry->sctps_timoinit; + sb.sctps_timosack += sarry->sctps_timosack; + sb.sctps_timoshutdown += sarry->sctps_timoshutdown; + sb.sctps_timoheartbeat += sarry->sctps_timoheartbeat; + sb.sctps_timocookie += sarry->sctps_timocookie; + sb.sctps_timosecret += sarry->sctps_timosecret; + sb.sctps_timopathmtu += sarry->sctps_timopathmtu; + sb.sctps_timoshutdownack += sarry->sctps_timoshutdownack; + sb.sctps_timoshutdownguard += sarry->sctps_timoshutdownguard; + sb.sctps_timostrmrst += sarry->sctps_timostrmrst; + sb.sctps_timoearlyfr += sarry->sctps_timoearlyfr; + sb.sctps_timoasconf += sarry->sctps_timoasconf; + sb.sctps_timodelprim += sarry->sctps_timodelprim; + sb.sctps_timoautoclose += sarry->sctps_timoautoclose; + 
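+ /* (The remaining counters below are likewise summed over all CPUs; only sctps_discontinuitytime, handled at the top of this loop, is special-cased to keep the latest timestamp rather than a sum.) */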
sb.sctps_timoassockill += sarry->sctps_timoassockill; + sb.sctps_timoinpkill += sarry->sctps_timoinpkill; + sb.sctps_earlyfrstart += sarry->sctps_earlyfrstart; + sb.sctps_earlyfrstop += sarry->sctps_earlyfrstop; + sb.sctps_earlyfrmrkretrans += sarry->sctps_earlyfrmrkretrans; + sb.sctps_earlyfrstpout += sarry->sctps_earlyfrstpout; + sb.sctps_earlyfrstpidsck1 += sarry->sctps_earlyfrstpidsck1; + sb.sctps_earlyfrstpidsck2 += sarry->sctps_earlyfrstpidsck2; + sb.sctps_earlyfrstpidsck3 += sarry->sctps_earlyfrstpidsck3; + sb.sctps_earlyfrstpidsck4 += sarry->sctps_earlyfrstpidsck4; + sb.sctps_earlyfrstrid += sarry->sctps_earlyfrstrid; + sb.sctps_earlyfrstrout += sarry->sctps_earlyfrstrout; + sb.sctps_earlyfrstrtmr += sarry->sctps_earlyfrstrtmr; + sb.sctps_hdrops += sarry->sctps_hdrops; + sb.sctps_badsum += sarry->sctps_badsum; + sb.sctps_noport += sarry->sctps_noport; + sb.sctps_badvtag += sarry->sctps_badvtag; + sb.sctps_badsid += sarry->sctps_badsid; + sb.sctps_nomem += sarry->sctps_nomem; + sb.sctps_fastretransinrtt += sarry->sctps_fastretransinrtt; + sb.sctps_markedretrans += sarry->sctps_markedretrans; + sb.sctps_naglesent += sarry->sctps_naglesent; + sb.sctps_naglequeued += sarry->sctps_naglequeued; + sb.sctps_maxburstqueued += sarry->sctps_maxburstqueued; + sb.sctps_ifnomemqueued += sarry->sctps_ifnomemqueued; + sb.sctps_windowprobed += sarry->sctps_windowprobed; + sb.sctps_lowlevelerr += sarry->sctps_lowlevelerr; + sb.sctps_lowlevelerrusr += sarry->sctps_lowlevelerrusr; + sb.sctps_datadropchklmt += sarry->sctps_datadropchklmt; + sb.sctps_datadroprwnd += sarry->sctps_datadroprwnd; + sb.sctps_ecnereducedcwnd += sarry->sctps_ecnereducedcwnd; + sb.sctps_vtagexpress += sarry->sctps_vtagexpress; + sb.sctps_vtagbogus += sarry->sctps_vtagbogus; + sb.sctps_primary_randry += sarry->sctps_primary_randry; + sb.sctps_cmt_randry += sarry->sctps_cmt_randry; + sb.sctps_slowpath_sack += sarry->sctps_slowpath_sack; + sb.sctps_wu_sacks_sent += sarry->sctps_wu_sacks_sent; + sb.sctps_sends_with_flags += sarry->sctps_sends_with_flags; + sb.sctps_sends_with_unord += sarry->sctps_sends_with_unord; + sb.sctps_sends_with_eof += sarry->sctps_sends_with_eof; + sb.sctps_sends_with_abort += sarry->sctps_sends_with_abort; + sb.sctps_protocol_drain_calls += sarry->sctps_protocol_drain_calls; + sb.sctps_protocol_drains_done += sarry->sctps_protocol_drains_done; + sb.sctps_read_peeks += sarry->sctps_read_peeks; + sb.sctps_cached_chk += sarry->sctps_cached_chk; + sb.sctps_cached_strmoq += sarry->sctps_cached_strmoq; + sb.sctps_left_abandon += sarry->sctps_left_abandon; + sb.sctps_send_burst_avoid += sarry->sctps_send_burst_avoid; + sb.sctps_send_cwnd_avoid += sarry->sctps_send_cwnd_avoid; + sb.sctps_fwdtsn_map_over += sarry->sctps_fwdtsn_map_over; + } + error = SYSCTL_OUT(req, &sb, sizeof(sb)); + return (error); +} + +#endif + +#if defined(SCTP_LOCAL_TRACE_BUF) +static int +sysctl_sctp_cleartrace(SYSCTL_HANDLER_ARGS) +{ + int error = 0; + + memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); + return (error); +} + +#endif + + +/* + * sysctl definitions + */ + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sendspace, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_sendspace), 0, sysctl_sctp_check, "IU", + SCTPCTL_MAXDGRAM_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, recvspace, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_recvspace), 0, sysctl_sctp_check, "IU", + SCTPCTL_RECVSPACE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auto_asconf, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_auto_asconf), 0, 
sysctl_sctp_check, "IU", + SCTPCTL_AUTOASCONF_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, ecn_enable, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_ecn_enable), 0, sysctl_sctp_check, "IU", + SCTPCTL_ECN_ENABLE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, ecn_nonce, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_ecn_nonce), 0, sysctl_sctp_check, "IU", + SCTPCTL_ECN_NONCE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_sacks, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_strict_sacks), 0, sysctl_sctp_check, "IU", + SCTPCTL_STRICT_SACKS_DESC); + +#if !defined(SCTP_WITH_NO_CSUM) +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, loopback_nocsum, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_no_csum_on_loopback), 0, sysctl_sctp_check, "IU", + SCTPCTL_LOOPBACK_NOCSUM_DESC); +#endif + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_init, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_strict_init), 0, sysctl_sctp_check, "IU", + SCTPCTL_STRICT_INIT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, peer_chkoh, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_peer_chunk_oh), 0, sysctl_sctp_check, "IU", + SCTPCTL_PEER_CHKOH_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, maxburst, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_max_burst_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_MAXBURST_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, maxchunks, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue), 0, sysctl_sctp_check, "IU", + SCTPCTL_MAXCHUNKS_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, tcbhashsize, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_hashtblsize), 0, sysctl_sctp_check, "IU", + SCTPCTL_TCBHASHSIZE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, pcbhashsize, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_pcbtblsize), 0, sysctl_sctp_check, "IU", + SCTPCTL_PCBHASHSIZE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, min_split_point, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_min_split_point), 0, sysctl_sctp_check, "IU", + SCTPCTL_MIN_SPLIT_POINT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, chunkscale, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_chunkscale), 0, sysctl_sctp_check, "IU", + SCTPCTL_CHUNKSCALE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, delayed_sack_time, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_DELAYED_SACK_TIME_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sack_freq, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_sack_freq_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_SACK_FREQ_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, sys_resource, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_system_free_resc_limit), 0, sysctl_sctp_check, "IU", + SCTPCTL_SYS_RESOURCE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asoc_resource, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit), 0, sysctl_sctp_check, "IU", + SCTPCTL_ASOC_RESOURCE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, heartbeat_interval, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_HEARTBEAT_INTERVAL_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, pmtu_raise_time, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_PMTU_RAISE_TIME_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, shutdown_guard_time, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default), 0, sysctl_sctp_check, 
"IU", + SCTPCTL_SHUTDOWN_GUARD_TIME_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, secret_lifetime, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_secret_lifetime_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_SECRET_LIFETIME_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_max, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_rto_max_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_RTO_MAX_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_min, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_rto_min_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_RTO_MIN_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, rto_initial, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_rto_initial_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_RTO_INITIAL_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, init_rto_max, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_init_rto_max_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_INIT_RTO_MAX_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, valid_cookie_life, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_VALID_COOKIE_LIFE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, init_rtx_max, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_init_rtx_max_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_INIT_RTX_MAX_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoc_rtx_max, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_ASSOC_RTX_MAX_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, path_rtx_max, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_path_rtx_max_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_PATH_RTX_MAX_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, add_more_on_output, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_add_more_threshold), 0, sysctl_sctp_check, "IU", + SCTPCTL_ADD_MORE_ON_OUTPUT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, outgoing_streams, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default), 0, sysctl_sctp_check, "IU", + SCTPCTL_OUTGOING_STREAMS_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_on_off, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_cmt_on_off), 0, sysctl_sctp_check, "IU", + SCTPCTL_CMT_ON_OFF_DESC); + +/* EY */ +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nr_sack_on_off, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_nr_sack_on_off), 0, sysctl_sctp_check, "IU", + SCTPCTL_NR_SACK_ON_OFF_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_use_dac, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_cmt_use_dac), 0, sysctl_sctp_check, "IU", + SCTPCTL_CMT_USE_DAC_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cmt_pf, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_cmt_pf), 0, sysctl_sctp_check, "IU", + SCTPCTL_CMT_PF_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, cwnd_maxburst, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst), 0, sysctl_sctp_check, "IU", + SCTPCTL_CWND_MAXBURST_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, early_fast_retran, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_early_fr), 0, sysctl_sctp_check, "IU", + SCTPCTL_EARLY_FAST_RETRAN_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, early_fast_retran_msec, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_early_fr_msec), 0, sysctl_sctp_check, "IU", + SCTPCTL_EARLY_FAST_RETRAN_MSEC_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, asconf_auth_nochk, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_asconf_auth_nochk), 0, sysctl_sctp_check, 
"IU", + SCTPCTL_ASCONF_AUTH_NOCHK_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, auth_disable, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_auth_disable), 0, sysctl_sctp_check, "IU", + SCTPCTL_AUTH_DISABLE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nat_friendly, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_nat_friendly), 0, sysctl_sctp_check, "IU", + SCTPCTL_NAT_FRIENDLY_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, abc_l_var, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_L2_abc_variable), 0, sysctl_sctp_check, "IU", + SCTPCTL_ABC_L_VAR_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, max_chained_mbufs, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count), 0, sysctl_sctp_check, "IU", + SCTPCTL_MAX_CHAINED_MBUFS_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, do_sctp_drain, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_do_drain), 0, sysctl_sctp_check, "IU", + SCTPCTL_DO_SCTP_DRAIN_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, hb_max_burst, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_hb_maxburst), 0, sysctl_sctp_check, "IU", + SCTPCTL_HB_MAX_BURST_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, abort_at_limit, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit), 0, sysctl_sctp_check, "IU", + SCTPCTL_ABORT_AT_LIMIT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, strict_data_order, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_strict_data_order), 0, sysctl_sctp_check, "IU", + SCTPCTL_STRICT_DATA_ORDER_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, min_residual, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_min_residual), 0, sysctl_sctp_check, "IU", + SCTPCTL_MIN_RESIDUAL_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, max_retran_chunk, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_max_retran_chunk), 0, sysctl_sctp_check, "IU", + SCTPCTL_MAX_RETRAN_CHUNK_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, log_level, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_logging_level), 0, sysctl_sctp_check, "IU", + SCTPCTL_LOGGING_LEVEL_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, default_cc_module, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_default_cc_module), 0, sysctl_sctp_check, "IU", + SCTPCTL_DEFAULT_CC_MODULE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, default_frag_interleave, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_default_frag_interleave), 0, sysctl_sctp_check, "IU", + SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mobility_base, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_mobility_base), 0, sysctl_sctp_check, "IU", + SCTPCTL_MOBILITY_BASE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, mobility_fasthandoff, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff), 0, sysctl_sctp_check, "IU", + SCTPCTL_MOBILITY_FASTHANDOFF_DESC); + +#if defined(SCTP_LOCAL_TRACE_BUF) +SYSCTL_STRUCT(_net_inet_sctp, OID_AUTO, log, CTLFLAG_RD, + &SCTP_BASE_SYSCTL(sctp_log), sctp_log, + "SCTP logging (struct sctp_log)"); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, clear_trace, CTLTYPE_OPAQUE | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_log), 0, sysctl_sctp_cleartrace, "IU", + "Clear SCTP Logging buffer"); + + + +#endif + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_for_client_enable, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_udp_tunneling_for_client_enable), 0, sysctl_sctp_check, "IU", + SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, udp_tunneling_port, CTLTYPE_INT | CTLFLAG_RW, + 
&SCTP_BASE_SYSCTL(sctp_udp_tunneling_port), 0, sysctl_sctp_udp_tunneling_check, "IU", + SCTPCTL_UDP_TUNNELING_PORT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, enable_sack_immediately, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_enable_sack_immediately), 0, sysctl_sctp_check, "IU", + SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, nat_friendly_init, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly), 0, sysctl_sctp_check, "IU", + SCTPCTL_NAT_FRIENDLY_INITS_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, vtag_time_wait, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_vtag_time_wait), 0, sysctl_sctp_check, "IU", + SCTPCTL_TIME_WAIT_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, buffer_splitting, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_buffer_splitting), 0, sysctl_sctp_check, "IU", + SCTPCTL_BUFFER_SPLITTING_DESC); + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, initial_cwnd, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_initial_cwnd), 0, sysctl_sctp_check, "IU", + SCTPCTL_INITIAL_CWND_DESC); + +#ifdef SCTP_DEBUG +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, debug, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_debug_on), 0, sysctl_sctp_check, "IU", + SCTPCTL_DEBUG_DESC); +#endif /* SCTP_DEBUG */ + + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, output_unlocked, CTLTYPE_INT | CTLFLAG_RW, + &SCTP_BASE_SYSCTL(sctp_output_unlocked), 0, sysctl_sctp_check, "IU", + SCTPCTL_OUTPUT_UNLOCKED_DESC); +#endif +#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, stats, + CTLTYPE_STRUCT | CTLFLAG_RD, + 0, 0, sysctl_stat_get, "S,sctpstat", + "SCTP statistics (struct sctp_stat)"); +#else +SYSCTL_STRUCT(_net_inet_sctp, OID_AUTO, stats, CTLFLAG_RW, + &SCTP_BASE_STATS_SYSCTL, sctpstat, + "SCTP statistics (struct sctp_stat)"); +#endif + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, assoclist, CTLFLAG_RD, + 0, 0, sctp_assoclist, + "S,xassoc", "List of active SCTP associations"); diff --git a/freebsd/sys/netinet/sctp_sysctl.h b/freebsd/sys/netinet/sctp_sysctl.h new file mode 100644 index 00000000..5f7f270d --- /dev/null +++ b/freebsd/sys/netinet/sctp_sysctl.h @@ -0,0 +1,532 @@ +/*- + * Copyright (c) 2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#ifndef __sctp_sysctl_h__ +#define __sctp_sysctl_h__ + +#include +#include + +struct sctp_sysctl { + uint32_t sctp_sendspace; + uint32_t sctp_recvspace; + uint32_t sctp_auto_asconf; + uint32_t sctp_multiple_asconfs; + uint32_t sctp_ecn_enable; + uint32_t sctp_ecn_nonce; + uint32_t sctp_strict_sacks; +#if !defined(SCTP_WITH_NO_CSUM) + uint32_t sctp_no_csum_on_loopback; +#endif + uint32_t sctp_strict_init; + uint32_t sctp_peer_chunk_oh; + uint32_t sctp_max_burst_default; + uint32_t sctp_max_chunks_on_queue; + uint32_t sctp_hashtblsize; + uint32_t sctp_pcbtblsize; + uint32_t sctp_min_split_point; + uint32_t sctp_chunkscale; + uint32_t sctp_delayed_sack_time_default; + uint32_t sctp_sack_freq_default; + uint32_t sctp_system_free_resc_limit; + uint32_t sctp_asoc_free_resc_limit; + uint32_t sctp_heartbeat_interval_default; + uint32_t sctp_pmtu_raise_time_default; + uint32_t sctp_shutdown_guard_time_default; + uint32_t sctp_secret_lifetime_default; + uint32_t sctp_rto_max_default; + uint32_t sctp_rto_min_default; + uint32_t sctp_rto_initial_default; + uint32_t sctp_init_rto_max_default; + uint32_t sctp_valid_cookie_life_default; + uint32_t sctp_init_rtx_max_default; + uint32_t sctp_assoc_rtx_max_default; + uint32_t sctp_path_rtx_max_default; + uint32_t sctp_add_more_threshold; + uint32_t sctp_nr_outgoing_streams_default; + uint32_t sctp_cmt_on_off; + uint32_t sctp_cmt_use_dac; + /* EY 5/5/08 - nr_sack flag variable */ + uint32_t sctp_nr_sack_on_off; + uint32_t sctp_cmt_pf; + uint32_t sctp_use_cwnd_based_maxburst; + uint32_t sctp_early_fr; + uint32_t sctp_early_fr_msec; + uint32_t sctp_asconf_auth_nochk; + uint32_t sctp_auth_disable; + uint32_t sctp_nat_friendly; + uint32_t sctp_L2_abc_variable; + uint32_t sctp_mbuf_threshold_count; + uint32_t sctp_do_drain; + uint32_t sctp_hb_maxburst; + uint32_t sctp_abort_if_one_2_one_hits_limit; + uint32_t sctp_strict_data_order; + uint32_t sctp_min_residual; + uint32_t sctp_max_retran_chunk; + uint32_t sctp_logging_level; + /* JRS - Variable for default congestion control module */ + uint32_t sctp_default_cc_module; + uint32_t sctp_default_frag_interleave; + uint32_t sctp_mobility_base; + uint32_t sctp_mobility_fasthandoff; + uint32_t sctp_inits_include_nat_friendly; +#if defined(SCTP_LOCAL_TRACE_BUF) + struct sctp_log sctp_log; +#endif + uint32_t sctp_udp_tunneling_for_client_enable; + uint32_t sctp_udp_tunneling_port; + uint32_t sctp_enable_sack_immediately; + uint32_t sctp_vtag_time_wait; + uint32_t sctp_buffer_splitting; + uint32_t sctp_initial_cwnd; +#if defined(SCTP_DEBUG) + uint32_t sctp_debug_on; +#endif +#if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + uint32_t sctp_output_unlocked; +#endif +}; + +/* + * limits for the sysctl variables + */ +/* maxdgram: Maximum outgoing SCTP buffer size */ +#define SCTPCTL_MAXDGRAM_DESC "Maximum outgoing SCTP buffer size" +#define SCTPCTL_MAXDGRAM_MIN 0 +#define SCTPCTL_MAXDGRAM_MAX 0xFFFFFFFF +#define SCTPCTL_MAXDGRAM_DEFAULT 
262144 /* 256k */ + +/* recvspace: Maximum incoming SCTP buffer size */ +#define SCTPCTL_RECVSPACE_DESC "Maximum incoming SCTP buffer size" +#define SCTPCTL_RECVSPACE_MIN 0 +#define SCTPCTL_RECVSPACE_MAX 0xFFFFFFFF +#define SCTPCTL_RECVSPACE_DEFAULT 262144 /* 256k */ + +/* autoasconf: Enable SCTP Auto-ASCONF */ +#define SCTPCTL_AUTOASCONF_DESC "Enable SCTP Auto-ASCONF" +#define SCTPCTL_AUTOASCONF_MIN 0 +#define SCTPCTL_AUTOASCONF_MAX 1 +#define SCTPCTL_AUTOASCONF_DEFAULT SCTP_DEFAULT_AUTO_ASCONF + +/* multiple_asconfs: Enable SCTP Multiple-ASCONFs */ +#define SCTPCTL_MULTIPLEASCONFS_DESC "Enable SCTP Multiple-ASCONFs" +#define SCTPCTL_MULTIPLEASCONFS_MIN 0 +#define SCTPCTL_MULTIPLEASCONFS_MAX 1 +#define SCTPCTL_MULTIPLEASCONFS_DEFAULT SCTP_DEFAULT_MULTIPLE_ASCONFS + +/* ecn_enable: Enable SCTP ECN */ +#define SCTPCTL_ECN_ENABLE_DESC "Enable SCTP ECN" +#define SCTPCTL_ECN_ENABLE_MIN 0 +#define SCTPCTL_ECN_ENABLE_MAX 1 +#define SCTPCTL_ECN_ENABLE_DEFAULT 1 + +/* ecn_nonce: Enable SCTP ECN Nonce */ +#define SCTPCTL_ECN_NONCE_DESC "Enable SCTP ECN Nonce" +#define SCTPCTL_ECN_NONCE_MIN 0 +#define SCTPCTL_ECN_NONCE_MAX 1 +#define SCTPCTL_ECN_NONCE_DEFAULT 0 + +/* strict_sacks: Enable SCTP Strict SACK checking */ +#define SCTPCTL_STRICT_SACKS_DESC "Enable SCTP Strict SACK checking" +#define SCTPCTL_STRICT_SACKS_MIN 0 +#define SCTPCTL_STRICT_SACKS_MAX 1 +#define SCTPCTL_STRICT_SACKS_DEFAULT 1 + +/* loopback_nocsum: Enable NO Csum on packets sent on loopback */ +#define SCTPCTL_LOOPBACK_NOCSUM_DESC "Enable NO Csum on packets sent on loopback" +#define SCTPCTL_LOOPBACK_NOCSUM_MIN 0 +#define SCTPCTL_LOOPBACK_NOCSUM_MAX 1 +#define SCTPCTL_LOOPBACK_NOCSUM_DEFAULT 1 + +/* strict_init: Enable strict INIT/INIT-ACK singleton enforcement */ +#define SCTPCTL_STRICT_INIT_DESC "Enable strict INIT/INIT-ACK singleton enforcement" +#define SCTPCTL_STRICT_INIT_MIN 0 +#define SCTPCTL_STRICT_INIT_MAX 1 +#define SCTPCTL_STRICT_INIT_DEFAULT 1 + +/* peer_chkoh: Amount to debit peers rwnd per chunk sent */ +#define SCTPCTL_PEER_CHKOH_DESC "Amount to debit peers rwnd per chunk sent" +#define SCTPCTL_PEER_CHKOH_MIN 0 +#define SCTPCTL_PEER_CHKOH_MAX 0xFFFFFFFF +#define SCTPCTL_PEER_CHKOH_DEFAULT 256 + +/* maxburst: Default max burst for sctp endpoints */ +#define SCTPCTL_MAXBURST_DESC "Default max burst for sctp endpoints" +#define SCTPCTL_MAXBURST_MIN 1 +#define SCTPCTL_MAXBURST_MAX 0xFFFFFFFF +#define SCTPCTL_MAXBURST_DEFAULT SCTP_DEF_MAX_BURST + +/* maxchunks: Default max chunks on queue per asoc */ +#define SCTPCTL_MAXCHUNKS_DESC "Default max chunks on queue per asoc" +#define SCTPCTL_MAXCHUNKS_MIN 0 +#define SCTPCTL_MAXCHUNKS_MAX 0xFFFFFFFF +#define SCTPCTL_MAXCHUNKS_DEFAULT SCTP_ASOC_MAX_CHUNKS_ON_QUEUE + +/* tcbhashsize: Tuneable for Hash table sizes */ +#define SCTPCTL_TCBHASHSIZE_DESC "Tunable for TCB hash table sizes" +#define SCTPCTL_TCBHASHSIZE_MIN 1 +#define SCTPCTL_TCBHASHSIZE_MAX 0xFFFFFFFF +#define SCTPCTL_TCBHASHSIZE_DEFAULT SCTP_TCBHASHSIZE + +/* pcbhashsize: Tuneable for PCB Hash table sizes */ +#define SCTPCTL_PCBHASHSIZE_DESC "Tunable for PCB hash table sizes" +#define SCTPCTL_PCBHASHSIZE_MIN 1 +#define SCTPCTL_PCBHASHSIZE_MAX 0xFFFFFFFF +#define SCTPCTL_PCBHASHSIZE_DEFAULT SCTP_PCBHASHSIZE + +/* min_split_point: Minimum size when splitting a chunk */ +#define SCTPCTL_MIN_SPLIT_POINT_DESC "Minimum size when splitting a chunk" +#define SCTPCTL_MIN_SPLIT_POINT_MIN 0 +#define SCTPCTL_MIN_SPLIT_POINT_MAX 0xFFFFFFFF +#define SCTPCTL_MIN_SPLIT_POINT_DEFAULT SCTP_DEFAULT_SPLIT_POINT_MIN + +/* chunkscale: Tuneable for 
Scaling of number of chunks and messages */ +#define SCTPCTL_CHUNKSCALE_DESC "Tuneable for Scaling of number of chunks and messages" +#define SCTPCTL_CHUNKSCALE_MIN 1 +#define SCTPCTL_CHUNKSCALE_MAX 0xFFFFFFFF +#define SCTPCTL_CHUNKSCALE_DEFAULT SCTP_CHUNKQUEUE_SCALE + +/* delayed_sack_time: Default delayed SACK timer in msec */ +#define SCTPCTL_DELAYED_SACK_TIME_DESC "Default delayed SACK timer in msec" +#define SCTPCTL_DELAYED_SACK_TIME_MIN 0 +#define SCTPCTL_DELAYED_SACK_TIME_MAX 0xFFFFFFFF +#define SCTPCTL_DELAYED_SACK_TIME_DEFAULT SCTP_RECV_MSEC + +/* sack_freq: Default SACK frequency */ +#define SCTPCTL_SACK_FREQ_DESC "Default SACK frequency" +#define SCTPCTL_SACK_FREQ_MIN 0 +#define SCTPCTL_SACK_FREQ_MAX 0xFFFFFFFF +#define SCTPCTL_SACK_FREQ_DEFAULT SCTP_DEFAULT_SACK_FREQ + +/* sys_resource: Max number of cached resources in the system */ +#define SCTPCTL_SYS_RESOURCE_DESC "Max number of cached resources in the system" +#define SCTPCTL_SYS_RESOURCE_MIN 0 +#define SCTPCTL_SYS_RESOURCE_MAX 0xFFFFFFFF +#define SCTPCTL_SYS_RESOURCE_DEFAULT SCTP_DEF_SYSTEM_RESC_LIMIT + +/* asoc_resource: Max number of cached resources in an asoc */ +#define SCTPCTL_ASOC_RESOURCE_DESC "Max number of cached resources in an asoc" +#define SCTPCTL_ASOC_RESOURCE_MIN 0 +#define SCTPCTL_ASOC_RESOURCE_MAX 0xFFFFFFFF +#define SCTPCTL_ASOC_RESOURCE_DEFAULT SCTP_DEF_ASOC_RESC_LIMIT + +/* heartbeat_interval: Default heartbeat interval in msec */ +#define SCTPCTL_HEARTBEAT_INTERVAL_DESC "Default heartbeat interval in msec" +#define SCTPCTL_HEARTBEAT_INTERVAL_MIN 0 +#define SCTPCTL_HEARTBEAT_INTERVAL_MAX 0xFFFFFFFF +#define SCTPCTL_HEARTBEAT_INTERVAL_DEFAULT SCTP_HB_DEFAULT_MSEC + +/* pmtu_raise_time: Default PMTU raise timer in sec */ +#define SCTPCTL_PMTU_RAISE_TIME_DESC "Default PMTU raise timer in sec" +#define SCTPCTL_PMTU_RAISE_TIME_MIN 0 +#define SCTPCTL_PMTU_RAISE_TIME_MAX 0xFFFFFFFF +#define SCTPCTL_PMTU_RAISE_TIME_DEFAULT SCTP_DEF_PMTU_RAISE_SEC + +/* shutdown_guard_time: Default shutdown guard timer in sec */ +#define SCTPCTL_SHUTDOWN_GUARD_TIME_DESC "Default shutdown guard timer in sec" +#define SCTPCTL_SHUTDOWN_GUARD_TIME_MIN 0 +#define SCTPCTL_SHUTDOWN_GUARD_TIME_MAX 0xFFFFFFFF +#define SCTPCTL_SHUTDOWN_GUARD_TIME_DEFAULT SCTP_DEF_MAX_SHUTDOWN_SEC + +/* secret_lifetime: Default secret lifetime in sec */ +#define SCTPCTL_SECRET_LIFETIME_DESC "Default secret lifetime in sec" +#define SCTPCTL_SECRET_LIFETIME_MIN 0 +#define SCTPCTL_SECRET_LIFETIME_MAX 0xFFFFFFFF +#define SCTPCTL_SECRET_LIFETIME_DEFAULT SCTP_DEFAULT_SECRET_LIFE_SEC + +/* rto_max: Default maximum retransmission timeout in msec */ +#define SCTPCTL_RTO_MAX_DESC "Default maximum retransmission timeout in msec" +#define SCTPCTL_RTO_MAX_MIN 0 +#define SCTPCTL_RTO_MAX_MAX 0xFFFFFFFF +#define SCTPCTL_RTO_MAX_DEFAULT SCTP_RTO_UPPER_BOUND + +/* rto_min: Default minimum retransmission timeout in msec */ +#define SCTPCTL_RTO_MIN_DESC "Default minimum retransmission timeout in msec" +#define SCTPCTL_RTO_MIN_MIN 0 +#define SCTPCTL_RTO_MIN_MAX 0xFFFFFFFF +#define SCTPCTL_RTO_MIN_DEFAULT SCTP_RTO_LOWER_BOUND + +/* rto_initial: Default initial retransmission timeout in msec */ +#define SCTPCTL_RTO_INITIAL_DESC "Default initial retransmission timeout in msec" +#define SCTPCTL_RTO_INITIAL_MIN 0 +#define SCTPCTL_RTO_INITIAL_MAX 0xFFFFFFFF +#define SCTPCTL_RTO_INITIAL_DEFAULT SCTP_RTO_INITIAL + +/* init_rto_max: Default maximum retransmission timeout during association setup in msec */ +#define SCTPCTL_INIT_RTO_MAX_DESC "Default maximum retransmission timeout 
during association setup in msec" +#define SCTPCTL_INIT_RTO_MAX_MIN 0 +#define SCTPCTL_INIT_RTO_MAX_MAX 0xFFFFFFFF +#define SCTPCTL_INIT_RTO_MAX_DEFAULT SCTP_RTO_UPPER_BOUND + +/* valid_cookie_life: Default cookie lifetime in sec */ +#define SCTPCTL_VALID_COOKIE_LIFE_DESC "Default cookie lifetime in sec" +#define SCTPCTL_VALID_COOKIE_LIFE_MIN 0 +#define SCTPCTL_VALID_COOKIE_LIFE_MAX 0xFFFFFFFF +#define SCTPCTL_VALID_COOKIE_LIFE_DEFAULT SCTP_DEFAULT_COOKIE_LIFE + +/* init_rtx_max: Default maximum number of retransmission for INIT chunks */ +#define SCTPCTL_INIT_RTX_MAX_DESC "Default maximum number of retransmission for INIT chunks" +#define SCTPCTL_INIT_RTX_MAX_MIN 0 +#define SCTPCTL_INIT_RTX_MAX_MAX 0xFFFFFFFF +#define SCTPCTL_INIT_RTX_MAX_DEFAULT SCTP_DEF_MAX_INIT + +/* assoc_rtx_max: Default maximum number of retransmissions per association */ +#define SCTPCTL_ASSOC_RTX_MAX_DESC "Default maximum number of retransmissions per association" +#define SCTPCTL_ASSOC_RTX_MAX_MIN 0 +#define SCTPCTL_ASSOC_RTX_MAX_MAX 0xFFFFFFFF +#define SCTPCTL_ASSOC_RTX_MAX_DEFAULT SCTP_DEF_MAX_SEND + +/* path_rtx_max: Default maximum of retransmissions per path */ +#define SCTPCTL_PATH_RTX_MAX_DESC "Default maximum of retransmissions per path" +#define SCTPCTL_PATH_RTX_MAX_MIN 0 +#define SCTPCTL_PATH_RTX_MAX_MAX 0xFFFFFFFF +#define SCTPCTL_PATH_RTX_MAX_DEFAULT SCTP_DEF_MAX_PATH_RTX + +/* add_more_on_output: When space wise is it worthwhile to try to add more to a socket send buffer */ +#define SCTPCTL_ADD_MORE_ON_OUTPUT_DESC "When space wise is it worthwhile to try to add more to a socket send buffer" +#define SCTPCTL_ADD_MORE_ON_OUTPUT_MIN 0 +#define SCTPCTL_ADD_MORE_ON_OUTPUT_MAX 0xFFFFFFFF +#define SCTPCTL_ADD_MORE_ON_OUTPUT_DEFAULT SCTP_DEFAULT_ADD_MORE + +/* outgoing_streams: Default number of outgoing streams */ +#define SCTPCTL_OUTGOING_STREAMS_DESC "Default number of outgoing streams" +#define SCTPCTL_OUTGOING_STREAMS_MIN 1 +#define SCTPCTL_OUTGOING_STREAMS_MAX 65535 +#define SCTPCTL_OUTGOING_STREAMS_DEFAULT SCTP_OSTREAM_INITIAL + +/* cmt_on_off: CMT on/off flag */ +#define SCTPCTL_CMT_ON_OFF_DESC "CMT on/off flag" +#define SCTPCTL_CMT_ON_OFF_MIN 0 +#define SCTPCTL_CMT_ON_OFF_MAX 1 +#define SCTPCTL_CMT_ON_OFF_DEFAULT 0 + +/* EY - nr_sack_on_off: NR_SACK on/off flag */ +#define SCTPCTL_NR_SACK_ON_OFF_DESC "NR_SACK on/off flag" +#define SCTPCTL_NR_SACK_ON_OFF_MIN 0 +#define SCTPCTL_NR_SACK_ON_OFF_MAX 1 +#define SCTPCTL_NR_SACK_ON_OFF_DEFAULT 0 + +/* cmt_use_dac: CMT DAC on/off flag */ +#define SCTPCTL_CMT_USE_DAC_DESC "CMT DAC on/off flag" +#define SCTPCTL_CMT_USE_DAC_MIN 0 +#define SCTPCTL_CMT_USE_DAC_MAX 1 +#define SCTPCTL_CMT_USE_DAC_DEFAULT 0 + +/* JRS 5/2107 - CMT PF type flag */ +#define SCTPCTL_CMT_PF_DESC "CMT PF type flag" +#define SCTPCTL_CMT_PF_MIN 0 +#define SCTPCTL_CMT_PF_MAX 2 +#define SCTPCTL_CMT_PF_DEFAULT 0 + +/* cwnd_maxburst: Use a CWND adjusting maxburst */ +#define SCTPCTL_CWND_MAXBURST_DESC "Use a CWND adjusting maxburst" +#define SCTPCTL_CWND_MAXBURST_MIN 0 +#define SCTPCTL_CWND_MAXBURST_MAX 1 +#define SCTPCTL_CWND_MAXBURST_DEFAULT 1 + +/* early_fast_retran: Early Fast Retransmit with timer */ +#define SCTPCTL_EARLY_FAST_RETRAN_DESC "Early Fast Retransmit with timer" +#define SCTPCTL_EARLY_FAST_RETRAN_MIN 0 +#define SCTPCTL_EARLY_FAST_RETRAN_MAX 0xFFFFFFFF +#define SCTPCTL_EARLY_FAST_RETRAN_DEFAULT 0 + +/* early_fast_retran_msec: Early Fast Retransmit minimum timer value */ +#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_DESC "Early Fast Retransmit minimum timer value" +#define 
SCTPCTL_EARLY_FAST_RETRAN_MSEC_MIN 0 +#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_MAX 0xFFFFFFFF +#define SCTPCTL_EARLY_FAST_RETRAN_MSEC_DEFAULT SCTP_MINFR_MSEC_TIMER + +/* asconf_auth_nochk: Disable SCTP ASCONF AUTH requirement */ +#define SCTPCTL_ASCONF_AUTH_NOCHK_DESC "Disable SCTP ASCONF AUTH requirement" +#define SCTPCTL_ASCONF_AUTH_NOCHK_MIN 0 +#define SCTPCTL_ASCONF_AUTH_NOCHK_MAX 1 +#define SCTPCTL_ASCONF_AUTH_NOCHK_DEFAULT 0 + +/* auth_disable: Disable SCTP AUTH function */ +#define SCTPCTL_AUTH_DISABLE_DESC "Disable SCTP AUTH function" +#define SCTPCTL_AUTH_DISABLE_MIN 0 +#define SCTPCTL_AUTH_DISABLE_MAX 1 +#define SCTPCTL_AUTH_DISABLE_DEFAULT 0 + +/* nat_friendly: SCTP NAT friendly operation */ +#define SCTPCTL_NAT_FRIENDLY_DESC "SCTP NAT friendly operation" +#define SCTPCTL_NAT_FRIENDLY_MIN 0 +#define SCTPCTL_NAT_FRIENDLY_MAX 1 +#define SCTPCTL_NAT_FRIENDLY_DEFAULT 1 + +/* abc_l_var: SCTP ABC max increase per SACK (L) */ +#define SCTPCTL_ABC_L_VAR_DESC "SCTP ABC max increase per SACK (L)" +#define SCTPCTL_ABC_L_VAR_MIN 0 +#define SCTPCTL_ABC_L_VAR_MAX 0xFFFFFFFF +#define SCTPCTL_ABC_L_VAR_DEFAULT 1 + +/* max_chained_mbufs: Default max number of small mbufs on a chain */ +#define SCTPCTL_MAX_CHAINED_MBUFS_DESC "Default max number of small mbufs on a chain" +#define SCTPCTL_MAX_CHAINED_MBUFS_MIN 0 +#define SCTPCTL_MAX_CHAINED_MBUFS_MAX 0xFFFFFFFF +#define SCTPCTL_MAX_CHAINED_MBUFS_DEFAULT SCTP_DEFAULT_MBUFS_IN_CHAIN + +/* do_sctp_drain: Should SCTP respond to the drain calls */ +#define SCTPCTL_DO_SCTP_DRAIN_DESC "Should SCTP respond to the drain calls" +#define SCTPCTL_DO_SCTP_DRAIN_MIN 0 +#define SCTPCTL_DO_SCTP_DRAIN_MAX 1 +#define SCTPCTL_DO_SCTP_DRAIN_DEFAULT 1 + +/* hb_max_burst: Confirmation Heartbeat max burst? */ +#define SCTPCTL_HB_MAX_BURST_DESC "Confirmation Heartbeat max burst" +#define SCTPCTL_HB_MAX_BURST_MIN 1 +#define SCTPCTL_HB_MAX_BURST_MAX 0xFFFFFFFF +#define SCTPCTL_HB_MAX_BURST_DEFAULT SCTP_DEF_MAX_BURST + +/* abort_at_limit: When one-2-one hits qlimit abort */ +#define SCTPCTL_ABORT_AT_LIMIT_DESC "When one-2-one hits qlimit abort" +#define SCTPCTL_ABORT_AT_LIMIT_MIN 0 +#define SCTPCTL_ABORT_AT_LIMIT_MAX 1 +#define SCTPCTL_ABORT_AT_LIMIT_DEFAULT 0 + +/* strict_data_order: Enforce strict data ordering, abort if control inside data */ +#define SCTPCTL_STRICT_DATA_ORDER_DESC "Enforce strict data ordering, abort if control inside data" +#define SCTPCTL_STRICT_DATA_ORDER_MIN 0 +#define SCTPCTL_STRICT_DATA_ORDER_MAX 1 +#define SCTPCTL_STRICT_DATA_ORDER_DEFAULT 0 + +/* min_residual: min residual in a data fragment leftover */ +#define SCTPCTL_MIN_RESIDUAL_DESC "Minimum residual data chunk in second part of split" +#define SCTPCTL_MIN_RESIDUAL_MIN 20 +#define SCTPCTL_MIN_RESIDUAL_MAX 65535 +#define SCTPCTL_MIN_RESIDUAL_DEFAULT 1452 + +/* max_retran_chunk: max chunk retransmissions */ +#define SCTPCTL_MAX_RETRAN_CHUNK_DESC "Maximum times an unlucky chunk can be retran'd before assoc abort" +#define SCTPCTL_MAX_RETRAN_CHUNK_MIN 0 +#define SCTPCTL_MAX_RETRAN_CHUNK_MAX 65535 +#define SCTPCTL_MAX_RETRAN_CHUNK_DEFAULT 30 + +/* sctp_logging: This gives us logging when the options are enabled */ +#define SCTPCTL_LOGGING_LEVEL_DESC "Ltrace/KTR trace logging level" +#define SCTPCTL_LOGGING_LEVEL_MIN 0 +#define SCTPCTL_LOGGING_LEVEL_MAX 0xffffffff +#define SCTPCTL_LOGGING_LEVEL_DEFAULT 0 + +/* JRS - default congestion control module sysctl */ +#define SCTPCTL_DEFAULT_CC_MODULE_DESC "Default congestion control module" +#define SCTPCTL_DEFAULT_CC_MODULE_MIN 0 +#define 
SCTPCTL_DEFAULT_CC_MODULE_MAX 2 +#define SCTPCTL_DEFAULT_CC_MODULE_DEFAULT 0 + +/* RRS - default fragment interleave */ +#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DESC "Default fragment interleave level" +#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MIN 0 +#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_MAX 2 +#define SCTPCTL_DEFAULT_FRAG_INTERLEAVE_DEFAULT 1 + +/* mobility_base: Enable SCTP mobility support */ +#define SCTPCTL_MOBILITY_BASE_DESC "Enable SCTP base mobility" +#define SCTPCTL_MOBILITY_BASE_MIN 0 +#define SCTPCTL_MOBILITY_BASE_MAX 1 +#define SCTPCTL_MOBILITY_BASE_DEFAULT SCTP_DEFAULT_MOBILITY_BASE + +/* mobility_fasthandoff: Enable SCTP fast handoff support */ +#define SCTPCTL_MOBILITY_FASTHANDOFF_DESC "Enable SCTP fast handoff" +#define SCTPCTL_MOBILITY_FASTHANDOFF_MIN 0 +#define SCTPCTL_MOBILITY_FASTHANDOFF_MAX 1 +#define SCTPCTL_MOBILITY_FASTHANDOFF_DEFAULT SCTP_DEFAULT_MOBILITY_FASTHANDOFF + +/* Enable SCTP/UDP tunneling for clients*/ +#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DESC "Enable SCTP/UDP tunneling for client" +#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN 0 +#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MAX 1 +#define SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_DEFAULT SCTPCTL_UDP_TUNNELING_FOR_CLIENT_ENABLE_MIN + +/* Enable SCTP/UDP tunneling port */ +#define SCTPCTL_UDP_TUNNELING_PORT_DESC "Set the SCTP/UDP tunneling port" +#define SCTPCTL_UDP_TUNNELING_PORT_MIN 0 +#define SCTPCTL_UDP_TUNNELING_PORT_MAX 65535 +#define SCTPCTL_UDP_TUNNELING_PORT_DEFAULT SCTP_OVER_UDP_TUNNELING_PORT + +/* Enable sending of the SACK-IMMEDIATELY bit */ +#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DESC "Enable sending of the SACK-IMMEDIATELY-bit." +#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN 0 +#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_MAX 1 +#define SCTPCTL_SACK_IMMEDIATELY_ENABLE_DEFAULT SCTPCTL_SACK_IMMEDIATELY_ENABLE_MIN + +/* Enable sending of the NAT-FRIENDLY message */ +#define SCTPCTL_NAT_FRIENDLY_INITS_DESC "Enable sending of the nat-friendly SCTP option on INITs." +#define SCTPCTL_NAT_FRIENDLY_INITS_MIN 0 +#define SCTPCTL_NAT_FRIENDLY_INITS_MAX 1 +#define SCTPCTL_NAT_FRIENDLY_INITS_DEFAULT SCTPCTL_NAT_FRIENDLY_INITS_MIN + +/* Vtag time wait in seconds */ +#define SCTPCTL_TIME_WAIT_DESC "Vtag time wait time in seconds, 0 disables it." +#define SCTPCTL_TIME_WAIT_MIN 0 +#define SCTPCTL_TIME_WAIT_MAX 0xffffffff +#define SCTPCTL_TIME_WAIT_DEFAULT SCTP_TIME_WAIT + +/* Enable Send/Receive buffer splitting */ +#define SCTPCTL_BUFFER_SPLITTING_DESC "Enable send/receive buffer splitting." +#define SCTPCTL_BUFFER_SPLITTING_MIN 0 +#define SCTPCTL_BUFFER_SPLITTING_MAX 0x3 +#define SCTPCTL_BUFFER_SPLITTING_DEFAULT SCTPCTL_BUFFER_SPLITTING_MIN + +/* Initial congestion window in MTU */ +#define SCTPCTL_INITIAL_CWND_DESC "Initial congestion window in MTUs" +#define SCTPCTL_INITIAL_CWND_MIN 1 +#define SCTPCTL_INITIAL_CWND_MAX 0xffffffff +#define SCTPCTL_INITIAL_CWND_DEFAULT 3 + +#if defined(SCTP_DEBUG) +/* debug: Configure debug output */ +#define SCTPCTL_DEBUG_DESC "Configure debug output" +#define SCTPCTL_DEBUG_MIN 0 +#define SCTPCTL_DEBUG_MAX 0xFFFFFFFF +#define SCTPCTL_DEBUG_DEFAULT 0 +#endif + + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) +#define SCTPCTL_OUTPUT_UNLOCKED_DESC "Unlock socket when sending packets down to IP." 
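+/* (A boolean tunable: 0, the default, keeps the socket locked across IP output; 1 enables the unlocked send path.) */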
+#define SCTPCTL_OUTPUT_UNLOCKED_MIN 0 +#define SCTPCTL_OUTPUT_UNLOCKED_MAX 1 +#define SCTPCTL_OUTPUT_UNLOCKED_DEFAULT SCTPCTL_OUTPUT_UNLOCKED_MIN +#endif + + +#if defined(_KERNEL) || defined(__Userspace__) +#if defined(SYSCTL_DECL) +SYSCTL_DECL(_net_inet_sctp); +#endif + +void sctp_init_sysctls(void); + +#endif /* _KERNEL */ +#endif /* __sctp_sysctl_h__ */ diff --git a/freebsd/sys/netinet/sctp_timer.c b/freebsd/sys/netinet/sctp_timer.c new file mode 100644 index 00000000..090689b1 --- /dev/null +++ b/freebsd/sys/netinet/sctp_timer.c @@ -0,0 +1,1804 @@ +#include + +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* $KAME: sctp_timer.c,v 1.29 2005/03/06 16:04:18 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#define _IP_VHL +#include +#include +#ifdef INET6 +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +void +sctp_early_fr_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + struct sctp_tmit_chunk *chk, *tp2; + struct timeval now, min_wait, tv; + unsigned int cur_rtt, cnt = 0, cnt_resend = 0; + + /* an early FR is occurring. */ + (void)SCTP_GETTIME_TIMEVAL(&now); + /* get cur rto in micro-seconds */ + if (net->lastsa == 0) { + /* Hmm no rtt estimate yet? */ + cur_rtt = stcb->asoc.initial_rto >> 2; + } else { + + cur_rtt = ((net->lastsa >> 2) + net->lastsv) >> 1; + } + if (cur_rtt < SCTP_BASE_SYSCTL(sctp_early_fr_msec)) { + cur_rtt = SCTP_BASE_SYSCTL(sctp_early_fr_msec); + } + cur_rtt *= 1000; + tv.tv_sec = cur_rtt / 1000000; + tv.tv_usec = cur_rtt % 1000000; + min_wait = now; + timevalsub(&min_wait, &tv); + if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) { + /* + * if we hit here, we don't have enough seconds on the clock + * to account for the RTO. We just let the lower seconds be + * the bounds and don't worry about it. This may mean we + * will mark a lot more than we should. 
+ */ + min_wait.tv_sec = min_wait.tv_usec = 0; + } + chk = TAILQ_LAST(&stcb->asoc.sent_queue, sctpchunk_listhead); + for (; chk != NULL; chk = tp2) { + tp2 = TAILQ_PREV(chk, sctpchunk_listhead, sctp_next); + if (chk->whoTo != net) { + continue; + } + if (chk->sent == SCTP_DATAGRAM_RESEND) + cnt_resend++; + else if ((chk->sent > SCTP_DATAGRAM_UNSENT) && + (chk->sent < SCTP_DATAGRAM_RESEND)) { + /* pending, may need retran */ + if (chk->sent_rcv_time.tv_sec > min_wait.tv_sec) { + /* + * we have reached a chunk that was sent + * some seconds past our min.. forget it we + * will find no more to send. + */ + continue; + } else if (chk->sent_rcv_time.tv_sec == min_wait.tv_sec) { + /* + * we must look at the micro seconds to + * know. + */ + if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) { + /* + * ok it was sent after our boundary + * time. + */ + continue; + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_EARLYFR_LOGGING_ENABLE) { + sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count, + 4, SCTP_FR_MARKED_EARLY); + } + SCTP_STAT_INCR(sctps_earlyfrmrkretrans); + chk->sent = SCTP_DATAGRAM_RESEND; + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + /* double book size since we are doing an early FR */ + chk->book_size_scale++; + cnt += chk->send_size; + if ((cnt + net->flight_size) > net->cwnd) { + /* Mark all we could possibly resend */ + break; + } + } + } + if (cnt) { + /* + * JRS - Use the congestion control given in the congestion + * control module + */ + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer(inp, stcb, net); + } else if (cnt_resend) { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_EARLY_FR_TMR, SCTP_SO_NOT_LOCKED); + } + /* Restart it? */ + if (net->flight_size < net->cwnd) { + SCTP_STAT_INCR(sctps_earlyfrstrtmr); + sctp_timer_start(SCTP_TIMER_TYPE_EARLYFR, stcb->sctp_ep, stcb, net); + } +} + +void +sctp_audit_retranmission_queue(struct sctp_association *asoc) +{ + struct sctp_tmit_chunk *chk; + + SCTPDBG(SCTP_DEBUG_TIMER4, "Audit invoked on send queue cnt:%d onqueue:%d\n", + asoc->sent_queue_retran_cnt, + asoc->sent_queue_cnt); + asoc->sent_queue_retran_cnt = 0; + asoc->sent_queue_cnt = 0; + TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { + if (chk->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(asoc->sent_queue_retran_cnt); + } + asoc->sent_queue_cnt++; + } + TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { + if (chk->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(asoc->sent_queue_retran_cnt); + } + } + TAILQ_FOREACH(chk, &asoc->asconf_send_queue, sctp_next) { + if (chk->sent == SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(asoc->sent_queue_retran_cnt); + } + } + SCTPDBG(SCTP_DEBUG_TIMER4, "Audit completes retran:%d onqueue:%d\n", + asoc->sent_queue_retran_cnt, + asoc->sent_queue_cnt); +} + +int +sctp_threshold_management(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net, uint16_t threshold) +{ + if (net) { + net->error_count++; + SCTPDBG(SCTP_DEBUG_TIMER4, "Error count for %p now %d thresh:%d\n", + net, net->error_count, + net->failure_threshold); + if (net->error_count > net->failure_threshold) { + /* We had a threshold failure */ + if (net->dest_state & SCTP_ADDR_REACHABLE) { + net->dest_state &= ~SCTP_ADDR_REACHABLE; + net->dest_state |= SCTP_ADDR_NOT_REACHABLE; + net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY; + if (net == stcb->asoc.primary_destination) { + net->dest_state |= SCTP_ADDR_WAS_PRIMARY; + } + /* + * JRS 5/14/07 - If a destination is + * unreachable, the PF bit is turned off. 
+                 * This allows an unambiguous use of the PF
+                 * bit for destinations that are reachable
+                 * but potentially failed. If the
+                 * destination is set to the unreachable
+                 * state, also set the destination to the PF
+                 * state.
+                 */
+                /*
+                 * Add debug message here if destination is
+                 * not in PF state.
+                 */
+                /* Stop any running T3 timers here? */
+                if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+                    (stcb->asoc.sctp_cmt_pf > 0)) {
+                    net->dest_state &= ~SCTP_ADDR_PF;
+                    SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to unreachable.\n",
+                        net);
+                }
+                sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
+                    stcb,
+                    SCTP_FAILED_THRESHOLD,
+                    (void *)net, SCTP_SO_NOT_LOCKED);
+            }
+        }
+        /*********HOLD THIS COMMENT FOR PATCH OF ALTERNATE
+         *********ROUTING CODE
+         */
+        /*********HOLD THIS COMMENT FOR END OF PATCH OF ALTERNATE
+         *********ROUTING CODE
+         */
+    }
+    if (stcb == NULL)
+        return (0);
+
+    if (net) {
+        if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
+            if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+                sctp_misc_ints(SCTP_THRESHOLD_INCR,
+                    stcb->asoc.overall_error_count,
+                    (stcb->asoc.overall_error_count + 1),
+                    SCTP_FROM_SCTP_TIMER,
+                    __LINE__);
+            }
+            stcb->asoc.overall_error_count++;
+        }
+    } else {
+        if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) {
+            sctp_misc_ints(SCTP_THRESHOLD_INCR,
+                stcb->asoc.overall_error_count,
+                (stcb->asoc.overall_error_count + 1),
+                SCTP_FROM_SCTP_TIMER,
+                __LINE__);
+        }
+        stcb->asoc.overall_error_count++;
+    }
+    SCTPDBG(SCTP_DEBUG_TIMER4, "Overall error count for %p now %d thresh:%u state:%x\n",
+        &stcb->asoc, stcb->asoc.overall_error_count,
+        (uint32_t) threshold,
+        ((net == NULL) ? (uint32_t) 0 : (uint32_t) net->dest_state));
+    /*
+     * We specifically do not do >= to give the assoc one more chance
+     * before we fail it.
+     */
+    if (stcb->asoc.overall_error_count > threshold) {
+        /* Abort notification sends a ULP notify */
+        struct mbuf *oper;
+
+        oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+            0, M_DONTWAIT, 1, MT_DATA);
+        if (oper) {
+            struct sctp_paramhdr *ph;
+            uint32_t *ippp;
+
+            SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+                sizeof(uint32_t);
+            ph = mtod(oper, struct sctp_paramhdr *);
+            ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+            ph->param_length = htons(SCTP_BUF_LEN(oper));
+            ippp = (uint32_t *) (ph + 1);
+            *ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_1);
+        }
+        inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_1;
+        sctp_abort_an_association(inp, stcb, SCTP_FAILED_THRESHOLD, oper, SCTP_SO_NOT_LOCKED);
+        return (1);
+    }
+    return (0);
+}
+
+/*
+ * sctp_find_alternate_net() returns a non-NULL pointer as long
+ * as the argument net is non-NULL.
+ */
+struct sctp_nets *
+sctp_find_alternate_net(struct sctp_tcb *stcb,
+    struct sctp_nets *net,
+    int mode)
+{
+    /* Find and return an alternate network if possible */
+    struct sctp_nets *alt, *mnet, *min_errors_net = NULL, *max_cwnd_net = NULL;
+    int once;
+
+    /* JRS 5/14/07 - Initialize min_errors to an impossible value. */
+    int min_errors = -1;
+    uint32_t max_cwnd = 0;
+
+    if (stcb->asoc.numnets == 1) {
+        /* No others but net */
+        return (TAILQ_FIRST(&stcb->asoc.nets));
+    }
+    /*
+     * JRS 5/14/07 - If mode is set to 2, use the CMT PF find alternate
+     * net algorithm. This algorithm chooses the active destination (not
+     * in PF state) with the largest cwnd value. If all destinations are
+     * in PF state, unreachable, or unconfirmed, choose the destination
+     * that is in PF state with the lowest error count. In case of a
+     * tie, choose the destination that was most recently active.
+     */
+    if (mode == 2) {
+        TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) {
+            /*
+             * JRS 5/14/07 - If the destination is unreachable
+             * or unconfirmed, skip it.
+             */
+            if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) ||
+                (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)) {
+                continue;
+            }
+            /*
+             * JRS 5/14/07 - If the destination is reachable
+             * but in PF state, compare the error count of the
+             * destination to the minimum error count seen thus
+             * far. Store the destination with the lower error
+             * count. If the error counts are equal, store the
+             * destination that was most recently active.
+             */
+            if (mnet->dest_state & SCTP_ADDR_PF) {
+                /*
+                 * JRS 5/14/07 - If the destination under
+                 * consideration is the current destination,
+                 * work as if the error count is one higher.
+                 * The actual error count will not be
+                 * incremented until later in the t3
+                 * handler.
+                 */
+                if (mnet == net) {
+                    if (min_errors == -1) {
+                        min_errors = mnet->error_count + 1;
+                        min_errors_net = mnet;
+                    } else if (mnet->error_count + 1 < min_errors) {
+                        min_errors = mnet->error_count + 1;
+                        min_errors_net = mnet;
+                    } else if (mnet->error_count + 1 == min_errors
+                        && mnet->last_active > min_errors_net->last_active) {
+                        min_errors_net = mnet;
+                        min_errors = mnet->error_count + 1;
+                    }
+                    continue;
+                } else {
+                    if (min_errors == -1) {
+                        min_errors = mnet->error_count;
+                        min_errors_net = mnet;
+                    } else if (mnet->error_count < min_errors) {
+                        min_errors = mnet->error_count;
+                        min_errors_net = mnet;
+                    } else if (mnet->error_count == min_errors
+                        && mnet->last_active > min_errors_net->last_active) {
+                        min_errors_net = mnet;
+                        min_errors = mnet->error_count;
+                    }
+                    continue;
+                }
+            }
+            /*
+             * JRS 5/14/07 - If the destination is reachable and
+             * not in PF state, compare the cwnd of the
+             * destination to the highest cwnd seen thus far.
+             * Store the destination with the higher cwnd value.
+             * If the cwnd values are equal, randomly choose one
+             * of the two destinations.
+             */
+            if (max_cwnd < mnet->cwnd) {
+                max_cwnd_net = mnet;
+                max_cwnd = mnet->cwnd;
+            } else if (max_cwnd == mnet->cwnd) {
+                uint32_t rndval;
+                uint8_t this_random;
+
+                if (stcb->asoc.hb_random_idx > 3) {
+                    rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep);
+                    memcpy(stcb->asoc.hb_random_values, &rndval, sizeof(stcb->asoc.hb_random_values));
+                    this_random = stcb->asoc.hb_random_values[0];
+                    stcb->asoc.hb_random_idx++;
+                    stcb->asoc.hb_ect_randombit = 0;
+                } else {
+                    this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx];
+                    stcb->asoc.hb_random_idx++;
+                    stcb->asoc.hb_ect_randombit = 0;
+                }
+                if (this_random % 2 == 1) {
+                    max_cwnd_net = mnet;
+                    max_cwnd = mnet->cwnd;	/* Useless? */
+                }
+            }
+        }
+        /*
+         * JRS 5/14/07 - After all destinations have been considered
+         * as alternates, check to see if there was some active
+         * destination (not in PF state). If not, check to see if
+         * there was some PF destination with the minimum number of
+         * errors. If not, return the original destination. If
+         * there is a min_errors_net, remove the PF flag from that
+         * destination, set the cwnd to one or two MTUs, and return
+         * the destination as an alt. If there was some active
+         * destination with the highest cwnd, return that destination
+         * as the alt.
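+         *
+         * Condensed (a paraphrase of the selection logic above,
+         * adding no new behavior):
+         *
+         *   1. prefer an active destination (reachable, confirmed,
+         *      not PF) with the largest cwnd, ties broken
+         *      pseudo-randomly;
+         *   2. otherwise take the PF destination with the fewest
+         *      errors, ties broken by most recent activity, and
+         *      reactivate it;
+         *   3. otherwise fall back to the original destination.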
+ */ + if (max_cwnd_net == NULL) { + if (min_errors_net == NULL) { + return (net); + } + min_errors_net->dest_state &= ~SCTP_ADDR_PF; + min_errors_net->cwnd = min_errors_net->mtu * stcb->asoc.sctp_cmt_pf; + if (SCTP_OS_TIMER_PENDING(&min_errors_net->rxt_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, + stcb, min_errors_net, + SCTP_FROM_SCTP_TIMER + SCTP_LOC_2); + } + SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to active with %d errors.\n", + min_errors_net, min_errors_net->error_count); + return (min_errors_net); + } else { + return (max_cwnd_net); + } + } + /* + * JRS 5/14/07 - If mode is set to 1, use the CMT policy for + * choosing an alternate net. + */ + else if (mode == 1) { + TAILQ_FOREACH(mnet, &stcb->asoc.nets, sctp_next) { + if (((mnet->dest_state & SCTP_ADDR_REACHABLE) != SCTP_ADDR_REACHABLE) || + (mnet->dest_state & SCTP_ADDR_UNCONFIRMED)) { + /* + * will skip ones that are not-reachable or + * unconfirmed + */ + continue; + } + if (max_cwnd < mnet->cwnd) { + max_cwnd_net = mnet; + max_cwnd = mnet->cwnd; + } else if (max_cwnd == mnet->cwnd) { + uint32_t rndval; + uint8_t this_random; + + if (stcb->asoc.hb_random_idx > 3) { + rndval = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); + memcpy(stcb->asoc.hb_random_values, &rndval, + sizeof(stcb->asoc.hb_random_values)); + this_random = stcb->asoc.hb_random_values[0]; + stcb->asoc.hb_random_idx = 0; + stcb->asoc.hb_ect_randombit = 0; + } else { + this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx]; + stcb->asoc.hb_random_idx++; + stcb->asoc.hb_ect_randombit = 0; + } + if (this_random % 2) { + max_cwnd_net = mnet; + max_cwnd = mnet->cwnd; + } + } + } + if (max_cwnd_net) { + return (max_cwnd_net); + } + } + mnet = net; + once = 0; + + if (mnet == NULL) { + mnet = TAILQ_FIRST(&stcb->asoc.nets); + if (mnet == NULL) { + return (NULL); + } + } + do { + alt = TAILQ_NEXT(mnet, sctp_next); + if (alt == NULL) { + once++; + if (once > 1) { + break; + } + alt = TAILQ_FIRST(&stcb->asoc.nets); + if (alt == NULL) { + return (NULL); + } + } + if (alt->ro.ro_rt == NULL) { + if (alt->ro._s_addr) { + sctp_free_ifa(alt->ro._s_addr); + alt->ro._s_addr = NULL; + } + alt->src_addr_selected = 0; + } + /* sa_ignore NO_NULL_CHK */ + if (((alt->dest_state & SCTP_ADDR_REACHABLE) == SCTP_ADDR_REACHABLE) && + (alt->ro.ro_rt != NULL) && + (!(alt->dest_state & SCTP_ADDR_UNCONFIRMED))) { + /* Found a reachable address */ + break; + } + mnet = alt; + } while (alt != NULL); + + if (alt == NULL) { + /* Case where NO insv network exists (dormant state) */ + /* we rotate destinations */ + once = 0; + mnet = net; + do { + if (mnet == NULL) { + return (TAILQ_FIRST(&stcb->asoc.nets)); + } + alt = TAILQ_NEXT(mnet, sctp_next); + if (alt == NULL) { + once++; + if (once > 1) { + break; + } + alt = TAILQ_FIRST(&stcb->asoc.nets); + } + /* sa_ignore NO_NULL_CHK */ + if ((!(alt->dest_state & SCTP_ADDR_UNCONFIRMED)) && + (alt != net)) { + /* Found an alternate address */ + break; + } + mnet = alt; + } while (alt != NULL); + } + if (alt == NULL) { + return (net); + } + return (alt); +} + +static void +sctp_backoff_on_timeout(struct sctp_tcb *stcb, + struct sctp_nets *net, + int win_probe, + int num_marked, int num_abandoned) +{ + if (net->RTO == 0) { + net->RTO = stcb->asoc.minrto; + } + net->RTO <<= 1; + if (net->RTO > stcb->asoc.maxrto) { + net->RTO = stcb->asoc.maxrto; + } + if ((win_probe == 0) && (num_marked || num_abandoned)) { + /* We don't apply penalty to window probe scenarios */ + /* JRS - Use the congestion control 
given in the CC module */
+        stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout(stcb, net);
+    }
+}
+
+#ifndef INVARIANTS
+static void
+sctp_recover_sent_list(struct sctp_tcb *stcb)
+{
+    struct sctp_tmit_chunk *chk, *tp2;
+    struct sctp_association *asoc;
+
+    asoc = &stcb->asoc;
+    chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+    for (; chk != NULL; chk = tp2) {
+        tp2 = TAILQ_NEXT(chk, sctp_next);
+        if ((compare_with_wrap(stcb->asoc.last_acked_seq,
+            chk->rec.data.TSN_seq,
+            MAX_TSN)) ||
+            (stcb->asoc.last_acked_seq == chk->rec.data.TSN_seq)) {
+
+            SCTP_PRINTF("Found chk:%p tsn:%x <= last_acked_seq:%x\n",
+                chk, chk->rec.data.TSN_seq, stcb->asoc.last_acked_seq);
+            TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next);
+            if (chk->pr_sctp_on) {
+                if (asoc->pr_sctp_cnt != 0)
+                    asoc->pr_sctp_cnt--;
+            }
+            if (chk->data) {
+                /* sa_ignore NO_NULL_CHK */
+                sctp_free_bufspace(stcb, asoc, chk, 1);
+                sctp_m_freem(chk->data);
+                if (asoc->peer_supports_prsctp && PR_SCTP_BUF_ENABLED(chk->flags)) {
+                    asoc->sent_queue_cnt_removeable--;
+                }
+            }
+            chk->data = NULL;
+            asoc->sent_queue_cnt--;
+            sctp_free_a_chunk(stcb, chk);
+        }
+    }
+    SCTP_PRINTF("after recover order is as follows\n");
+    chk = TAILQ_FIRST(&stcb->asoc.sent_queue);
+    for (; chk != NULL; chk = tp2) {
+        tp2 = TAILQ_NEXT(chk, sctp_next);
+        SCTP_PRINTF("chk:%p TSN:%x\n", chk, chk->rec.data.TSN_seq);
+    }
+}
+
+#endif
+
+static int
+sctp_mark_all_for_resend(struct sctp_tcb *stcb,
+    struct sctp_nets *net,
+    struct sctp_nets *alt,
+    int window_probe,
+    int *num_marked,
+    int *num_abandoned)
+{
+
+    /*
+     * Mark all chunks (well not all) that were sent to *net for
+     * retransmission. Move them to alt for their destination as well...
+     * We only mark chunks that have been outstanding long enough to
+     * have received feedback.
+     */
+    struct sctp_tmit_chunk *chk, *tp2;
+    struct sctp_nets *lnets;
+    struct timeval now, min_wait, tv;
+    int cur_rtt;
+    int cnt_abandoned;
+    int audit_tf, num_mk, fir;
+    unsigned int cnt_mk;
+    uint32_t orig_flight, orig_tf;
+    uint32_t tsnlast, tsnfirst;
+    int recovery_cnt = 0;
+
+
+    /* none in flight now */
+    audit_tf = 0;
+    fir = 0;
+    /*
+     * figure out how long a data chunk must be pending before we can
+     * mark it.
+     */
+    (void)SCTP_GETTIME_TIMEVAL(&now);
+    /* get cur rto in micro-seconds */
+    cur_rtt = (((net->lastsa >> 2) + net->lastsv) >> 1);
+    cur_rtt *= 1000;
+    if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) {
+        sctp_log_fr(cur_rtt,
+            stcb->asoc.peers_rwnd,
+            window_probe,
+            SCTP_FR_T3_MARK_TIME);
+        sctp_log_fr(net->flight_size,
+            SCTP_OS_TIMER_PENDING(&net->fr_timer.timer),
+            SCTP_OS_TIMER_ACTIVE(&net->fr_timer.timer),
+            SCTP_FR_CWND_REPORT);
+        sctp_log_fr(net->flight_size, net->cwnd, stcb->asoc.total_flight, SCTP_FR_CWND_REPORT);
+    }
+    tv.tv_sec = cur_rtt / 1000000;
+    tv.tv_usec = cur_rtt % 1000000;
+    min_wait = now;
+    timevalsub(&min_wait, &tv);
+    if (min_wait.tv_sec < 0 || min_wait.tv_usec < 0) {
+        /*
+         * if we hit here, we don't have enough seconds on the clock
+         * to account for the RTO. We just let the lower seconds be
+         * the bounds and don't worry about it. This may mean we
+         * will mark a lot more than we should.
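+         *
+         * Rationale (an explanatory note, not original text): the
+         * RTT-based floor exists because a SACK for anything sent
+         * more recently could still legitimately be in flight;
+         * retransmitting such chunks would be spurious and would
+         * inflate both the retransmission counters and the
+         * perceived loss.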
+ */ + min_wait.tv_sec = min_wait.tv_usec = 0; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(cur_rtt, now.tv_sec, now.tv_usec, SCTP_FR_T3_MARK_TIME); + sctp_log_fr(0, min_wait.tv_sec, min_wait.tv_usec, SCTP_FR_T3_MARK_TIME); + } + /* + * Our rwnd will be incorrect here since we are not adding back the + * cnt * mbuf but we will fix that down below. + */ + orig_flight = net->flight_size; + orig_tf = stcb->asoc.total_flight; + + net->fast_retran_ip = 0; + /* Now on to each chunk */ + cnt_abandoned = 0; + num_mk = cnt_mk = 0; + tsnfirst = tsnlast = 0; +#ifndef INVARIANTS +start_again: +#endif + chk = TAILQ_FIRST(&stcb->asoc.sent_queue); + for (; chk != NULL; chk = tp2) { + tp2 = TAILQ_NEXT(chk, sctp_next); + if ((compare_with_wrap(stcb->asoc.last_acked_seq, + chk->rec.data.TSN_seq, + MAX_TSN)) || + (stcb->asoc.last_acked_seq == chk->rec.data.TSN_seq)) { + /* Strange case our list got out of order? */ + SCTP_PRINTF("Our list is out of order? last_acked:%x chk:%x", + (unsigned int)stcb->asoc.last_acked_seq, (unsigned int)chk->rec.data.TSN_seq); + recovery_cnt++; +#ifdef INVARIANTS + panic("last acked >= chk on sent-Q"); +#else + SCTP_PRINTF("Recover attempts a restart cnt:%d\n", recovery_cnt); + sctp_recover_sent_list(stcb); + if (recovery_cnt < 10) { + goto start_again; + } else { + SCTP_PRINTF("Recovery fails %d times??\n", recovery_cnt); + } +#endif + } + if ((chk->whoTo == net) && (chk->sent < SCTP_DATAGRAM_ACKED)) { + /* + * found one to mark: If it is less than + * DATAGRAM_ACKED it MUST not be a skipped or marked + * TSN but instead one that is either already set + * for retransmission OR one that needs + * retransmission. + */ + + /* validate its been outstanding long enough */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(chk->rec.data.TSN_seq, + chk->sent_rcv_time.tv_sec, + chk->sent_rcv_time.tv_usec, + SCTP_FR_T3_MARK_TIME); + } + if ((chk->sent_rcv_time.tv_sec > min_wait.tv_sec) && (window_probe == 0)) { + /* + * we have reached a chunk that was sent + * some seconds past our min.. forget it we + * will find no more to send. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(0, + chk->sent_rcv_time.tv_sec, + chk->sent_rcv_time.tv_usec, + SCTP_FR_T3_STOPPED); + } + continue; + } else if ((chk->sent_rcv_time.tv_sec == min_wait.tv_sec) && + (window_probe == 0)) { + /* + * we must look at the micro seconds to + * know. + */ + if (chk->sent_rcv_time.tv_usec >= min_wait.tv_usec) { + /* + * ok it was sent after our boundary + * time. + */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(0, + chk->sent_rcv_time.tv_sec, + chk->sent_rcv_time.tv_usec, + SCTP_FR_T3_STOPPED); + } + continue; + } + } + if (stcb->asoc.peer_supports_prsctp && PR_SCTP_TTL_ENABLED(chk->flags)) { + /* Is it expired? */ + if (timevalcmp(&now, &chk->rec.data.timetodrop, >)) { + /* Yes so drop it */ + if (chk->data) { + (void)sctp_release_pr_sctp_chunk(stcb, + chk, + (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT), + SCTP_SO_NOT_LOCKED); + cnt_abandoned++; + } + continue; + } + } + if (stcb->asoc.peer_supports_prsctp && PR_SCTP_RTX_ENABLED(chk->flags)) { + /* Has it been retransmitted tv_sec times? 
*/ + if (chk->snd_count > chk->rec.data.timetodrop.tv_sec) { + if (chk->data) { + (void)sctp_release_pr_sctp_chunk(stcb, + chk, + (SCTP_RESPONSE_TO_USER_REQ | SCTP_NOTIFY_DATAGRAM_SENT), + SCTP_SO_NOT_LOCKED); + cnt_abandoned++; + } + continue; + } + } + if (chk->sent < SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + num_mk++; + if (fir == 0) { + fir = 1; + tsnfirst = chk->rec.data.TSN_seq; + } + tsnlast = chk->rec.data.TSN_seq; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(chk->rec.data.TSN_seq, chk->snd_count, + 0, SCTP_FR_T3_MARKED); + } + if (chk->rec.data.chunk_was_revoked) { + /* deflate the cwnd */ + chk->whoTo->cwnd -= chk->book_size; + chk->rec.data.chunk_was_revoked = 0; + } + net->marked_retrans++; + stcb->asoc.marked_retrans++; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_RSND_TO, + chk->whoTo->flight_size, + chk->book_size, + (uintptr_t) chk->whoTo, + chk->rec.data.TSN_seq); + } + sctp_flight_size_decrease(chk); + sctp_total_flight_decrease(stcb, chk); + stcb->asoc.peers_rwnd += chk->send_size; + stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh); + } + chk->sent = SCTP_DATAGRAM_RESEND; + SCTP_STAT_INCR(sctps_markedretrans); + + /* reset the TSN for striking and other FR stuff */ + chk->rec.data.doing_fast_retransmit = 0; + /* Clear any time so NO RTT is being done */ + chk->do_rtt = 0; + if (alt != net) { + sctp_free_remote_addr(chk->whoTo); + chk->no_fr_allowed = 1; + chk->whoTo = alt; + atomic_add_int(&alt->ref_count, 1); + } else { + chk->no_fr_allowed = 0; + if (TAILQ_EMPTY(&stcb->asoc.send_queue)) { + chk->rec.data.fast_retran_tsn = stcb->asoc.sending_seq; + } else { + chk->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.TSN_seq; + } + } + /* + * CMT: Do not allow FRs on retransmitted TSNs. + */ + if (stcb->asoc.sctp_cmt_on_off == 1) { + chk->no_fr_allowed = 1; + } +#ifdef THIS_SHOULD_NOT_BE_DONE + } else if (chk->sent == SCTP_DATAGRAM_ACKED) { + /* remember highest acked one */ + could_be_sent = chk; +#endif + } + if (chk->sent == SCTP_DATAGRAM_RESEND) { + cnt_mk++; + } + } + if ((orig_flight - net->flight_size) != (orig_tf - stcb->asoc.total_flight)) { + /* we did not subtract the same things? */ + audit_tf = 1; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_EARLYFR_LOGGING_ENABLE | SCTP_FR_LOGGING_ENABLE)) { + sctp_log_fr(tsnfirst, tsnlast, num_mk, SCTP_FR_T3_TIMEOUT); + } +#ifdef SCTP_DEBUG + if (num_mk) { + SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n", + tsnlast); + SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%ld\n", + num_mk, (u_long)stcb->asoc.peers_rwnd); + SCTPDBG(SCTP_DEBUG_TIMER1, "LAST TSN marked was %x\n", + tsnlast); + SCTPDBG(SCTP_DEBUG_TIMER1, "Num marked for retransmission was %d peer-rwd:%d\n", + num_mk, + (int)stcb->asoc.peers_rwnd); + } +#endif + *num_marked = num_mk; + *num_abandoned = cnt_abandoned; + /* + * Now check for a ECN Echo that may be stranded And include the + * cnt_mk'd to have all resends in the control queue. 
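+     *
+     * Two reasons, spelled out (a gloss on the loop below): an ECN
+     * Echo still addressed to the dead net would otherwise sit in the
+     * control queue indefinitely, and cnt_mk has to count
+     * control-queue resends as well, or the sent_queue_retran_cnt
+     * audit a few lines further down would report a spurious
+     * mismatch.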
+     */
+    TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+        if (chk->sent == SCTP_DATAGRAM_RESEND) {
+            cnt_mk++;
+        }
+        if ((chk->whoTo == net) &&
+            (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
+            sctp_free_remote_addr(chk->whoTo);
+            chk->whoTo = alt;
+            if (chk->sent != SCTP_DATAGRAM_RESEND) {
+                chk->sent = SCTP_DATAGRAM_RESEND;
+                sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+                cnt_mk++;
+            }
+            atomic_add_int(&alt->ref_count, 1);
+        }
+    }
+#ifdef THIS_SHOULD_NOT_BE_DONE
+    if ((stcb->asoc.sent_queue_retran_cnt == 0) && (could_be_sent)) {
+        /* fix it so we retransmit the highest acked anyway */
+        sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+        cnt_mk++;
+        could_be_sent->sent = SCTP_DATAGRAM_RESEND;
+    }
+#endif
+    if (stcb->asoc.sent_queue_retran_cnt != cnt_mk) {
+#ifdef INVARIANTS
+        SCTP_PRINTF("Local Audit says there are %d for retran asoc cnt:%d we marked:%d this time\n",
+            cnt_mk, stcb->asoc.sent_queue_retran_cnt, num_mk);
+#endif
+#ifndef SCTP_AUDITING_ENABLED
+        stcb->asoc.sent_queue_retran_cnt = cnt_mk;
+#endif
+    }
+    if (audit_tf) {
+        SCTPDBG(SCTP_DEBUG_TIMER4,
+            "Audit total flight due to negative value net:%p\n",
+            net);
+        stcb->asoc.total_flight = 0;
+        stcb->asoc.total_flight_count = 0;
+        /* Clear all networks flight size */
+        TAILQ_FOREACH(lnets, &stcb->asoc.nets, sctp_next) {
+            lnets->flight_size = 0;
+            SCTPDBG(SCTP_DEBUG_TIMER4,
+                "Net:%p c-f cwnd:%d ssthresh:%d\n",
+                lnets, lnets->cwnd, lnets->ssthresh);
+        }
+        TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) {
+            if (chk->sent < SCTP_DATAGRAM_RESEND) {
+                if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
+                    sctp_misc_ints(SCTP_FLIGHT_LOG_UP,
+                        chk->whoTo->flight_size,
+                        chk->book_size,
+                        (uintptr_t) chk->whoTo,
+                        chk->rec.data.TSN_seq);
+                }
+                sctp_flight_size_increase(chk);
+                sctp_total_flight_increase(stcb, chk);
+            }
+        }
+    }
+    /*
+     * Set up the ECN nonce re-sync point. We do this since
+     * retransmissions are NOT set up for ECN. This means that, due to
+     * Karn's rule, we don't know the total of the peer's ECN bits.
+     */
+    chk = TAILQ_FIRST(&stcb->asoc.send_queue);
+    if (chk == NULL) {
+        stcb->asoc.nonce_resync_tsn = stcb->asoc.sending_seq;
+    } else {
+        stcb->asoc.nonce_resync_tsn = chk->rec.data.TSN_seq;
+    }
+    stcb->asoc.nonce_wait_for_ecne = 0;
+    stcb->asoc.nonce_sum_check = 0;
+    /* The window-probe case is handled by the caller; we always return 0 here. */
+    return (0);
+}
+
+
+int
+sctp_t3rxt_timer(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct sctp_nets *alt;
+    int win_probe, num_mk, num_abandoned;
+
+    if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FR_LOGGING_ENABLE) {
+        sctp_log_fr(0, 0, 0, SCTP_FR_T3_TIMEOUT);
+    }
+    if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) {
+        struct sctp_nets *lnet;
+
+        TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) {
+            if (net == lnet) {
+                sctp_log_cwnd(stcb, lnet, 1, SCTP_CWND_LOG_FROM_T3);
+            } else {
+                sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_LOG_FROM_T3);
+            }
+        }
+    }
+    /* Find an alternate and mark those for retransmission */
+    if ((stcb->asoc.peers_rwnd == 0) &&
+        (stcb->asoc.total_flight < net->mtu)) {
+        SCTP_STAT_INCR(sctps_timowindowprobe);
+        win_probe = 1;
+    } else {
+        win_probe = 0;
+    }
+
+    /*
+     * JRS 5/14/07 - If CMT PF is on and the destination is not already
+     * in PF state, set the destination to PF state and store the
+     * current time as the time that the destination was last active. In
+     * addition, find an alternate destination with PF-based
+     * find_alt_net().
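+     *
+     * PF lifecycle, sketched for orientation (a summary of behavior
+     * implemented here and in sctp_threshold_management(), not a
+     * specification):
+     *
+     *   active --(T3 timeout, CMT-PF enabled)---------> PF
+     *   PF --(error_count exceeds failure_threshold)--> unreachable
+     *   PF --(heartbeat acknowledged)-----------------> active
+     *
+     * While a destination sits in PF state it is avoided for new data
+     * and probed only by heartbeats, so a brief outage need not
+     * escalate into a full failover.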
+     */
+    if ((stcb->asoc.sctp_cmt_on_off == 1) &&
+        (stcb->asoc.sctp_cmt_pf > 0)) {
+        if ((net->dest_state & SCTP_ADDR_PF) != SCTP_ADDR_PF) {
+            net->dest_state |= SCTP_ADDR_PF;
+            net->last_active = sctp_get_tick_count();
+            SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from active to PF.\n",
+                net);
+        }
+        alt = sctp_find_alternate_net(stcb, net, 2);
+    } else if (stcb->asoc.sctp_cmt_on_off == 1) {
+        /*
+         * CMT: Using RTX_SSTHRESH policy for CMT. If CMT is being
+         * used, then pick dest with largest ssthresh for any
+         * retransmission.
+         */
+        alt = sctp_find_alternate_net(stcb, net, 1);
+        /*
+         * CUCv2: If a different dest is picked for the
+         * retransmission, then new (rtx-)pseudo_cumack needs to be
+         * tracked for orig dest. Let CUCv2 track new (rtx-)
+         * pseudo-cumack always.
+         */
+        net->find_pseudo_cumack = 1;
+        net->find_rtx_pseudo_cumack = 1;
+    } else {		/* CMT is OFF */
+        alt = sctp_find_alternate_net(stcb, net, 0);
+    }
+    num_mk = 0;
+    num_abandoned = 0;
+    (void)sctp_mark_all_for_resend(stcb, net, alt, win_probe,
+        &num_mk, &num_abandoned);
+    /* FR loss recovery just ended with the T3. */
+    stcb->asoc.fast_retran_loss_recovery = 0;
+
+    /* CMT FR loss recovery ended with the T3 */
+    net->fast_retran_loss_recovery = 0;
+
+    /*
+     * setup the sat loss recovery that prevents satellite cwnd advance.
+     */
+    stcb->asoc.sat_t3_loss_recovery = 1;
+    stcb->asoc.sat_t3_recovery_tsn = stcb->asoc.sending_seq;
+
+    /* Backoff the timer and cwnd */
+    sctp_backoff_on_timeout(stcb, net, win_probe, num_mk, num_abandoned);
+    if (win_probe == 0) {
+        /* We don't do normal threshold management on window probes */
+        if (sctp_threshold_management(inp, stcb, net,
+            stcb->asoc.max_send_times)) {
+            /* Association was destroyed */
+            return (1);
+        } else {
+            if (net != stcb->asoc.primary_destination) {
+                /* send an immediate HB if our RTO is stale */
+                struct timeval now;
+                unsigned int ms_goneby;
+
+                (void)SCTP_GETTIME_TIMEVAL(&now);
+                if (net->last_sent_time.tv_sec) {
+                    ms_goneby = (now.tv_sec - net->last_sent_time.tv_sec) * 1000;
+                } else {
+                    ms_goneby = 0;
+                }
+                if ((ms_goneby > net->RTO) || (net->RTO == 0)) {
+                    /*
+                     * no recent feedback in an RTO or
+                     * more, request an RTT update
+                     */
+                    if (sctp_send_hb(stcb, 1, net) < 0)
+                        /*
+                         * Less than 0 means we lost
+                         * the assoc
+                         */
+                        return (1);
+                }
+            }
+        }
+    } else {
+        /*
+         * For a window probe we don't penalize the nets but only
+         * the association. This may fail the association if SACKs
+         * are not coming back. If SACKs are coming with rwnd locked
+         * at 0, we will continue to hold things waiting for rwnd to
+         * rise.
+         */
+        if (sctp_threshold_management(inp, stcb, NULL,
+            stcb->asoc.max_send_times)) {
+            /* Association was destroyed */
+            return (1);
+        }
+    }
+    if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+        /* Move all pending over too */
+        sctp_move_chunks_from_net(stcb, net);
+
+        /*
+         * Get the address that failed, to force a new src address
+         * selection and a route allocation.
+         */
+        if (net->ro._s_addr) {
+            sctp_free_ifa(net->ro._s_addr);
+            net->ro._s_addr = NULL;
+        }
+        net->src_addr_selected = 0;
+
+        /* Force a route allocation too */
+        if (net->ro.ro_rt) {
+            RTFREE(net->ro.ro_rt);
+            net->ro.ro_rt = NULL;
+        }
+        /* Was it our primary? */
+        if ((stcb->asoc.primary_destination == net) && (alt != net)) {
+            /*
+             * Yes, note it as such and find an alternate.
+             * Note: this means HB code must use this to resend
+             * the primary if it goes active AND if someone does
+             * a change-primary then this flag must be cleared
+             * from any net structures.
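+             *
+             * Put differently (an explanatory note): the
+             * SCTP_ADDR_WAS_PRIMARY bit is a breadcrumb. The
+             * heartbeat path may use it to promote the old
+             * primary again once it proves reachable, and an
+             * explicit set-primary request from the user must
+             * clear it so a stale breadcrumb can never override
+             * the user's choice.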
+ */ + if (sctp_set_primary_addr(stcb, + (struct sockaddr *)NULL, + alt) == 0) { + net->dest_state |= SCTP_ADDR_WAS_PRIMARY; + } + } + } else if ((stcb->asoc.sctp_cmt_on_off == 1) && + (stcb->asoc.sctp_cmt_pf > 0) && + ((net->dest_state & SCTP_ADDR_PF) == SCTP_ADDR_PF)) { + /* + * JRS 5/14/07 - If the destination hasn't failed completely + * but is in PF state, a PF-heartbeat needs to be sent + * manually. + */ + if (sctp_send_hb(stcb, 1, net) < 0) + /* Return less than 0 means we lost the association */ + return (1); + } + /* + * Special case for cookie-echo'ed case, we don't do output but must + * await the COOKIE-ACK before retransmission + */ + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + /* + * Here we just reset the timer and start again since we + * have not established the asoc + */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); + return (0); + } + if (stcb->asoc.peer_supports_prsctp) { + struct sctp_tmit_chunk *lchk; + + lchk = sctp_try_advance_peer_ack_point(stcb, &stcb->asoc); + /* C3. See if we need to send a Fwd-TSN */ + if (compare_with_wrap(stcb->asoc.advanced_peer_ack_point, + stcb->asoc.last_acked_seq, MAX_TSN)) { + /* + * ISSUE with ECN, see FWD-TSN processing for notes + * on issues that will occur when the ECN NONCE + * stuff is put into SCTP for cross checking. + */ + send_forward_tsn(stcb, &stcb->asoc); + if (lchk) { + /* Assure a timer is up */ + sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, lchk->whoTo); + } + } + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_MONITOR_ENABLE) { + sctp_log_cwnd(stcb, net, net->cwnd, SCTP_CWND_LOG_FROM_RTX); + } + return (0); +} + +int +sctp_t1init_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + /* bump the thresholds */ + if (stcb->asoc.delayed_connection) { + /* + * special hook for delayed connection. The library did NOT + * complete the rest of its sends. + */ + stcb->asoc.delayed_connection = 0; + sctp_send_initiate(inp, stcb, SCTP_SO_NOT_LOCKED); + return (0); + } + if (SCTP_GET_STATE((&stcb->asoc)) != SCTP_STATE_COOKIE_WAIT) { + return (0); + } + if (sctp_threshold_management(inp, stcb, net, + stcb->asoc.max_init_times)) { + /* Association was destroyed */ + return (1); + } + stcb->asoc.dropped_special_cnt = 0; + sctp_backoff_on_timeout(stcb, stcb->asoc.primary_destination, 1, 0, 0); + if (stcb->asoc.initial_init_rto_max < net->RTO) { + net->RTO = stcb->asoc.initial_init_rto_max; + } + if (stcb->asoc.numnets > 1) { + /* If we have more than one addr use it */ + struct sctp_nets *alt; + + alt = sctp_find_alternate_net(stcb, stcb->asoc.primary_destination, 0); + if (alt != stcb->asoc.primary_destination) { + sctp_move_chunks_from_net(stcb, stcb->asoc.primary_destination); + stcb->asoc.primary_destination = alt; + } + } + /* Send out a new init */ + sctp_send_initiate(inp, stcb, SCTP_SO_NOT_LOCKED); + return (0); +} + +/* + * For cookie and asconf we actually need to find and mark for resend, then + * increment the resend counter (after all the threshold management stuff of + * course). + */ +int +sctp_cookie_timer(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + struct sctp_nets *alt; + struct sctp_tmit_chunk *cookie; + + /* first before all else we must find the cookie */ + TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) { + if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) { + break; + } + } + if (cookie == NULL) { + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_ECHOED) { + /* FOOBAR! 
*/
+            struct mbuf *oper;
+
+            oper = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)),
+                0, M_DONTWAIT, 1, MT_DATA);
+            if (oper) {
+                struct sctp_paramhdr *ph;
+                uint32_t *ippp;
+
+                SCTP_BUF_LEN(oper) = sizeof(struct sctp_paramhdr) +
+                    sizeof(uint32_t);
+                ph = mtod(oper, struct sctp_paramhdr *);
+                ph->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION);
+                ph->param_length = htons(SCTP_BUF_LEN(oper));
+                ippp = (uint32_t *) (ph + 1);
+                *ippp = htonl(SCTP_FROM_SCTP_TIMER + SCTP_LOC_3);
+            }
+            inp->last_abort_code = SCTP_FROM_SCTP_TIMER + SCTP_LOC_4;
+            sctp_abort_an_association(inp, stcb, SCTP_INTERNAL_ERROR,
+                oper, SCTP_SO_NOT_LOCKED);
+        } else {
+#ifdef INVARIANTS
+            panic("Cookie timer expires in wrong state?");
+#else
+            SCTP_PRINTF("Strange in state %d not cookie-echoed yet c-e timer expires?\n", SCTP_GET_STATE(&stcb->asoc));
+            return (0);
+#endif
+        }
+        return (0);
+    }
+    /* Ok we found the cookie, threshold management next */
+    if (sctp_threshold_management(inp, stcb, cookie->whoTo,
+        stcb->asoc.max_init_times)) {
+        /* Assoc is over */
+        return (1);
+    }
+    /*
+     * cleared threshold management, now let's back off the address &
+     * select an alternate
+     */
+    stcb->asoc.dropped_special_cnt = 0;
+    sctp_backoff_on_timeout(stcb, cookie->whoTo, 1, 0, 0);
+    alt = sctp_find_alternate_net(stcb, cookie->whoTo, 0);
+    if (alt != cookie->whoTo) {
+        sctp_free_remote_addr(cookie->whoTo);
+        cookie->whoTo = alt;
+        atomic_add_int(&alt->ref_count, 1);
+    }
+    /* Now mark the retran info */
+    if (cookie->sent != SCTP_DATAGRAM_RESEND) {
+        sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+    }
+    cookie->sent = SCTP_DATAGRAM_RESEND;
+    /*
+     * Now call the output routine to kick out the cookie again. Note we
+     * don't mark any chunks for retran so that FR will need to kick in
+     * to move these (or a send timer).
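+     *
+     * In practice (a reading of the code above, not original text):
+     * only the COOKIE-ECHO chunk itself was marked for resend here.
+     * DATA queued behind it is left untouched; fast retransmit or a
+     * send timer will move that data once the COOKIE-ACK arrives and
+     * the association leaves the COOKIE-ECHOED state.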
+     */
+    return (0);
+}
+
+int
+sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct sctp_nets *alt;
+    struct sctp_tmit_chunk *strrst = NULL, *chk = NULL;
+
+    if (stcb->asoc.stream_reset_outstanding == 0) {
+        return (0);
+    }
+    /* find the existing STRRESET, we use the seq number we sent out on */
+    (void)sctp_find_stream_reset(stcb, stcb->asoc.str_reset_seq_out, &strrst);
+    if (strrst == NULL) {
+        return (0);
+    }
+    /* do threshold management */
+    if (sctp_threshold_management(inp, stcb, strrst->whoTo,
+        stcb->asoc.max_send_times)) {
+        /* Assoc is over */
+        return (1);
+    }
+    /*
+     * cleared threshold management, now let's back off the address &
+     * select an alternate
+     */
+    sctp_backoff_on_timeout(stcb, strrst->whoTo, 1, 0, 0);
+    alt = sctp_find_alternate_net(stcb, strrst->whoTo, 0);
+    sctp_free_remote_addr(strrst->whoTo);
+    strrst->whoTo = alt;
+    atomic_add_int(&alt->ref_count, 1);
+
+    /* See if an ECN Echo is also stranded */
+    TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) {
+        if ((chk->whoTo == net) &&
+            (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) {
+            sctp_free_remote_addr(chk->whoTo);
+            if (chk->sent != SCTP_DATAGRAM_RESEND) {
+                chk->sent = SCTP_DATAGRAM_RESEND;
+                sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+            }
+            chk->whoTo = alt;
+            atomic_add_int(&alt->ref_count, 1);
+        }
+    }
+    if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) {
+        /*
+         * If the address went unreachable, we need to move to
+         * alternates for ALL chk's in queue
+         */
+        sctp_move_chunks_from_net(stcb, net);
+    }
+    /* mark the retran info */
+    if (strrst->sent != SCTP_DATAGRAM_RESEND)
+        sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt);
+    strrst->sent = SCTP_DATAGRAM_RESEND;
+
+    /* restart the timer */
+    sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, inp, stcb, strrst->whoTo);
+    return (0);
+}
+
+int
+sctp_asconf_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct sctp_nets *alt;
+    struct sctp_tmit_chunk *asconf, *chk, *nchk;
+
+    /* is this a first send, or a retransmission? */
+    if (TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) {
+        /* compose a new ASCONF chunk and send it */
+        sctp_send_asconf(stcb, net, SCTP_ADDR_NOT_LOCKED);
+    } else {
+        /*
+         * Retransmission of the existing ASCONF is needed
+         */
+
+        /* find the existing ASCONF */
+        asconf = TAILQ_FIRST(&stcb->asoc.asconf_send_queue);
+        if (asconf == NULL) {
+            return (0);
+        }
+        /* do threshold management */
+        if (sctp_threshold_management(inp, stcb, asconf->whoTo,
+            stcb->asoc.max_send_times)) {
+            /* Assoc is over */
+            return (1);
+        }
+        if (asconf->snd_count > stcb->asoc.max_send_times) {
+            /*
+             * Something is rotten: our peer is not responding
+             * to ASCONFs but apparently is to other chunks.
+             * That is, it is not properly handling the chunk
+             * type upper bits. Mark this peer as ASCONF
+             * incapable and clean up.
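+             *
+             * Background, summarized from RFC 4960, section 3.2
+             * (not part of the original comment): the two
+             * high-order bits of a chunk type tell a receiver
+             * what to do with an unrecognized chunk. ASCONF is
+             * 0xC1, in the "skip, continue processing, and
+             * report" class. A peer that acknowledges our DATA
+             * but never answers repeated ASCONFs is therefore
+             * almost certainly mishandling those bits, so we
+             * give up on ASCONF for this association.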
+ */ + SCTPDBG(SCTP_DEBUG_TIMER1, "asconf_timer: Peer has not responded to our repeated ASCONFs\n"); + sctp_asconf_cleanup(stcb, net); + return (0); + } + /* + * cleared threshold management, so now backoff the net and + * select an alternate + */ + sctp_backoff_on_timeout(stcb, asconf->whoTo, 1, 0, 0); + alt = sctp_find_alternate_net(stcb, asconf->whoTo, 0); + if (asconf->whoTo != alt) { + sctp_free_remote_addr(asconf->whoTo); + asconf->whoTo = alt; + atomic_add_int(&alt->ref_count, 1); + } + /* See if an ECN Echo is also stranded */ + TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { + if ((chk->whoTo == net) && + (chk->rec.chunk_id.id == SCTP_ECN_ECHO)) { + sctp_free_remote_addr(chk->whoTo); + chk->whoTo = alt; + if (chk->sent != SCTP_DATAGRAM_RESEND) { + chk->sent = SCTP_DATAGRAM_RESEND; + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + } + atomic_add_int(&alt->ref_count, 1); + } + } + for (chk = asconf; chk; chk = nchk) { + nchk = TAILQ_NEXT(chk, sctp_next); + if (chk->whoTo != alt) { + sctp_free_remote_addr(chk->whoTo); + chk->whoTo = alt; + atomic_add_int(&alt->ref_count, 1); + } + if (asconf->sent != SCTP_DATAGRAM_RESEND && chk->sent != SCTP_DATAGRAM_UNSENT) + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + chk->sent = SCTP_DATAGRAM_RESEND; + } + if (net->dest_state & SCTP_ADDR_NOT_REACHABLE) { + /* + * If the address went un-reachable, we need to move + * to the alternate for ALL chunks in queue + */ + sctp_move_chunks_from_net(stcb, net); + } + /* mark the retran info */ + if (asconf->sent != SCTP_DATAGRAM_RESEND) + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + asconf->sent = SCTP_DATAGRAM_RESEND; + + /* send another ASCONF if any and we can do */ + sctp_send_asconf(stcb, alt, SCTP_ADDR_NOT_LOCKED); + } + return (0); +} + +/* Mobility adaptation */ +void +sctp_delete_prim_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + if (stcb->asoc.deleted_primary == NULL) { + SCTPDBG(SCTP_DEBUG_ASCONF1, "delete_prim_timer: deleted_primary is not stored...\n"); + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + return; + } + SCTPDBG(SCTP_DEBUG_ASCONF1, "delete_prim_timer: finished to keep deleted primary "); + SCTPDBG_ADDR(SCTP_DEBUG_ASCONF1, &stcb->asoc.deleted_primary->ro._l_addr.sa); + sctp_free_remote_addr(stcb->asoc.deleted_primary); + stcb->asoc.deleted_primary = NULL; + sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); + return; +} + +/* + * For the shutdown and shutdown-ack, we do not keep one around on the + * control queue. This means we must generate a new one and call the general + * chunk output routine, AFTER having done threshold management. + * It is assumed that net is non-NULL. 
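+ *
+ * For orientation, this is the retransmission pattern shared by most
+ * timers in this file (a sketch; details vary per timer):
+ *
+ *   1. sctp_threshold_management()  - count the error, abort if over
+ *   2. sctp_backoff_on_timeout()    - RTO <<= 1, clamped to maxrto
+ *   3. sctp_find_alternate_net()    - pick a possibly better path
+ *   4. regenerate or re-queue the chunk and restart the timer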
+ */
+int
+sctp_shutdown_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct sctp_nets *alt;
+
+    /* first threshold management */
+    if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
+        /* Assoc is over */
+        return (1);
+    }
+    sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+    /* second select an alternative */
+    alt = sctp_find_alternate_net(stcb, net, 0);
+
+    /* third generate a shutdown into the queue for our net */
+    sctp_send_shutdown(stcb, alt);
+
+    /* fourth restart timer */
+    sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, inp, stcb, alt);
+    return (0);
+}
+
+int
+sctp_shutdownack_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct sctp_nets *alt;
+
+    /* first threshold management */
+    if (sctp_threshold_management(inp, stcb, net, stcb->asoc.max_send_times)) {
+        /* Assoc is over */
+        return (1);
+    }
+    sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+    /* second select an alternative */
+    alt = sctp_find_alternate_net(stcb, net, 0);
+
+    /* third generate a shutdown into the queue for our net */
+    sctp_send_shutdown_ack(stcb, alt);
+
+    /* fourth restart timer */
+    sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, inp, stcb, alt);
+    return (0);
+}
+
+static void
+sctp_audit_stream_queues_for_size(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb)
+{
+    struct sctp_stream_out *outs;
+    struct sctp_stream_queue_pending *sp;
+    unsigned int chks_in_queue = 0;
+    int being_filled = 0;
+
+    /*
+     * This function is ONLY called when the send/sent queues are empty.
+     */
+    if ((stcb == NULL) || (inp == NULL))
+        return;
+
+    if (stcb->asoc.sent_queue_retran_cnt) {
+        SCTP_PRINTF("Hmm, sent_queue_retran_cnt is non-zero %d\n",
+            stcb->asoc.sent_queue_retran_cnt);
+        stcb->asoc.sent_queue_retran_cnt = 0;
+    }
+    SCTP_TCB_SEND_LOCK(stcb);
+    if (TAILQ_EMPTY(&stcb->asoc.out_wheel)) {
+        int i, cnt = 0;
+
+        /* Check to see if a spoke fell off the wheel */
+        for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
+            if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
+                sctp_insert_on_wheel(stcb, &stcb->asoc, &stcb->asoc.strmout[i], 1);
+                cnt++;
+            }
+        }
+        if (cnt) {
+            /* yep, we lost a spoke or two */
+            SCTP_PRINTF("Found an additional %d streams NOT on outwheel, corrected\n", cnt);
+        } else {
+            /* no spokes lost, */
+            stcb->asoc.total_output_queue_size = 0;
+        }
+        SCTP_TCB_SEND_UNLOCK(stcb);
+        return;
+    }
+    SCTP_TCB_SEND_UNLOCK(stcb);
+    /* Check to see if some data is queued; if so, report it */
+    TAILQ_FOREACH(outs, &stcb->asoc.out_wheel, next_spoke) {
+        if (!TAILQ_EMPTY(&outs->outqueue)) {
+            TAILQ_FOREACH(sp, &outs->outqueue, next) {
+                if (sp->msg_is_complete)
+                    being_filled++;
+                chks_in_queue++;
+            }
+        }
+    }
+    if (chks_in_queue != stcb->asoc.stream_queue_cnt) {
+        SCTP_PRINTF("Hmm, stream queue cnt at %d I counted %d in stream out wheel\n",
+            stcb->asoc.stream_queue_cnt, chks_in_queue);
+    }
+    if (chks_in_queue) {
+        /* call the output queue function */
+        sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED);
+        if ((TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
+            (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
+            /*
+             * Probably should go in and make it go back through
+             * and add fragments allowed
+             */
+            if (being_filled == 0) {
+                SCTP_PRINTF("Still nothing moved %d chunks are stuck\n",
+                    chks_in_queue);
+            }
+        }
+    } else {
+        SCTP_PRINTF("Found no chunks on any queue tot:%lu\n",
+            (u_long)stcb->asoc.total_output_queue_size);
+        stcb->asoc.total_output_queue_size = 0;
+    }
+}
+
+int
+sctp_heartbeat_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net, int cnt_of_unconf)
+{
+    int ret;
+
+    if (net) {
+        if (net->hb_responded == 0) {
+            if (net->ro._s_addr) {
+                /*
+                 * Invalidate the src address if we did not
+                 * get a response last time.
+                 */
+                sctp_free_ifa(net->ro._s_addr);
+                net->ro._s_addr = NULL;
+                net->src_addr_selected = 0;
+            }
+            sctp_backoff_on_timeout(stcb, net, 1, 0, 0);
+        }
+        /* Zero PBA, if it needs it */
+        if (net->partial_bytes_acked) {
+            net->partial_bytes_acked = 0;
+        }
+    }
+    if ((stcb->asoc.total_output_queue_size > 0) &&
+        (TAILQ_EMPTY(&stcb->asoc.send_queue)) &&
+        (TAILQ_EMPTY(&stcb->asoc.sent_queue))) {
+        sctp_audit_stream_queues_for_size(inp, stcb);
+    }
+    /* Send a new HB; this will do threshold management, pick a new dest */
+    if (cnt_of_unconf == 0) {
+        if (sctp_send_hb(stcb, 0, NULL) < 0) {
+            return (1);
+        }
+    } else {
+        /*
+         * this will send out extra hb's up to maxburst if there are
+         * any unconfirmed addresses.
+         */
+        uint32_t cnt_sent = 0;
+
+        TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
+            if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) &&
+                (net->dest_state & SCTP_ADDR_REACHABLE)) {
+                cnt_sent++;
+                if (net->hb_responded == 0) {
+                    /* Did the peer respond last time? */
+                    if (net->ro._s_addr) {
+                        sctp_free_ifa(net->ro._s_addr);
+                        net->ro._s_addr = NULL;
+                        net->src_addr_selected = 0;
+                    }
+                }
+                ret = sctp_send_hb(stcb, 1, net);
+                if (ret < 0)
+                    return 1;
+                else if (ret == 0) {
+                    break;
+                }
+                if (cnt_sent >= SCTP_BASE_SYSCTL(sctp_hb_maxburst))
+                    break;
+            }
+        }
+    }
+    return (0);
+}
+
+void
+sctp_pathmtu_timer(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    uint32_t next_mtu, mtu;
+
+    next_mtu = sctp_get_next_mtu(inp, net->mtu);
+
+    if ((next_mtu > net->mtu) && (net->port == 0)) {
+        if ((net->src_addr_selected == 0) ||
+            (net->ro._s_addr == NULL) ||
+            (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
+            if ((net->ro._s_addr != NULL) && (net->ro._s_addr->localifa_flags & SCTP_BEING_DELETED)) {
+                sctp_free_ifa(net->ro._s_addr);
+                net->ro._s_addr = NULL;
+                net->src_addr_selected = 0;
+            } else if (net->ro._s_addr == NULL) {
+#if defined(INET6) && defined(SCTP_EMBEDDED_V6_SCOPE)
+                if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+                    struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+
+                    /* KAME hack: embed scopeid */
+                    (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone));
+                }
+#endif
+
+                net->ro._s_addr = sctp_source_address_selection(inp,
+                    stcb,
+                    (sctp_route_t *) & net->ro,
+                    net, 0, stcb->asoc.vrf_id);
+#if defined(INET6) && defined(SCTP_EMBEDDED_V6_SCOPE)
+                if (net->ro._l_addr.sa.sa_family == AF_INET6) {
+                    struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&net->ro._l_addr;
+
+                    (void)sa6_recoverscope(sin6);
+                }
+#endif				/* INET6 */
+            }
+            if (net->ro._s_addr)
+                net->src_addr_selected = 1;
+        }
+        if (net->ro._s_addr) {
+            mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_rt);
+            if (net->port) {
+                mtu -= sizeof(struct udphdr);
+            }
+            if (mtu > next_mtu) {
+                net->mtu = next_mtu;
+            }
+        }
+    }
+    /* restart the timer */
+    sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+}
+
+void
+sctp_autoclose_timer(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb,
+    struct sctp_nets *net)
+{
+    struct timeval tn, *tim_touse;
+    struct sctp_association *asoc;
+    int ticks_gone_by;
+
+    (void)SCTP_GETTIME_TIMEVAL(&tn);
+    if (stcb->asoc.sctp_autoclose_ticks &&
+        sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) {
+        /* Auto close is on */
+        asoc = &stcb->asoc;
+        /* pick the time to use */
+        if (asoc->time_last_rcvd.tv_sec >
asoc->time_last_sent.tv_sec) { + tim_touse = &asoc->time_last_rcvd; + } else { + tim_touse = &asoc->time_last_sent; + } + /* Now has long enough transpired to autoclose? */ + ticks_gone_by = SEC_TO_TICKS(tn.tv_sec - tim_touse->tv_sec); + if ((ticks_gone_by > 0) && + (ticks_gone_by >= (int)asoc->sctp_autoclose_ticks)) { + /* + * autoclose time has hit, call the output routine, + * which should do nothing just to be SURE we don't + * have hanging data. We can then safely check the + * queues and know that we are clear to send + * shutdown + */ + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED); + /* Are we clean? */ + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue)) { + /* + * there is nothing queued to send, so I'm + * done... + */ + if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { + /* only send SHUTDOWN 1st time thru */ + sctp_send_shutdown(stcb, stcb->asoc.primary_destination); + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, + asoc->primary_destination); + } + } + } else { + /* + * No auto close at this time, reset t-o to check + * later + */ + int tmp; + + /* fool the timer startup to use the time left */ + tmp = asoc->sctp_autoclose_ticks; + asoc->sctp_autoclose_ticks -= ticks_gone_by; + sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, + net); + /* restore the real tick value */ + asoc->sctp_autoclose_ticks = tmp; + } + } +} diff --git a/freebsd/sys/netinet/sctp_timer.h b/freebsd/sys/netinet/sctp_timer.h new file mode 100644 index 00000000..34abbace --- /dev/null +++ b/freebsd/sys/netinet/sctp_timer.h @@ -0,0 +1,101 @@ +/*- + * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_timer.h,v 1.6 2005/03/06 16:04:18 itojun Exp $ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_timer_h__
+#define __sctp_timer_h__
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+#define SCTP_RTT_SHIFT 3
+#define SCTP_RTT_VAR_SHIFT 2
+
+void
+sctp_early_fr_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net);
+
+struct sctp_nets *
+sctp_find_alternate_net(struct sctp_tcb *,
+    struct sctp_nets *, int mode);
+
+int
+sctp_threshold_management(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *, uint16_t);
+
+int
+sctp_t3rxt_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+int
+sctp_t1init_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+int
+sctp_shutdown_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+int
+sctp_heartbeat_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *, int);
+
+int
+sctp_cookie_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+
+void
+sctp_pathmtu_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+
+int
+sctp_shutdownack_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+int
+sctp_strreset_timer(struct sctp_inpcb *inp, struct sctp_tcb *stcb,
+    struct sctp_nets *net);
+
+int
+sctp_asconf_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+
+void
+sctp_delete_prim_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *);
+
+void
+sctp_autoclose_timer(struct sctp_inpcb *, struct sctp_tcb *,
+    struct sctp_nets *net);
+
+void sctp_audit_retranmission_queue(struct sctp_association *);
+
+void sctp_iterator_timer(struct sctp_iterator *it);
+
+
+#endif
+#endif
diff --git a/freebsd/sys/netinet/sctp_uio.h b/freebsd/sys/netinet/sctp_uio.h
new file mode 100644
index 00000000..734447ed
--- /dev/null
+++ b/freebsd/sys/netinet/sctp_uio.h
@@ -0,0 +1,1166 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * a) Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * b) Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the distribution.
+ *
+ * c) Neither the name of Cisco Systems, Inc. nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_uio.h,v 1.11 2005/03/06 16:04:18 itojun Exp $ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef __sctp_uio_h__
+#define __sctp_uio_h__
+
+
+#if ! defined(_KERNEL)
+#include <stdint.h>
+#endif
+#include <sys/types.h>
+#include <sys/socket.h>
+#include
+
+typedef uint32_t sctp_assoc_t;
+
+/* Compatibility with previous defines */
+#define sctp_stream_reset_events sctp_stream_reset_event
+
+/* On/Off setup for subscription to events */
+struct sctp_event_subscribe {
+    uint8_t sctp_data_io_event;
+    uint8_t sctp_association_event;
+    uint8_t sctp_address_event;
+    uint8_t sctp_send_failure_event;
+    uint8_t sctp_peer_error_event;
+    uint8_t sctp_shutdown_event;
+    uint8_t sctp_partial_delivery_event;
+    uint8_t sctp_adaptation_layer_event;
+    uint8_t sctp_authentication_event;
+    uint8_t sctp_sender_dry_event;
+    uint8_t sctp_stream_reset_event;
+};
+
+/* ancillary data types */
+#define SCTP_INIT 0x0001
+#define SCTP_SNDRCV 0x0002
+#define SCTP_EXTRCV 0x0003
+/*
+ * ancillary data structures
+ */
+struct sctp_initmsg {
+    uint16_t sinit_num_ostreams;
+    uint16_t sinit_max_instreams;
+    uint16_t sinit_max_attempts;
+    uint16_t sinit_max_init_timeo;
+};
+
+/* We add 96 bytes to the size of sctp_sndrcvinfo.
+ * This makes the current structure 128 bytes long
+ * which is nicely 64-bit aligned but also has room
+ * for us to add more and keep ABI compatibility.
+ * For example, already we have the sctp_extrcvinfo
+ * when enabled which is 48 bytes.
+ */
+
+/*
+ * The assoc up needs a verfid;
+ * all sendrcvinfo's need a verfid for SENDING only.
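+ *
+ * As for the 96/80-byte padding described above, the arithmetic works
+ * out as follows (a derived check from the field sizes declared
+ * below):
+ *
+ *   sctp_sndrcvinfo: 3 x uint16_t (6) + 5 x uint32_t (20) +
+ *   sctp_assoc_t (4) = 30 bytes of payload; 30 + 96 = 126, which the
+ *   compiler pads to 128 for 4-byte tail alignment.
+ *
+ *   sctp_extrcvinfo: 48 bytes of payload + 80 = 128 exactly.
+ *
+ * Both variants therefore share one sizeof, which is what allows them
+ * to be passed interchangeably through the socket API.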
+ */ + + +#define SCTP_ALIGN_RESV_PAD 96 +#define SCTP_ALIGN_RESV_PAD_SHORT 80 + +struct sctp_sndrcvinfo { + uint16_t sinfo_stream; + uint16_t sinfo_ssn; + uint16_t sinfo_flags; + uint32_t sinfo_ppid; + uint32_t sinfo_context; + uint32_t sinfo_timetolive; + uint32_t sinfo_tsn; + uint32_t sinfo_cumtsn; + sctp_assoc_t sinfo_assoc_id; + uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD]; +}; + +struct sctp_extrcvinfo { + uint16_t sinfo_stream; + uint16_t sinfo_ssn; + uint16_t sinfo_flags; + uint16_t sinfo_pr_policy; + uint32_t sinfo_ppid; + uint32_t sinfo_context; + uint32_t sinfo_timetolive; + uint32_t sinfo_tsn; + uint32_t sinfo_cumtsn; + sctp_assoc_t sinfo_assoc_id; + uint16_t sreinfo_next_flags; + uint16_t sreinfo_next_stream; + uint32_t sreinfo_next_aid; + uint32_t sreinfo_next_length; + uint32_t sreinfo_next_ppid; + uint8_t __reserve_pad[SCTP_ALIGN_RESV_PAD_SHORT]; +}; + +#define SCTP_NO_NEXT_MSG 0x0000 +#define SCTP_NEXT_MSG_AVAIL 0x0001 +#define SCTP_NEXT_MSG_ISCOMPLETE 0x0002 +#define SCTP_NEXT_MSG_IS_UNORDERED 0x0004 +#define SCTP_NEXT_MSG_IS_NOTIFICATION 0x0008 + +struct sctp_snd_all_completes { + uint16_t sall_stream; + uint16_t sall_flags; + uint32_t sall_ppid; + uint32_t sall_context; + uint32_t sall_num_sent; + uint32_t sall_num_failed; +}; + +/* Flags that go into the sinfo->sinfo_flags field */ +#define SCTP_EOF 0x0100 /* Start shutdown procedures */ +#define SCTP_ABORT 0x0200 /* Send an ABORT to peer */ +#define SCTP_UNORDERED 0x0400 /* Message is un-ordered */ +#define SCTP_ADDR_OVER 0x0800 /* Override the primary-address */ +#define SCTP_SENDALL 0x1000 /* Send this on all associations */ +#define SCTP_EOR 0x2000 /* end of message signal */ +#define SCTP_SACK_IMMEDIATELY 0x4000 /* Set I-Bit */ + +#define INVALID_SINFO_FLAG(x) (((x) & 0xffffff00 \ + & ~(SCTP_EOF | SCTP_ABORT | SCTP_UNORDERED |\ + SCTP_ADDR_OVER | SCTP_SENDALL | SCTP_EOR |\ + SCTP_SACK_IMMEDIATELY)) != 0) +/* for the endpoint */ + +/* The lower byte is an enumeration of PR-SCTP policies */ +#define SCTP_PR_SCTP_TTL 0x0001/* Time based PR-SCTP */ +#define SCTP_PR_SCTP_BUF 0x0002/* Buffer based PR-SCTP */ +#define SCTP_PR_SCTP_RTX 0x0003/* Number of retransmissions based PR-SCTP */ + +#define PR_SCTP_POLICY(x) ((x) & 0xff) +#define PR_SCTP_ENABLED(x) (PR_SCTP_POLICY(x) != 0) +#define PR_SCTP_TTL_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_TTL) +#define PR_SCTP_BUF_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_BUF) +#define PR_SCTP_RTX_ENABLED(x) (PR_SCTP_POLICY(x) == SCTP_PR_SCTP_RTX) +#define PR_SCTP_INVALID_POLICY(x) (PR_SCTP_POLICY(x) > SCTP_PR_SCTP_RTX) +/* Stat's */ +struct sctp_pcbinfo { + uint32_t ep_count; + uint32_t asoc_count; + uint32_t laddr_count; + uint32_t raddr_count; + uint32_t chk_count; + uint32_t readq_count; + uint32_t free_chunks; + uint32_t stream_oque; +}; + +struct sctp_sockstat { + sctp_assoc_t ss_assoc_id; + uint32_t ss_total_sndbuf; + uint32_t ss_total_recv_buf; +}; + +/* + * notification event structures + */ + +/* + * association change event + */ +struct sctp_assoc_change { + uint16_t sac_type; + uint16_t sac_flags; + uint32_t sac_length; + uint16_t sac_state; + uint16_t sac_error; + uint16_t sac_outbound_streams; + uint16_t sac_inbound_streams; + sctp_assoc_t sac_assoc_id; +}; + +/* sac_state values */ +#define SCTP_COMM_UP 0x0001 +#define SCTP_COMM_LOST 0x0002 +#define SCTP_RESTART 0x0003 +#define SCTP_SHUTDOWN_COMP 0x0004 +#define SCTP_CANT_STR_ASSOC 0x0005 + + +/* + * Address event + */ +struct sctp_paddr_change { + uint16_t spc_type; + uint16_t spc_flags; + uint32_t spc_length; + 
struct sockaddr_storage spc_aaddr; + uint32_t spc_state; + uint32_t spc_error; + sctp_assoc_t spc_assoc_id; + uint8_t spc_padding[4]; +}; + +/* paddr state values */ +#define SCTP_ADDR_AVAILABLE 0x0001 +#define SCTP_ADDR_UNREACHABLE 0x0002 +#define SCTP_ADDR_REMOVED 0x0003 +#define SCTP_ADDR_ADDED 0x0004 +#define SCTP_ADDR_MADE_PRIM 0x0005 +#define SCTP_ADDR_CONFIRMED 0x0006 + +/* + * CAUTION: these are user exposed SCTP addr reachability states must be + * compatible with SCTP_ADDR states in sctp_constants.h + */ +#ifdef SCTP_ACTIVE +#undef SCTP_ACTIVE +#endif +#define SCTP_ACTIVE 0x0001 /* SCTP_ADDR_REACHABLE */ + +#ifdef SCTP_INACTIVE +#undef SCTP_INACTIVE +#endif +#define SCTP_INACTIVE 0x0002 /* SCTP_ADDR_NOT_REACHABLE */ + +#ifdef SCTP_UNCONFIRMED +#undef SCTP_UNCONFIRMED +#endif +#define SCTP_UNCONFIRMED 0x0200 /* SCTP_ADDR_UNCONFIRMED */ + +#ifdef SCTP_NOHEARTBEAT +#undef SCTP_NOHEARTBEAT +#endif +#define SCTP_NOHEARTBEAT 0x0040 /* SCTP_ADDR_NOHB */ + + +/* remote error events */ +struct sctp_remote_error { + uint16_t sre_type; + uint16_t sre_flags; + uint32_t sre_length; + uint16_t sre_error; + sctp_assoc_t sre_assoc_id; + uint8_t sre_data[4]; +}; + +/* data send failure event */ +struct sctp_send_failed { + uint16_t ssf_type; + uint16_t ssf_flags; + uint32_t ssf_length; + uint32_t ssf_error; + struct sctp_sndrcvinfo ssf_info; + sctp_assoc_t ssf_assoc_id; + uint8_t ssf_data[]; +}; + +/* flag that indicates state of data */ +#define SCTP_DATA_UNSENT 0x0001 /* inqueue never on wire */ +#define SCTP_DATA_SENT 0x0002 /* on wire at failure */ + +/* shutdown event */ +struct sctp_shutdown_event { + uint16_t sse_type; + uint16_t sse_flags; + uint32_t sse_length; + sctp_assoc_t sse_assoc_id; +}; + +/* Adaptation layer indication stuff */ +struct sctp_adaptation_event { + uint16_t sai_type; + uint16_t sai_flags; + uint32_t sai_length; + uint32_t sai_adaptation_ind; + sctp_assoc_t sai_assoc_id; +}; + +struct sctp_setadaptation { + uint32_t ssb_adaptation_ind; +}; + +/* compatible old spelling */ +struct sctp_adaption_event { + uint16_t sai_type; + uint16_t sai_flags; + uint32_t sai_length; + uint32_t sai_adaption_ind; + sctp_assoc_t sai_assoc_id; +}; + +struct sctp_setadaption { + uint32_t ssb_adaption_ind; +}; + + +/* + * Partial Delivery API event + */ +struct sctp_pdapi_event { + uint16_t pdapi_type; + uint16_t pdapi_flags; + uint32_t pdapi_length; + uint32_t pdapi_indication; + uint16_t pdapi_stream; + uint16_t pdapi_seq; + sctp_assoc_t pdapi_assoc_id; +}; + +/* indication values */ +#define SCTP_PARTIAL_DELIVERY_ABORTED 0x0001 + + +/* + * authentication key event + */ +struct sctp_authkey_event { + uint16_t auth_type; + uint16_t auth_flags; + uint32_t auth_length; + uint16_t auth_keynumber; + uint16_t auth_altkeynumber; + uint32_t auth_indication; + sctp_assoc_t auth_assoc_id; +}; + +/* indication values */ +#define SCTP_AUTH_NEWKEY 0x0001 +#define SCTP_AUTH_NO_AUTH 0x0002 +#define SCTP_AUTH_FREE_KEY 0x0003 + + +struct sctp_sender_dry_event { + uint16_t sender_dry_type; + uint16_t sender_dry_flags; + uint32_t sender_dry_length; + sctp_assoc_t sender_dry_assoc_id; +}; + + +/* + * stream reset event + */ +struct sctp_stream_reset_event { + uint16_t strreset_type; + uint16_t strreset_flags; + uint32_t strreset_length; + sctp_assoc_t strreset_assoc_id; + uint16_t strreset_list[]; +}; + +/* flags in strreset_flags field */ +#define SCTP_STRRESET_INBOUND_STR 0x0001 +#define SCTP_STRRESET_OUTBOUND_STR 0x0002 +#define SCTP_STRRESET_ALL_STREAMS 0x0004 +#define SCTP_STRRESET_STREAM_LIST 0x0008 
+#define SCTP_STRRESET_FAILED 0x0010 +#define SCTP_STRRESET_ADD_STREAM 0x0020 + +/* SCTP notification event */ +struct sctp_tlv { + uint16_t sn_type; + uint16_t sn_flags; + uint32_t sn_length; +}; + +union sctp_notification { + struct sctp_tlv sn_header; + struct sctp_assoc_change sn_assoc_change; + struct sctp_paddr_change sn_paddr_change; + struct sctp_remote_error sn_remote_error; + struct sctp_send_failed sn_send_failed; + struct sctp_shutdown_event sn_shutdown_event; + struct sctp_adaptation_event sn_adaptation_event; + /* compatibility same as above */ + struct sctp_adaption_event sn_adaption_event; + struct sctp_pdapi_event sn_pdapi_event; + struct sctp_authkey_event sn_auth_event; + struct sctp_sender_dry_event sn_sender_dry_event; + struct sctp_stream_reset_event sn_strreset_event; +}; + +/* notification types */ +#define SCTP_ASSOC_CHANGE 0x0001 +#define SCTP_PEER_ADDR_CHANGE 0x0002 +#define SCTP_REMOTE_ERROR 0x0003 +#define SCTP_SEND_FAILED 0x0004 +#define SCTP_SHUTDOWN_EVENT 0x0005 +#define SCTP_ADAPTATION_INDICATION 0x0006 +/* same as above */ +#define SCTP_ADAPTION_INDICATION 0x0006 +#define SCTP_PARTIAL_DELIVERY_EVENT 0x0007 +#define SCTP_AUTHENTICATION_EVENT 0x0008 +#define SCTP_STREAM_RESET_EVENT 0x0009 +#define SCTP_SENDER_DRY_EVENT 0x000a +#define SCTP__NOTIFICATIONS_STOPPED_EVENT 0x000b /* we don't send this */ +/* + * socket option structs + */ + +struct sctp_paddrparams { + struct sockaddr_storage spp_address; + sctp_assoc_t spp_assoc_id; + uint32_t spp_hbinterval; + uint32_t spp_pathmtu; + uint32_t spp_flags; + uint32_t spp_ipv6_flowlabel; + uint16_t spp_pathmaxrxt; + uint8_t spp_ipv4_tos; +}; + +#define SPP_HB_ENABLE 0x00000001 +#define SPP_HB_DISABLE 0x00000002 +#define SPP_HB_DEMAND 0x00000004 +#define SPP_PMTUD_ENABLE 0x00000008 +#define SPP_PMTUD_DISABLE 0x00000010 +#define SPP_HB_TIME_IS_ZERO 0x00000080 +#define SPP_IPV6_FLOWLABEL 0x00000100 +#define SPP_IPV4_TOS 0x00000200 + +struct sctp_paddrinfo { + struct sockaddr_storage spinfo_address; + sctp_assoc_t spinfo_assoc_id; + int32_t spinfo_state; + uint32_t spinfo_cwnd; + uint32_t spinfo_srtt; + uint32_t spinfo_rto; + uint32_t spinfo_mtu; +}; + +struct sctp_rtoinfo { + sctp_assoc_t srto_assoc_id; + uint32_t srto_initial; + uint32_t srto_max; + uint32_t srto_min; +}; + +struct sctp_assocparams { + sctp_assoc_t sasoc_assoc_id; + uint32_t sasoc_peer_rwnd; + uint32_t sasoc_local_rwnd; + uint32_t sasoc_cookie_life; + uint16_t sasoc_asocmaxrxt; + uint16_t sasoc_number_peer_destinations; +}; + +struct sctp_setprim { + struct sockaddr_storage ssp_addr; + sctp_assoc_t ssp_assoc_id; + uint8_t ssp_padding[4]; +}; + +struct sctp_setpeerprim { + struct sockaddr_storage sspp_addr; + sctp_assoc_t sspp_assoc_id; + uint8_t sspp_padding[4]; +}; + +struct sctp_getaddresses { + sctp_assoc_t sget_assoc_id; + /* addr is filled in for N * sockaddr_storage */ + struct sockaddr addr[1]; +}; + +struct sctp_setstrm_timeout { + sctp_assoc_t ssto_assoc_id; + uint32_t ssto_timeout; + uint32_t ssto_streamid_start; + uint32_t ssto_streamid_end; +}; + +struct sctp_status { + sctp_assoc_t sstat_assoc_id; + int32_t sstat_state; + uint32_t sstat_rwnd; + uint16_t sstat_unackdata; + uint16_t sstat_penddata; + uint16_t sstat_instrms; + uint16_t sstat_outstrms; + uint32_t sstat_fragmentation_point; + struct sctp_paddrinfo sstat_primary; +}; + +/* + * AUTHENTICATION support + */ +/* SCTP_AUTH_CHUNK */ +struct sctp_authchunk { + uint8_t sauth_chunk; +}; + +/* SCTP_AUTH_KEY */ +struct sctp_authkey { + sctp_assoc_t sca_assoc_id; + uint16_t sca_keynumber; 
+ uint8_t sca_key[]; +}; + +/* SCTP_HMAC_IDENT */ +struct sctp_hmacalgo { + uint32_t shmac_number_of_idents; + uint16_t shmac_idents[]; +}; + +/* AUTH hmac_id */ +#define SCTP_AUTH_HMAC_ID_RSVD 0x0000 +#define SCTP_AUTH_HMAC_ID_SHA1 0x0001 /* default, mandatory */ +#define SCTP_AUTH_HMAC_ID_SHA256 0x0003 +#define SCTP_AUTH_HMAC_ID_SHA224 0x0004 +#define SCTP_AUTH_HMAC_ID_SHA384 0x0005 +#define SCTP_AUTH_HMAC_ID_SHA512 0x0006 + + +/* SCTP_AUTH_ACTIVE_KEY / SCTP_AUTH_DELETE_KEY */ +struct sctp_authkeyid { + sctp_assoc_t scact_assoc_id; + uint16_t scact_keynumber; +}; + +/* SCTP_PEER_AUTH_CHUNKS / SCTP_LOCAL_AUTH_CHUNKS */ +struct sctp_authchunks { + sctp_assoc_t gauth_assoc_id; + uint8_t gauth_chunks[]; +}; + +struct sctp_assoc_value { + sctp_assoc_t assoc_id; + uint32_t assoc_value; +}; + +struct sctp_assoc_ids { + uint32_t gaids_number_of_ids; + sctp_assoc_t gaids_assoc_id[]; +}; + +struct sctp_sack_info { + sctp_assoc_t sack_assoc_id; + uint32_t sack_delay; + uint32_t sack_freq; +}; + +struct sctp_timeouts { + sctp_assoc_t stimo_assoc_id; + uint32_t stimo_init; + uint32_t stimo_data; + uint32_t stimo_sack; + uint32_t stimo_shutdown; + uint32_t stimo_heartbeat; + uint32_t stimo_cookie; + uint32_t stimo_shutdownack; +}; + +struct sctp_cwnd_args { + struct sctp_nets *net; /* network to *//* FIXME: LP64 issue */ + uint32_t cwnd_new_value;/* cwnd in k */ + uint32_t pseudo_cumack; + uint16_t inflight; /* flightsize in k */ + uint16_t cwnd_augment; /* increment to it */ + uint8_t meets_pseudo_cumack; + uint8_t need_new_pseudo_cumack; + uint8_t cnt_in_send; + uint8_t cnt_in_str; +}; + +struct sctp_blk_args { + uint32_t onsb; /* in 1k bytes */ + uint32_t sndlen; /* len of send being attempted */ + uint32_t peer_rwnd; /* rwnd of peer */ + uint16_t send_sent_qcnt;/* chnk cnt */ + uint16_t stream_qcnt; /* chnk cnt */ + uint16_t chunks_on_oque;/* chunks out */ + uint16_t flight_size; /* flight size in k */ +}; + +/* + * Max we can reset in one setting, note this is dictated not by the define + * but the size of a mbuf cluster so don't change this define and think you + * can specify more. You must do multiple resets if you want to reset more + * than SCTP_MAX_EXPLICIT_STR_RESET. 
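+ * (Rough arithmetic: with MCLBYTES = 2048, one cluster holds about
+ * 1024 16-bit stream numbers; after chunk and parameter headers are
+ * accounted for, roughly a thousand remain, hence the cap of 1000
+ * defined below.)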
+ */ +#define SCTP_MAX_EXPLICT_STR_RESET 1000 + +#define SCTP_RESET_LOCAL_RECV 0x0001 +#define SCTP_RESET_LOCAL_SEND 0x0002 +#define SCTP_RESET_BOTH 0x0003 +#define SCTP_RESET_TSN 0x0004 +#define SCTP_RESET_ADD_STREAMS 0x0005 + +struct sctp_stream_reset { + sctp_assoc_t strrst_assoc_id; + uint16_t strrst_flags; + uint16_t strrst_num_streams; /* 0 == ALL */ + uint16_t strrst_list[]; /* list if strrst_num_streams is not 0 */ +}; + + +struct sctp_get_nonce_values { + sctp_assoc_t gn_assoc_id; + uint32_t gn_peers_tag; + uint32_t gn_local_tag; +}; + +/* Debugging logs */ +struct sctp_str_log { + void *stcb; /* FIXME: LP64 issue */ + uint32_t n_tsn; + uint32_t e_tsn; + uint16_t n_sseq; + uint16_t e_sseq; + uint16_t strm; +}; + +struct sctp_sb_log { + void *stcb; /* FIXME: LP64 issue */ + uint32_t so_sbcc; + uint32_t stcb_sbcc; + uint32_t incr; +}; + +struct sctp_fr_log { + uint32_t largest_tsn; + uint32_t largest_new_tsn; + uint32_t tsn; +}; + +struct sctp_fr_map { + uint32_t base; + uint32_t cum; + uint32_t high; +}; + +struct sctp_rwnd_log { + uint32_t rwnd; + uint32_t send_size; + uint32_t overhead; + uint32_t new_rwnd; +}; + +struct sctp_mbcnt_log { + uint32_t total_queue_size; + uint32_t size_change; + uint32_t total_queue_mb_size; + uint32_t mbcnt_change; +}; + +struct sctp_sack_log { + uint32_t cumack; + uint32_t oldcumack; + uint32_t tsn; + uint16_t numGaps; + uint16_t numDups; +}; + +struct sctp_lock_log { + void *sock; /* FIXME: LP64 issue */ + void *inp; /* FIXME: LP64 issue */ + uint8_t tcb_lock; + uint8_t inp_lock; + uint8_t info_lock; + uint8_t sock_lock; + uint8_t sockrcvbuf_lock; + uint8_t socksndbuf_lock; + uint8_t create_lock; + uint8_t resv; +}; + +struct sctp_rto_log { + void *net; /* FIXME: LP64 issue */ + uint32_t rtt; +}; + +struct sctp_nagle_log { + void *stcb; /* FIXME: LP64 issue */ + uint32_t total_flight; + uint32_t total_in_queue; + uint16_t count_in_queue; + uint16_t count_in_flight; +}; + +struct sctp_sbwake_log { + void *stcb; /* FIXME: LP64 issue */ + uint16_t send_q; + uint16_t sent_q; + uint16_t flight; + uint16_t wake_cnt; + uint8_t stream_qcnt; /* chnk cnt */ + uint8_t chunks_on_oque; /* chunks out */ + uint8_t sbflags; + uint8_t sctpflags; +}; + +struct sctp_misc_info { + uint32_t log1; + uint32_t log2; + uint32_t log3; + uint32_t log4; +}; + +struct sctp_log_closing { + void *inp; /* FIXME: LP64 issue */ + void *stcb; /* FIXME: LP64 issue */ + uint32_t sctp_flags; + uint16_t state; + int16_t loc; +}; + +struct sctp_mbuf_log { + struct mbuf *mp; /* FIXME: LP64 issue */ + caddr_t ext; + caddr_t data; + uint16_t size; + uint8_t refcnt; + uint8_t mbuf_flags; +}; + +struct sctp_cwnd_log { + uint64_t time_event; + uint8_t from; + uint8_t event_type; + uint8_t resv[2]; + union { + struct sctp_log_closing close; + struct sctp_blk_args blk; + struct sctp_cwnd_args cwnd; + struct sctp_str_log strlog; + struct sctp_fr_log fr; + struct sctp_fr_map map; + struct sctp_rwnd_log rwnd; + struct sctp_mbcnt_log mbcnt; + struct sctp_sack_log sack; + struct sctp_lock_log lock; + struct sctp_rto_log rto; + struct sctp_sb_log sb; + struct sctp_nagle_log nagle; + struct sctp_sbwake_log wake; + struct sctp_mbuf_log mb; + struct sctp_misc_info misc; + } x; +}; + +struct sctp_cwnd_log_req { + int32_t num_in_log; /* Number in log */ + int32_t num_ret; /* Number returned */ + int32_t start_at; /* start at this one */ + int32_t end_at; /* end at this one */ + struct sctp_cwnd_log log[]; +}; + +struct sctp_timeval { + uint32_t tv_sec; + uint32_t tv_usec; +}; + +struct sctpstat { + struct 
sctp_timeval sctps_discontinuitytime; /* sctpStats 18 + * (TimeStamp) */ + /* MIB according to RFC 3873 */ + uint32_t sctps_currestab; /* sctpStats 1 (Gauge32) */ + uint32_t sctps_activeestab; /* sctpStats 2 (Counter32) */ + uint32_t sctps_restartestab; + uint32_t sctps_collisionestab; + uint32_t sctps_passiveestab; /* sctpStats 3 (Counter32) */ + uint32_t sctps_aborted; /* sctpStats 4 (Counter32) */ + uint32_t sctps_shutdown;/* sctpStats 5 (Counter32) */ + uint32_t sctps_outoftheblue; /* sctpStats 6 (Counter32) */ + uint32_t sctps_checksumerrors; /* sctpStats 7 (Counter32) */ + uint32_t sctps_outcontrolchunks; /* sctpStats 8 (Counter64) */ + uint32_t sctps_outorderchunks; /* sctpStats 9 (Counter64) */ + uint32_t sctps_outunorderchunks; /* sctpStats 10 (Counter64) */ + uint32_t sctps_incontrolchunks; /* sctpStats 11 (Counter64) */ + uint32_t sctps_inorderchunks; /* sctpStats 12 (Counter64) */ + uint32_t sctps_inunorderchunks; /* sctpStats 13 (Counter64) */ + uint32_t sctps_fragusrmsgs; /* sctpStats 14 (Counter64) */ + uint32_t sctps_reasmusrmsgs; /* sctpStats 15 (Counter64) */ + uint32_t sctps_outpackets; /* sctpStats 16 (Counter64) */ + uint32_t sctps_inpackets; /* sctpStats 17 (Counter64) */ + + /* input statistics: */ + uint32_t sctps_recvpackets; /* total input packets */ + uint32_t sctps_recvdatagrams; /* total input datagrams */ + uint32_t sctps_recvpktwithdata; /* total packets that had data */ + uint32_t sctps_recvsacks; /* total input SACK chunks */ + uint32_t sctps_recvdata;/* total input DATA chunks */ + uint32_t sctps_recvdupdata; /* total input duplicate DATA chunks */ + uint32_t sctps_recvheartbeat; /* total input HB chunks */ + uint32_t sctps_recvheartbeatack; /* total input HB-ACK chunks */ + uint32_t sctps_recvecne;/* total input ECNE chunks */ + uint32_t sctps_recvauth;/* total input AUTH chunks */ + uint32_t sctps_recvauthmissing; /* total input chunks missing AUTH */ + uint32_t sctps_recvivalhmacid; /* total number of invalid HMAC ids + * received */ + uint32_t sctps_recvivalkeyid; /* total number of invalid secret ids + * received */ + uint32_t sctps_recvauthfailed; /* total number of auth failed */ + uint32_t sctps_recvexpress; /* total fast path receives all one + * chunk */ + uint32_t sctps_recvexpressm; /* total fast path multi-part data */ + uint32_t sctps_recvnocrc; + uint32_t sctps_recvswcrc; + uint32_t sctps_recvhwcrc; + + /* output statistics: */ + uint32_t sctps_sendpackets; /* total output packets */ + uint32_t sctps_sendsacks; /* total output SACKs */ + uint32_t sctps_senddata;/* total output DATA chunks */ + uint32_t sctps_sendretransdata; /* total output retransmitted DATA + * chunks */ + uint32_t sctps_sendfastretrans; /* total output fast retransmitted + * DATA chunks */ + uint32_t sctps_sendmultfastretrans; /* total FR's that happened + * more than once to same + * chunk (u-del multi-fr + * algo). 
*/ + uint32_t sctps_sendheartbeat; /* total output HB chunks */ + uint32_t sctps_sendecne;/* total output ECNE chunks */ + uint32_t sctps_sendauth;/* total output AUTH chunks FIXME */ + uint32_t sctps_senderrors; /* ip_output error counter */ + uint32_t sctps_sendnocrc; + uint32_t sctps_sendswcrc; + uint32_t sctps_sendhwcrc; + /* PCKDROPREP statistics: */ + uint32_t sctps_pdrpfmbox; /* Packet drop from middle box */ + uint32_t sctps_pdrpfehos; /* P-drop from end host */ + uint32_t sctps_pdrpmbda;/* P-drops with data */ + uint32_t sctps_pdrpmbct;/* P-drops, non-data, non-endhost */ + uint32_t sctps_pdrpbwrpt; /* P-drop, non-endhost, bandwidth rep + * only */ + uint32_t sctps_pdrpcrupt; /* P-drop, not enough for chunk header */ + uint32_t sctps_pdrpnedat; /* P-drop, not enough data to confirm */ + uint32_t sctps_pdrppdbrk; /* P-drop, where process_chunk_drop + * said break */ + uint32_t sctps_pdrptsnnf; /* P-drop, could not find TSN */ + uint32_t sctps_pdrpdnfnd; /* P-drop, attempt reverse TSN lookup */ + uint32_t sctps_pdrpdiwnp; /* P-drop, e-host confirms zero-rwnd */ + uint32_t sctps_pdrpdizrw; /* P-drop, midbox confirms no space */ + uint32_t sctps_pdrpbadd;/* P-drop, data did not match TSN */ + uint32_t sctps_pdrpmark;/* P-drop, TSN's marked for Fast Retran */ + /* timeouts */ + uint32_t sctps_timoiterator; /* Number of iterator timers that + * fired */ + uint32_t sctps_timodata;/* Number of T3 data time outs */ + uint32_t sctps_timowindowprobe; /* Number of window probe (T3) timers + * that fired */ + uint32_t sctps_timoinit;/* Number of INIT timers that fired */ + uint32_t sctps_timosack;/* Number of sack timers that fired */ + uint32_t sctps_timoshutdown; /* Number of shutdown timers that + * fired */ + uint32_t sctps_timoheartbeat; /* Number of heartbeat timers that + * fired */ + uint32_t sctps_timocookie; /* Number of times a cookie timeout + * fired */ + uint32_t sctps_timosecret; /* Number of times an endpoint changed + * its cookie secret */ + uint32_t sctps_timopathmtu; /* Number of PMTU timers that fired */ + uint32_t sctps_timoshutdownack; /* Number of shutdown ack timers that + * fired */ + uint32_t sctps_timoshutdownguard; /* Number of shutdown guard + * timers that fired */ + uint32_t sctps_timostrmrst; /* Number of stream reset timers that + * fired */ + uint32_t sctps_timoearlyfr; /* Number of early FR timers that + * fired */ + uint32_t sctps_timoasconf; /* Number of times an asconf timer + * fired */ + uint32_t sctps_timodelprim; /* Number of times a prim_deleted + * timer fired */ + uint32_t sctps_timoautoclose; /* Number of times auto close timer + * fired */ + uint32_t sctps_timoassockill; /* Number of asoc free timers expired */ + uint32_t sctps_timoinpkill; /* Number of inp free timers expired */ + /* Early fast retransmission counters */ + uint32_t sctps_earlyfrstart; + uint32_t sctps_earlyfrstop; + uint32_t sctps_earlyfrmrkretrans; + uint32_t sctps_earlyfrstpout; + uint32_t sctps_earlyfrstpidsck1; + uint32_t sctps_earlyfrstpidsck2; + uint32_t sctps_earlyfrstpidsck3; + uint32_t sctps_earlyfrstpidsck4; + uint32_t sctps_earlyfrstrid; + uint32_t sctps_earlyfrstrout; + uint32_t sctps_earlyfrstrtmr; + /* others */ + uint32_t sctps_hdrops; /* packet shorter than header */ + uint32_t sctps_badsum; /* checksum error */ + uint32_t sctps_noport; /* no endpoint for port */ + uint32_t sctps_badvtag; /* bad v-tag */ + uint32_t sctps_badsid; /* bad SID */ + uint32_t sctps_nomem; /* no memory */ + uint32_t sctps_fastretransinrtt; /* number of multiple FR in a + * RTT window */ 
+ uint32_t sctps_markedretrans; + uint32_t sctps_naglesent; /* nagle allowed sending */ + uint32_t sctps_naglequeued; /* nagle doesn't allow sending */ + uint32_t sctps_maxburstqueued; /* max burst doesn't allow sending */ + uint32_t sctps_ifnomemqueued; /* look ahead tells us no memory in + * interface ring buffer OR we had a + * send error and are queuing one + * send. */ + uint32_t sctps_windowprobed; /* total number of window probes sent */ + uint32_t sctps_lowlevelerr; /* total times an output error causes + * us to clamp down on next user send. */ + uint32_t sctps_lowlevelerrusr; /* total times sctp_senderrors were + * caused from a user send from a user + * invoked send not a sack response */ + uint32_t sctps_datadropchklmt; /* Number of in data drops due to + * chunk limit reached */ + uint32_t sctps_datadroprwnd; /* Number of in data drops due to rwnd + * limit reached */ + uint32_t sctps_ecnereducedcwnd; /* Number of times a ECN reduced the + * cwnd */ + uint32_t sctps_vtagexpress; /* Used express lookup via vtag */ + uint32_t sctps_vtagbogus; /* Collision in express lookup. */ + uint32_t sctps_primary_randry; /* Number of times the sender ran dry + * of user data on primary */ + uint32_t sctps_cmt_randry; /* Same for above */ + uint32_t sctps_slowpath_sack; /* Sacks the slow way */ + uint32_t sctps_wu_sacks_sent; /* Window Update only sacks sent */ + uint32_t sctps_sends_with_flags; /* number of sends with + * sinfo_flags !=0 */ + uint32_t sctps_sends_with_unord; /* number of unordered sends */ + uint32_t sctps_sends_with_eof; /* number of sends with EOF flag set */ + uint32_t sctps_sends_with_abort; /* number of sends with ABORT + * flag set */ + uint32_t sctps_protocol_drain_calls; /* number of times protocol + * drain called */ + uint32_t sctps_protocol_drains_done; /* number of times we did a + * protocol drain */ + uint32_t sctps_read_peeks; /* Number of times recv was called + * with peek */ + uint32_t sctps_cached_chk; /* Number of cached chunks used */ + uint32_t sctps_cached_strmoq; /* Number of cached stream oq's used */ + uint32_t sctps_left_abandon; /* Number of unread messages abandoned + * by close */ + uint32_t sctps_send_burst_avoid; /* Unused */ + uint32_t sctps_send_cwnd_avoid; /* Send cwnd full avoidance, already + * max burst inflight to net */ + uint32_t sctps_fwdtsn_map_over; /* number of map array over-runs via + * fwd-tsn's */ + + uint32_t sctps_reserved[32]; /* Future ABI compat - remove int's + * from here when adding new */ +}; + +#define SCTP_STAT_INCR(_x) SCTP_STAT_INCR_BY(_x,1) +#define SCTP_STAT_DECR(_x) SCTP_STAT_DECR_BY(_x,1) +#if defined(__FreeBSD__) && defined(SMP) && defined(SCTP_USE_PERCPU_STAT) +#define SCTP_STAT_INCR_BY(_x,_d) (SCTP_BASE_STATS[PCPU_GET(cpuid)]._x += _d) +#define SCTP_STAT_DECR_BY(_x,_d) (SCTP_BASE_STATS[PCPU_GET(cpuid)]._x -= _d) +#else +#define SCTP_STAT_INCR_BY(_x,_d) atomic_add_int(&SCTP_BASE_STAT(_x), _d) +#define SCTP_STAT_DECR_BY(_x,_d) atomic_subtract_int(&SCTP_BASE_STAT(_x), _d) +#endif +/* The following macros are for handling MIB values, */ +#define SCTP_STAT_INCR_COUNTER32(_x) SCTP_STAT_INCR(_x) +#define SCTP_STAT_INCR_COUNTER64(_x) SCTP_STAT_INCR(_x) +#define SCTP_STAT_INCR_GAUGE32(_x) SCTP_STAT_INCR(_x) +#define SCTP_STAT_DECR_COUNTER32(_x) SCTP_STAT_DECR(_x) +#define SCTP_STAT_DECR_COUNTER64(_x) SCTP_STAT_DECR(_x) +#define SCTP_STAT_DECR_GAUGE32(_x) SCTP_STAT_DECR(_x) + +union sctp_sockstore { +#if defined(INET) || !defined(_KERNEL) + struct sockaddr_in sin; +#endif +#if defined(INET6) || !defined(_KERNEL) + 
struct sockaddr_in6 sin6; +#endif + struct sockaddr sa; +}; + + +/***********************************/ +/* And something for us old timers */ +/***********************************/ + +#ifndef ntohll +#include +#define ntohll(x) be64toh(x) +#endif + +#ifndef htonll +#include +#define htonll(x) htobe64(x) +#endif +/***********************************/ + + +struct xsctp_inpcb { + uint32_t last; + uint32_t flags; + uint32_t features; + uint32_t total_sends; + uint32_t total_recvs; + uint32_t total_nospaces; + uint32_t fragmentation_point; + uint16_t local_port; + uint16_t qlen; + uint16_t maxqlen; + uint32_t extra_padding[32]; /* future */ +}; + +struct xsctp_tcb { + union sctp_sockstore primary_addr; /* sctpAssocEntry 5/6 */ + uint32_t last; + uint32_t heartbeat_interval; /* sctpAssocEntry 7 */ + uint32_t state; /* sctpAssocEntry 8 */ + uint32_t in_streams; /* sctpAssocEntry 9 */ + uint32_t out_streams; /* sctpAssocEntry 10 */ + uint32_t max_nr_retrans;/* sctpAssocEntry 11 */ + uint32_t primary_process; /* sctpAssocEntry 12 */ + uint32_t T1_expireries; /* sctpAssocEntry 13 */ + uint32_t T2_expireries; /* sctpAssocEntry 14 */ + uint32_t retransmitted_tsns; /* sctpAssocEntry 15 */ + uint32_t total_sends; + uint32_t total_recvs; + uint32_t local_tag; + uint32_t remote_tag; + uint32_t initial_tsn; + uint32_t highest_tsn; + uint32_t cumulative_tsn; + uint32_t cumulative_tsn_ack; + uint32_t mtu; + uint32_t refcnt; + uint16_t local_port; /* sctpAssocEntry 3 */ + uint16_t remote_port; /* sctpAssocEntry 4 */ + struct sctp_timeval start_time; /* sctpAssocEntry 16 */ + struct sctp_timeval discontinuity_time; /* sctpAssocEntry 17 */ + uint32_t peers_rwnd; + sctp_assoc_t assoc_id; /* sctpAssocEntry 1 */ + uint32_t extra_padding[32]; /* future */ +}; + +struct xsctp_laddr { + union sctp_sockstore address; /* sctpAssocLocalAddrEntry 1/2 */ + uint32_t last; + struct sctp_timeval start_time; /* sctpAssocLocalAddrEntry 3 */ + uint32_t extra_padding[32]; /* future */ +}; + +struct xsctp_raddr { + union sctp_sockstore address; /* sctpAssocLocalRemEntry 1/2 */ + uint32_t last; + uint32_t rto; /* sctpAssocLocalRemEntry 5 */ + uint32_t max_path_rtx; /* sctpAssocLocalRemEntry 6 */ + uint32_t rtx; /* sctpAssocLocalRemEntry 7 */ + uint32_t error_counter; /* */ + uint32_t cwnd; /* */ + uint32_t flight_size; /* */ + uint32_t mtu; /* */ + uint8_t active; /* sctpAssocLocalRemEntry 3 */ + uint8_t confirmed; /* */ + uint8_t heartbeat_enabled; /* sctpAssocLocalRemEntry 4 */ + struct sctp_timeval start_time; /* sctpAssocLocalRemEntry 8 */ + uint32_t rtt; + uint32_t extra_padding[32]; /* future */ +}; + +#define SCTP_MAX_LOGGING_SIZE 30000 +#define SCTP_TRACE_PARAMS 6 /* This number MUST be even */ + +struct sctp_log_entry { + uint64_t timestamp; + uint32_t subsys; + uint32_t padding; + uint32_t params[SCTP_TRACE_PARAMS]; +}; + +struct sctp_log { + struct sctp_log_entry entry[SCTP_MAX_LOGGING_SIZE]; + uint32_t index; + uint32_t padding; +}; + +/* + * Kernel defined for sctp_send + */ +#if defined(_KERNEL) || defined(__Userspace__) +int +sctp_lower_sosend(struct socket *so, + struct sockaddr *addr, + struct uio *uio, + struct mbuf *i_pak, + struct mbuf *control, + int flags, + struct sctp_sndrcvinfo *srcv + ,struct thread *p +); + +int +sctp_sorecvmsg(struct socket *so, + struct uio *uio, + struct mbuf **mp, + struct sockaddr *from, + int fromlen, + int *msg_flags, + struct sctp_sndrcvinfo *sinfo, + int filling_sinfo); + +#endif + +/* + * API system calls + */ +#if !(defined(_KERNEL)) && !(defined(__Userspace__)) + 
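+/*
+ * A minimal sketch of how the calls declared below fit together for a
+ * one-to-one (SOCK_STREAM) style SCTP socket with an already-filled
+ * IPv4 peer address; error handling is elided, the function name is
+ * purely illustrative, and the block is kept under #if 0 so it is
+ * never compiled into consumers of this header:
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/sctp.h>
+#include <unistd.h>
+
+static void
+sctp_uio_example(struct sockaddr_in *peer)
+{
+        char msg[] = "hello";
+        int sd;
+
+        sd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
+        /* one destination address, association id not wanted */
+        (void)sctp_connectx(sd, (struct sockaddr *)peer, 1, NULL);
+        /* ppid 0, flags 0, stream 0, unlimited ttl, context 0 */
+        (void)sctp_sendmsg(sd, msg, sizeof(msg), (struct sockaddr *)peer,
+            sizeof(*peer), 0, 0, 0, 0, 0);
+        close(sd);
+}
+#endif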
+__BEGIN_DECLS +int sctp_peeloff __P((int, sctp_assoc_t)); +int sctp_bindx __P((int, struct sockaddr *, int, int)); +int sctp_connectx __P((int, const struct sockaddr *, int, sctp_assoc_t *)); +int sctp_getaddrlen __P((sa_family_t)); +int sctp_getpaddrs __P((int, sctp_assoc_t, struct sockaddr **)); +void sctp_freepaddrs __P((struct sockaddr *)); +int sctp_getladdrs __P((int, sctp_assoc_t, struct sockaddr **)); +void sctp_freeladdrs __P((struct sockaddr *)); +int sctp_opt_info __P((int, sctp_assoc_t, int, void *, socklen_t *)); + +ssize_t sctp_sendmsg +__P((int, const void *, size_t, + const struct sockaddr *, + socklen_t, uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); + + ssize_t sctp_send __P((int sd, const void *msg, size_t len, + const struct sctp_sndrcvinfo *sinfo, int flags)); + + ssize_t sctp_sendx __P((int sd, const void *msg, size_t len, + struct sockaddr *addrs, int addrcnt, + struct sctp_sndrcvinfo *sinfo, int flags)); + + ssize_t sctp_sendmsgx __P((int sd, const void *, size_t, + struct sockaddr *, int, + uint32_t, uint32_t, uint16_t, uint32_t, uint32_t)); + + sctp_assoc_t sctp_getassocid __P((int sd, struct sockaddr *sa)); + + ssize_t sctp_recvmsg __P((int, void *, size_t, struct sockaddr *, + socklen_t *, struct sctp_sndrcvinfo *, int *)); + +__END_DECLS + +#endif /* !_KERNEL */ +#endif /* !__sctp_uio_h__ */ diff --git a/freebsd/sys/netinet/sctp_usrreq.c b/freebsd/sys/netinet/sctp_usrreq.c new file mode 100644 index 00000000..bb60795c --- /dev/null +++ b/freebsd/sys/netinet/sctp_usrreq.c @@ -0,0 +1,4918 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctp_usrreq.c,v 1.48 2005/03/07 23:26:08 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include +#include +#if defined(INET6) +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + + +void +sctp_init(void) +{ + u_long sb_max_adj; + + bzero(&SCTP_BASE_STATS, sizeof(struct sctpstat)); + + /* Initialize and modify the sysctled variables */ + sctp_init_sysctls(); + if ((nmbclusters / 8) > SCTP_ASOC_MAX_CHUNKS_ON_QUEUE) + SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = (nmbclusters / 8); + /* + * Allow a user to take no more than 1/2 the number of clusters or + * the SB_MAX whichever is smaller for the send window. + */ + sb_max_adj = (u_long)((u_quad_t) (SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES)); + SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj, + (((uint32_t) nmbclusters / 2) * SCTP_DEFAULT_MAXSEGMENT)); + /* + * Now for the recv window, should we take the same amount? or + * should I do 1/2 the SB_MAX instead in the SB_MAX min above. For + * now I will just copy. + */ + SCTP_BASE_SYSCTL(sctp_recvspace) = SCTP_BASE_SYSCTL(sctp_sendspace); + + SCTP_BASE_VAR(first_time) = 0; + SCTP_BASE_VAR(sctp_pcb_initialized) = 0; + sctp_pcb_init(); +#if defined(SCTP_PACKET_LOGGING) + SCTP_BASE_VAR(packet_log_writers) = 0; + SCTP_BASE_VAR(packet_log_end) = 0; + bzero(&SCTP_BASE_VAR(packet_log_buffer), SCTP_PACKET_LOG_SIZE); +#endif + + +} + +void +sctp_finish(void) +{ + sctp_pcb_finish(); +} + + + +void +sctp_pathmtu_adjustment(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net, + uint16_t nxtsz) +{ + struct sctp_tmit_chunk *chk; + uint16_t overhead; + + /* Adjust that too */ + stcb->asoc.smallest_mtu = nxtsz; + /* now off to subtract IP_DF flag if needed */ + overhead = IP_HDR_SIZE; + if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { + overhead += sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); + } + TAILQ_FOREACH(chk, &stcb->asoc.send_queue, sctp_next) { + if ((chk->send_size + overhead) > nxtsz) { + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; + } + } + TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { + if ((chk->send_size + overhead) > nxtsz) { + /* + * For this guy we also mark for immediate resend + * since we sent to big of chunk + */ + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; + if (chk->sent < SCTP_DATAGRAM_RESEND) { + sctp_flight_size_decrease(chk); + sctp_total_flight_decrease(stcb, chk); + } + if (chk->sent != SCTP_DATAGRAM_RESEND) { + sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); + } + chk->sent = SCTP_DATAGRAM_RESEND; + chk->rec.data.doing_fast_retransmit = 0; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU, + chk->whoTo->flight_size, + chk->book_size, + (uintptr_t) chk->whoTo, + chk->rec.data.TSN_seq); + } + /* Clear any time so NO RTT is being done */ + chk->do_rtt = 0; + } + } +} + +static void +sctp_notify_mbuf(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_nets *net, + struct ip *ip, + struct sctphdr *sh) +{ + struct icmp *icmph; + int totsz, tmr_stopped = 0; + uint16_t nxtsz; + + /* protection */ + if ((inp == NULL) || (stcb == NULL) || (net == NULL) || + (ip == NULL) || (sh == NULL)) { + if (stcb != NULL) { + SCTP_TCB_UNLOCK(stcb); + } + return; + } + /* First job is to verify the vtag matches what I would send */ + if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) { + SCTP_TCB_UNLOCK(stcb); + return; + } + icmph = (struct icmp 
*)((caddr_t)ip - (sizeof(struct icmp) -
+ sizeof(struct ip)));
+ if (icmph->icmp_type != ICMP_UNREACH) {
+ /* We only care about unreachable */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if (icmph->icmp_code != ICMP_UNREACH_NEEDFRAG) {
+ /* not an unreachable message due to frag. */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ totsz = ip->ip_len;
+
+ nxtsz = ntohs(icmph->icmp_nextmtu);
+ if (nxtsz == 0) {
+ /*
+ * old type router that does not tell us what the next size
+ * mtu is. Rats, we will have to guess (in an educated
+ * fashion, of course).
+ */
+ nxtsz = sctp_get_prev_mtu(totsz);
+ }
+ /* Stop any PMTU timer */
+ if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+ tmr_stopped = 1;
+ sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
+ SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
+ }
+ /* Adjust destination size limit */
+ if (net->mtu > nxtsz) {
+ net->mtu = nxtsz;
+ if (net->port) {
+ net->mtu -= sizeof(struct udphdr);
+ }
+ }
+ /* now what about the ep? */
+ if (stcb->asoc.smallest_mtu > nxtsz) {
+ sctp_pathmtu_adjustment(inp, stcb, net, nxtsz);
+ }
+ if (tmr_stopped)
+ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
+
+ SCTP_TCB_UNLOCK(stcb);
+}
+
+
+void
+sctp_notify(struct sctp_inpcb *inp,
+ struct ip *ip,
+ struct sctphdr *sh,
+ struct sockaddr *to,
+ struct sctp_tcb *stcb,
+ struct sctp_nets *net)
+{
+#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING)
+ struct socket *so;
+
+#endif
+ /* protection */
+ int reason;
+ struct icmp *icmph;
+
+
+ if ((inp == NULL) || (stcb == NULL) || (net == NULL) ||
+ (sh == NULL) || (to == NULL)) {
+ if (stcb)
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ /* First job is to verify the vtag matches what I would send */
+ if (ntohl(sh->v_tag) != (stcb->asoc.peer_vtag)) {
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ icmph = (struct icmp *)((caddr_t)ip - (sizeof(struct icmp) -
+ sizeof(struct ip)));
+ if (icmph->icmp_type != ICMP_UNREACH) {
+ /* We only care about unreachable */
+ SCTP_TCB_UNLOCK(stcb);
+ return;
+ }
+ if ((icmph->icmp_code == ICMP_UNREACH_NET) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST) ||
+ (icmph->icmp_code == ICMP_UNREACH_NET_UNKNOWN) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST_UNKNOWN) ||
+ (icmph->icmp_code == ICMP_UNREACH_ISOLATED) ||
+ (icmph->icmp_code == ICMP_UNREACH_NET_PROHIB) ||
+ (icmph->icmp_code == ICMP_UNREACH_HOST_PROHIB) ||
+ (icmph->icmp_code == ICMP_UNREACH_FILTER_PROHIB)) {
+
+ /*
+ * Hmm, reachability problems; we must examine these closely.
+ * If it is not reachable, we may have lost a network. Or, if
+ * there is NO protocol named SCTP at the other end, we
+ * consider it an OOTB abort.
+ */
+ if (net->dest_state & SCTP_ADDR_REACHABLE) {
+ /* Ok that destination is NOT reachable */
+ SCTP_PRINTF("ICMP (thresh %d/%d) takes interface %p down\n",
+ net->error_count,
+ net->failure_threshold,
+ net);
+
+ net->dest_state &= ~SCTP_ADDR_REACHABLE;
+ net->dest_state |= SCTP_ADDR_NOT_REACHABLE;
+ /*
+ * JRS 5/14/07 - If a destination is unreachable,
+ * the PF bit is turned off. This allows an
+ * unambiguous use of the PF bit for destinations
+ * that are reachable but potentially failed. If the
+ * destination is set to the unreachable state, also
+ * set the destination to the PF state.
+ */
+ /*
+ * Add debug message here if destination is not in
+ * PF state.
+ */
+ /* Stop any running T3 timers here?
*/ + if ((stcb->asoc.sctp_cmt_on_off == 1) && + (stcb->asoc.sctp_cmt_pf > 0)) { + net->dest_state &= ~SCTP_ADDR_PF; + SCTPDBG(SCTP_DEBUG_TIMER4, "Destination %p moved from PF to unreachable.\n", + net); + } + net->error_count = net->failure_threshold + 1; + sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, + stcb, SCTP_FAILED_THRESHOLD, + (void *)net, SCTP_SO_NOT_LOCKED); + } + SCTP_TCB_UNLOCK(stcb); + } else if ((icmph->icmp_code == ICMP_UNREACH_PROTOCOL) || + (icmph->icmp_code == ICMP_UNREACH_PORT)) { + /* + * Here the peer is either playing tricks on us, including + * an address that belongs to someone who does not support + * SCTP OR was a userland implementation that shutdown and + * now is dead. In either case treat it like a OOTB abort + * with no TCB + */ + reason = SCTP_PEER_FAULTY; + sctp_abort_notification(stcb, reason, SCTP_SO_NOT_LOCKED); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); + /* SCTP_TCB_UNLOCK(stcb); MT: I think this is not needed. */ +#endif + /* no need to unlock here, since the TCB is gone */ + } else { + SCTP_TCB_UNLOCK(stcb); + } +} + +void +sctp_ctlinput(cmd, sa, vip) + int cmd; + struct sockaddr *sa; + void *vip; +{ + struct ip *ip = vip; + struct sctphdr *sh; + uint32_t vrf_id; + + /* FIX, for non-bsd is this right? */ + vrf_id = SCTP_DEFAULT_VRFID; + if (sa->sa_family != AF_INET || + ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) { + return; + } + if (PRC_IS_REDIRECT(cmd)) { + ip = 0; + } else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) { + return; + } + if (ip) { + struct sctp_inpcb *inp = NULL; + struct sctp_tcb *stcb = NULL; + struct sctp_nets *net = NULL; + struct sockaddr_in to, from; + + sh = (struct sctphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + bzero(&to, sizeof(to)); + bzero(&from, sizeof(from)); + from.sin_family = to.sin_family = AF_INET; + from.sin_len = to.sin_len = sizeof(to); + from.sin_port = sh->src_port; + from.sin_addr = ip->ip_src; + to.sin_port = sh->dest_port; + to.sin_addr = ip->ip_dst; + + /* + * 'to' holds the dest of the packet that failed to be sent. + * 'from' holds our local endpoint address. Thus we reverse + * the to and the from in the lookup. + */ + stcb = sctp_findassociation_addr_sa((struct sockaddr *)&from, + (struct sockaddr *)&to, + &inp, &net, 1, vrf_id); + if (stcb != NULL && inp && (inp->sctp_socket != NULL)) { + if (cmd != PRC_MSGSIZE) { + sctp_notify(inp, ip, sh, + (struct sockaddr *)&to, stcb, + net); + } else { + /* handle possible ICMP size messages */ + sctp_notify_mbuf(inp, stcb, net, ip, sh); + } + } else { + if ((stcb == NULL) && (inp != NULL)) { + /* reduce ref-count */ + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + SCTP_INP_WUNLOCK(inp); + } + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + } + } + return; +} + +static int +sctp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct sctp_inpcb *inp; + struct sctp_nets *net; + struct sctp_tcb *stcb; + int error; + uint32_t vrf_id; + + /* FIX, for non-bsd is this right? 
*/ + vrf_id = SCTP_DEFAULT_VRFID; + + error = priv_check(req->td, PRIV_NETINET_GETCRED); + + if (error) + return (error); + + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + + stcb = sctp_findassociation_addr_sa(sintosa(&addrs[0]), + sintosa(&addrs[1]), + &inp, &net, 1, vrf_id); + if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) { + if ((inp != NULL) && (stcb == NULL)) { + /* reduce ref-count */ + SCTP_INP_WLOCK(inp); + SCTP_INP_DECR_REF(inp); + goto cred_can_cont; + } + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + error = ENOENT; + goto out; + } + SCTP_TCB_UNLOCK(stcb); + /* + * We use the write lock here, only since in the error leg we need + * it. If we used RLOCK, then we would have to + * wlock/decr/unlock/rlock. Which in theory could create a hole. + * Better to use higher wlock. + */ + SCTP_INP_WLOCK(inp); +cred_can_cont: + error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket); + if (error) { + SCTP_INP_WUNLOCK(inp); + goto out; + } + cru2x(inp->sctp_socket->so_cred, &xuc); + SCTP_INP_WUNLOCK(inp); + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); +out: + return (error); +} + +SYSCTL_PROC(_net_inet_sctp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW, + 0, 0, sctp_getcred, "S,ucred", "Get the ucred of a SCTP connection"); + + +static void +sctp_abort(struct socket *so) +{ + struct sctp_inpcb *inp; + uint32_t flags; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + return; + } +sctp_must_try_again: + flags = inp->sctp_flags; +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 17); +#endif + if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && + (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 16); +#endif + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_AFTER_CMPSET_OFCLOSE); + SOCK_LOCK(so); + SCTP_SB_CLEAR(so->so_snd); + /* + * same for the rcv ones, they are only here for the + * accounting/select. + */ + SCTP_SB_CLEAR(so->so_rcv); + + /* Now null out the reference, we are completely detached. */ + so->so_pcb = NULL; + SOCK_UNLOCK(so); + } else { + flags = inp->sctp_flags; + if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { + goto sctp_must_try_again; + } + } + return; +} + +static int +sctp_attach(struct socket *so, int proto, struct thread *p) +{ + struct sctp_inpcb *inp; + struct inpcb *ip_inp; + int error; + uint32_t vrf_id = SCTP_DEFAULT_VRFID; + +#ifdef IPSEC + uint32_t flags; + +#endif + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace)); + if (error) { + return error; + } + } + error = sctp_inpcb_alloc(so, vrf_id); + if (error) { + return error; + } + inp = (struct sctp_inpcb *)so->so_pcb; + SCTP_INP_WLOCK(inp); + inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUND_V6; /* I'm not v6! 
*/ + ip_inp = &inp->ip_inp.inp; + ip_inp->inp_vflag |= INP_IPV4; + ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl); +#ifdef IPSEC + error = ipsec_init_policy(so, &ip_inp->inp_sp); +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 17); +#endif + if (error != 0) { +try_again: + flags = inp->sctp_flags; + if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && + (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 15); +#endif + SCTP_INP_WUNLOCK(inp); + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_AFTER_CMPSET_OFCLOSE); + } else { + flags = inp->sctp_flags; + if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { + goto try_again; + } else { + SCTP_INP_WUNLOCK(inp); + } + } + return error; + } +#endif /* IPSEC */ + SCTP_INP_WUNLOCK(inp); + return 0; +} + +static int +sctp_bind(struct socket *so, struct sockaddr *addr, struct thread *p) +{ + struct sctp_inpcb *inp = NULL; + int error; + +#ifdef INET6 + if (addr && addr->sa_family != AF_INET) { + /* must be a v4 address! */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } +#endif /* INET6 */ + if (addr && (addr->sa_len != sizeof(struct sockaddr_in))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + error = sctp_inpcb_bind(so, addr, NULL, p); + return error; +} + +void +sctp_close(struct socket *so) +{ + struct sctp_inpcb *inp; + uint32_t flags; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) + return; + + /* + * Inform all the lower layer assoc that we are done. + */ +sctp_must_try_again: + flags = inp->sctp_flags; +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 17); +#endif + if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && + (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) { + if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) || + (so->so_rcv.sb_cc > 0)) { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 13); +#endif + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_AFTER_CMPSET_OFCLOSE); + } else { +#ifdef SCTP_LOG_CLOSING + sctp_log_closing(inp, NULL, 14); +#endif + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, + SCTP_CALLED_AFTER_CMPSET_OFCLOSE); + } + /* + * The socket is now detached, no matter what the state of + * the SCTP association. + */ + SOCK_LOCK(so); + SCTP_SB_CLEAR(so->so_snd); + /* + * same for the rcv ones, they are only here for the + * accounting/select. + */ + SCTP_SB_CLEAR(so->so_rcv); + + /* Now null out the reference, we are completely detached. 
*/ + so->so_pcb = NULL; + SOCK_UNLOCK(so); + } else { + flags = inp->sctp_flags; + if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { + goto sctp_must_try_again; + } + } + return; +} + + +int +sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct thread *p); + + +int +sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct thread *p) +{ + struct sctp_inpcb *inp; + int error; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + if (control) { + sctp_m_freem(control); + control = NULL; + } + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + sctp_m_freem(m); + return EINVAL; + } + /* Got to have an to address if we are NOT a connected socket */ + if ((addr == NULL) && + ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || + (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) + ) { + goto connected_type; + } else if (addr == NULL) { + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); + error = EDESTADDRREQ; + sctp_m_freem(m); + if (control) { + sctp_m_freem(control); + control = NULL; + } + return (error); + } +#ifdef INET6 + if (addr->sa_family != AF_INET) { + /* must be a v4 address! */ + SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ); + sctp_m_freem(m); + if (control) { + sctp_m_freem(control); + control = NULL; + } + error = EDESTADDRREQ; + return EDESTADDRREQ; + } +#endif /* INET6 */ +connected_type: + /* now what about control */ + if (control) { + if (inp->control) { + SCTP_PRINTF("huh? control set?\n"); + sctp_m_freem(inp->control); + inp->control = NULL; + } + inp->control = control; + } + /* Place the data */ + if (inp->pkt) { + SCTP_BUF_NEXT(inp->pkt_last) = m; + inp->pkt_last = m; + } else { + inp->pkt_last = inp->pkt = m; + } + if ( + /* FreeBSD uses a flag passed */ + ((flags & PRUS_MORETOCOME) == 0) + ) { + /* + * note with the current version this code will only be used + * by OpenBSD-- NetBSD, FreeBSD, and MacOS have methods for + * re-defining sosend to use the sctp_sosend. One can + * optionally switch back to this code (by changing back the + * definitions) but this is not advisable. This code is used + * by FreeBSD when sending a file with sendfile() though. 
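+ * With PRUS_MORETOCOME set, the mbufs are only chained onto inp->pkt
+ * above; the real sctp_output() call is deferred until the final
+ * piece arrives with the flag clear, as done just below.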
+ */ + int ret; + + ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags); + inp->pkt = NULL; + inp->control = NULL; + return (ret); + } else { + return (0); + } +} + +int +sctp_disconnect(struct socket *so) +{ + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); + return (ENOTCONN); + } + SCTP_INP_RLOCK(inp); + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + if (LIST_EMPTY(&inp->sctp_asoc_list)) { + /* No connection */ + SCTP_INP_RUNLOCK(inp); + return (0); + } else { + struct sctp_association *asoc; + struct sctp_tcb *stcb; + + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + SCTP_TCB_LOCK(stcb); + asoc = &stcb->asoc; + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* We are about to be freed, out of here */ + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + return (0); + } + if (((so->so_options & SO_LINGER) && + (so->so_linger == 0)) || + (so->so_rcv.sb_cc > 0)) { + if (SCTP_GET_STATE(asoc) != + SCTP_STATE_COOKIE_WAIT) { + /* Left with Data unread */ + struct mbuf *err; + + err = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA); + if (err) { + /* + * Fill in the user + * initiated abort + */ + struct sctp_paramhdr *ph; + + ph = mtod(err, struct sctp_paramhdr *); + SCTP_BUF_LEN(err) = sizeof(struct sctp_paramhdr); + ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(err)); + } +#if defined(SCTP_PANIC_ON_ABORT) + panic("disconnect does an abort"); +#endif + sctp_send_abort_tcb(stcb, err, SCTP_SO_LOCKED); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + } + SCTP_INP_RUNLOCK(inp); + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3); + /* No unlock tcb assoc is gone */ + return (0); + } + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->stream_queue_cnt == 0)) { + /* there is nothing queued to send, so done */ + if (asoc->locked_on_sending) { + goto abort_anyway; + } + if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && + (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* only send SHUTDOWN 1st time thru */ + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown(stcb, + stcb->asoc.primary_destination); + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED); + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, + asoc->primary_destination); + } + } else { + /* + * we still got (or just got) data to send, + * so set SHUTDOWN_PENDING + */ + /* + * XXX sockets draft says that SCTP_EOF + * should be sent with no data. 
currently, + * we will allow user data to be sent first + * and move to SHUTDOWN-PENDING + */ + asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + if (asoc->locked_on_sending) { + /* Locked to send out the data */ + struct sctp_stream_queue_pending *sp; + + sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); + if (sp == NULL) { + SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", + asoc->locked_on_sending->stream_no); + } else { + if ((sp->length == 0) && (sp->msg_is_complete == 0)) + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + } + } + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + struct mbuf *op_err; + + abort_anyway: + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + /* + * Fill in the user + * initiated abort + */ + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(op_err) = + (sizeof(struct sctp_paramhdr) + sizeof(uint32_t)); + ph = mtod(op_err, + struct sctp_paramhdr *); + ph->param_type = htons( + SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4); + } +#if defined(SCTP_PANIC_ON_ABORT) + panic("disconnect does an abort"); +#endif + + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4; + sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_INP_RUNLOCK(inp); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5); + return (0); + } else { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); + } + } + soisdisconnecting(so); + SCTP_TCB_UNLOCK(stcb); + SCTP_INP_RUNLOCK(inp); + return (0); + } + /* not reached */ + } else { + /* UDP model does not support this */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + return EOPNOTSUPP; + } +} + +int +sctp_flush(struct socket *so, int how) +{ + /* + * We will just clear out the values and let subsequent close clear + * out the data, if any. Note if the user did a shutdown(SHUT_RD) + * they will not be able to read the data, the socket will block + * that from happening. 
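+ * (SCTP_PCB_FLAGS_SOCKET_CANT_READ, set below for the read-side
+ * flush, is what enforces this on the inp once the receive buffer
+ * has been cleared.)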
+ */ + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + SCTP_INP_RLOCK(inp); + /* For the 1 to many model this does nothing */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { + SCTP_INP_RUNLOCK(inp); + return (0); + } + SCTP_INP_RUNLOCK(inp); + if ((how == PRU_FLUSH_RD) || (how == PRU_FLUSH_RDWR)) { + /* + * First make sure the sb will be happy, we don't use these + * except maybe the count + */ + SCTP_INP_WLOCK(inp); + SCTP_INP_READ_LOCK(inp); + inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ; + SCTP_INP_READ_UNLOCK(inp); + SCTP_INP_WUNLOCK(inp); + so->so_rcv.sb_cc = 0; + so->so_rcv.sb_mbcnt = 0; + so->so_rcv.sb_mb = NULL; + } + if ((how == PRU_FLUSH_WR) || (how == PRU_FLUSH_RDWR)) { + /* + * First make sure the sb will be happy, we don't use these + * except maybe the count + */ + so->so_snd.sb_cc = 0; + so->so_snd.sb_mbcnt = 0; + so->so_snd.sb_mb = NULL; + + } + return (0); +} + +int +sctp_shutdown(struct socket *so) +{ + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + SCTP_INP_RLOCK(inp); + /* For UDP model this is a invalid call */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { + /* Restore the flags that the soshutdown took away. */ + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state &= ~SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + /* This proc will wakeup for read and do nothing (I hope) */ + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + return (EOPNOTSUPP); + } + /* + * Ok if we reach here its the TCP model and it is either a SHUT_WR + * or SHUT_RDWR. This means we put the shutdown flag against it. + */ + { + struct sctp_tcb *stcb; + struct sctp_association *asoc; + + if ((so->so_state & + (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { + SCTP_INP_RUNLOCK(inp); + return (ENOTCONN); + } + socantsendmore(so); + + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + /* + * Ok we hit the case that the shutdown call was + * made after an abort or something. Nothing to do + * now. + */ + SCTP_INP_RUNLOCK(inp); + return (0); + } + SCTP_TCB_LOCK(stcb); + asoc = &stcb->asoc; + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->stream_queue_cnt == 0)) { + if (asoc->locked_on_sending) { + goto abort_anyway; + } + /* there is nothing queued to send, so I'm done... 
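+ * Graceful close from here: send SHUTDOWN once, arm the shutdown and
+ * shutdown-guard timers, and move the association to SHUTDOWN-SENT,
+ * which is what the block below does.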
*/ + if (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) { + /* only send SHUTDOWN the first time through */ + sctp_stop_timers_for_shutdown(stcb); + sctp_send_shutdown(stcb, + stcb->asoc.primary_destination); + sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED); + if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); + SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, + stcb->sctp_ep, stcb, + asoc->primary_destination); + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, + stcb->sctp_ep, stcb, + asoc->primary_destination); + } + } else { + /* + * we still got (or just got) data to send, so set + * SHUTDOWN_PENDING + */ + asoc->state |= SCTP_STATE_SHUTDOWN_PENDING; + sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, + asoc->primary_destination); + + if (asoc->locked_on_sending) { + /* Locked to send out the data */ + struct sctp_stream_queue_pending *sp; + + sp = TAILQ_LAST(&asoc->locked_on_sending->outqueue, sctp_streamhead); + if (sp == NULL) { + SCTP_PRINTF("Error, sp is NULL, locked on sending is non-null strm:%d\n", + asoc->locked_on_sending->stream_no); + } else { + if ((sp->length == 0) && (sp->msg_is_complete == 0)) { + asoc->state |= SCTP_STATE_PARTIAL_MSG_LEFT; + } + } + } + if (TAILQ_EMPTY(&asoc->send_queue) && + TAILQ_EMPTY(&asoc->sent_queue) && + (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { + struct mbuf *op_err; + + abort_anyway: + op_err = sctp_get_mbuf_for_msg((sizeof(struct sctp_paramhdr) + sizeof(uint32_t)), + 0, M_DONTWAIT, 1, MT_DATA); + if (op_err) { + /* Fill in the user initiated abort */ + struct sctp_paramhdr *ph; + uint32_t *ippp; + + SCTP_BUF_LEN(op_err) = + sizeof(struct sctp_paramhdr) + sizeof(uint32_t); + ph = mtod(op_err, + struct sctp_paramhdr *); + ph->param_type = htons( + SCTP_CAUSE_USER_INITIATED_ABT); + ph->param_length = htons(SCTP_BUF_LEN(op_err)); + ippp = (uint32_t *) (ph + 1); + *ippp = htonl(SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6); + } +#if defined(SCTP_PANIC_ON_ABORT) + panic("shutdown does an abort"); +#endif + stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6; + sctp_abort_an_association(stcb->sctp_ep, stcb, + SCTP_RESPONSE_TO_USER_REQ, + op_err, SCTP_SO_LOCKED); + goto skip_unlock; + } else { + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); + } + } + SCTP_TCB_UNLOCK(stcb); + } +skip_unlock: + SCTP_INP_RUNLOCK(inp); + return 0; +} + +/* + * copies a "user" presentable address and removes embedded scope, etc. 
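+ * (for IPv6 link-local addresses this means moving the
+ * kernel-embedded scope into sin6_scope_id via sctp_recover_scope())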
+ * returns 0 on success, 1 on error + */ +static uint32_t +sctp_fill_user_address(struct sockaddr_storage *ss, struct sockaddr *sa) +{ +#ifdef INET6 + struct sockaddr_in6 lsa6; + + sa = (struct sockaddr *)sctp_recover_scope((struct sockaddr_in6 *)sa, + &lsa6); +#endif + memcpy(ss, sa, sa->sa_len); + return (0); +} + + + +/* + * NOTE: assumes addr lock is held + */ +static size_t +sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + size_t limit, + struct sockaddr_storage *sas, + uint32_t vrf_id) +{ + struct sctp_ifn *sctp_ifn; + struct sctp_ifa *sctp_ifa; + int loopback_scope, ipv4_local_scope, local_scope, site_scope; + size_t actual; + int ipv4_addr_legal, ipv6_addr_legal; + struct sctp_vrf *vrf; + + actual = 0; + if (limit <= 0) + return (actual); + + if (stcb) { + /* Turn on all the appropriate scope */ + loopback_scope = stcb->asoc.loopback_scope; + ipv4_local_scope = stcb->asoc.ipv4_local_scope; + local_scope = stcb->asoc.local_scope; + site_scope = stcb->asoc.site_scope; + } else { + /* Turn on ALL scope, since we look at the EP */ + loopback_scope = ipv4_local_scope = local_scope = + site_scope = 1; + } + ipv4_addr_legal = ipv6_addr_legal = 0; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ipv6_addr_legal = 1; + if (SCTP_IPV6_V6ONLY(inp) == 0) { + ipv4_addr_legal = 1; + } + } else { + ipv4_addr_legal = 1; + } + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { + return (0); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { + if ((loopback_scope == 0) && + SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { + /* Skip loopback if loopback_scope not set */ + continue; + } + LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { + if (stcb) { + /* + * For the BOUND-ALL case, the list + * associated with a TCB is Always + * considered a reverse list.. i.e. + * it lists addresses that are NOT + * part of the association. If this + * is one of those we must skip it. 
+				 */
+				if (sctp_is_addr_restricted(stcb,
+				    sctp_ifa)) {
+					continue;
+				}
+			}
+			switch (sctp_ifa->address.sa.sa_family) {
+			case AF_INET:
+				if (ipv4_addr_legal) {
+					struct sockaddr_in *sin;
+
+					sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+					if (sin->sin_addr.s_addr == 0) {
+						/*
+						 * we skip unspecified
+						 * addresses
+						 */
+						continue;
+					}
+					if ((ipv4_local_scope == 0) &&
+					    (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+						continue;
+					}
+#ifdef INET6
+					if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
+						in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)sas);
+						((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+						sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(struct sockaddr_in6));
+						actual += sizeof(struct sockaddr_in6);
+					} else {
+#endif
+						memcpy(sas, sin, sizeof(*sin));
+						((struct sockaddr_in *)sas)->sin_port = inp->sctp_lport;
+						sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin));
+						actual += sizeof(*sin);
+#ifdef INET6
+					}
+#endif
+					if (actual >= limit) {
+						return (actual);
+					}
+				} else {
+					continue;
+				}
+				break;
+#ifdef INET6
+			case AF_INET6:
+				if (ipv6_addr_legal) {
+					struct sockaddr_in6 *sin6;
+
+					sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+					if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+						/*
+						 * we skip unspecified
+						 * addresses
+						 */
+						continue;
+					}
+					if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+						if (local_scope == 0)
+							continue;
+						if (sin6->sin6_scope_id == 0) {
+							if (sa6_recoverscope(sin6) != 0)
+								/* bad link local address */
+								continue;
+						}
+					}
+					if ((site_scope == 0) &&
+					    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+						continue;
+					}
+					memcpy(sas, sin6, sizeof(*sin6));
+					((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+					sas = (struct sockaddr_storage *)((caddr_t)sas + sizeof(*sin6));
+					actual += sizeof(*sin6);
+					if (actual >= limit) {
+						return (actual);
+					}
+				} else {
+					continue;
+				}
+				break;
+#endif
+			default:
+				/* TSNH */
+				break;
+			}
+			}
+		}
+	} else {
+		struct sctp_laddr *laddr;
+
+		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+			if (stcb) {
+				if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
+					continue;
+				}
+			}
+			if (sctp_fill_user_address(sas, &laddr->ifa->address.sa))
+				continue;
+
+			((struct sockaddr_in6 *)sas)->sin6_port = inp->sctp_lport;
+			sas = (struct sockaddr_storage *)((caddr_t)sas +
+			    laddr->ifa->address.sa.sa_len);
+			actual += laddr->ifa->address.sa.sa_len;
+			if (actual >= limit) {
+				return (actual);
+			}
+		}
+	}
+	return (actual);
+}
+
+static size_t
+sctp_fill_up_addresses(struct sctp_inpcb *inp,
+    struct sctp_tcb *stcb,
+    size_t limit,
+    struct sockaddr_storage *sas)
+{
+	size_t size = 0;
+
+	SCTP_IPI_ADDR_RLOCK();
+	/* fill up addresses for the endpoint's default vrf */
+	size = sctp_fill_up_addresses_vrf(inp, stcb, limit, sas,
+	    inp->def_vrf_id);
+	SCTP_IPI_ADDR_RUNLOCK();
+	return (size);
+}
+
+/*
+ * NOTE: assumes addr lock is held
+ */
+static int
+sctp_count_max_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id)
+{
+	int cnt = 0;
+	struct sctp_vrf *vrf = NULL;
+
+	/*
+	 * In both the sub-set bound and bound-all cases we return the
+	 * MAXIMUM number of addresses that you COULD get. In reality the
+	 * sub-set bound may have an exclusion list for a given TCB OR in
+	 * the bound-all case a TCB may NOT include the loopback or other
+	 * addresses as well.
+	 */
+	vrf = sctp_find_vrf(vrf_id);
+	if (vrf == NULL) {
+		return (0);
+	}
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+		struct sctp_ifn *sctp_ifn;
+		struct sctp_ifa *sctp_ifa;
+
+		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+				/* Count them if they are the right type */
+				if (sctp_ifa->address.sa.sa_family == AF_INET) {
+					if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
+						cnt += sizeof(struct sockaddr_in6);
+					else
+						cnt += sizeof(struct sockaddr_in);
+
+				} else if (sctp_ifa->address.sa.sa_family == AF_INET6)
+					cnt += sizeof(struct sockaddr_in6);
+			}
+		}
+	} else {
+		struct sctp_laddr *laddr;
+
+		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
+			if (laddr->ifa->address.sa.sa_family == AF_INET) {
+				if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
+					cnt += sizeof(struct sockaddr_in6);
+				else
+					cnt += sizeof(struct sockaddr_in);
+
+			} else if (laddr->ifa->address.sa.sa_family == AF_INET6)
+				cnt += sizeof(struct sockaddr_in6);
+		}
+	}
+	return (cnt);
+}
+
+static int
+sctp_count_max_addresses(struct sctp_inpcb *inp)
+{
+	int cnt = 0;
+
+	SCTP_IPI_ADDR_RLOCK();
+	/* count addresses for the endpoint's default VRF */
+	cnt = sctp_count_max_addresses_vrf(inp, inp->def_vrf_id);
+	SCTP_IPI_ADDR_RUNLOCK();
+	return (cnt);
+}
+
+static int
+sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
+    size_t optsize, void *p, int delay)
+{
+	int error = 0;
+	int creat_lock_on = 0;
+	struct sctp_tcb *stcb = NULL;
+	struct sockaddr *sa;
+	int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr;
+	int added = 0;
+	uint32_t vrf_id;
+	int bad_addresses = 0;
+	sctp_assoc_t *a_id;
+
+	SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n");
+
+	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
+		/* We are already connected AND the TCP model */
+		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+		return (EADDRINUSE);
+	}
+	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
+	    (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) {
+		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+		return (EINVAL);
+	}
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
+		SCTP_INP_RLOCK(inp);
+		stcb = LIST_FIRST(&inp->sctp_asoc_list);
+		SCTP_INP_RUNLOCK(inp);
+	}
+	if (stcb) {
+		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+		return (EALREADY);
+	}
+	SCTP_INP_INCR_REF(inp);
+	SCTP_ASOC_CREATE_LOCK(inp);
+	creat_lock_on = 1;
+	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
+	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
+		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT);
+		error = EFAULT;
+		goto out_now;
+	}
+	totaddrp = (int *)optval;
+	totaddr = *totaddrp;
+	sa = (struct sockaddr *)(totaddrp + 1);
+	stcb = sctp_connectx_helper_find(inp, sa, &totaddr, &num_v4, &num_v6, &error, (optsize - sizeof(int)), &bad_addresses);
+	if ((stcb != NULL) || bad_addresses) {
+		/* Already have or am bringing up an association */
+		SCTP_ASOC_CREATE_UNLOCK(inp);
+		creat_lock_on = 0;
+		if (stcb)
+			SCTP_TCB_UNLOCK(stcb);
+		if (bad_addresses == 0) {
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
+			error = EALREADY;
+		}
+		goto out_now;
+	}
+#ifdef INET6
+	if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
+	    (num_v6 > 0)) {
+		error = EINVAL;
+		goto out_now;
+	}
+	if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+	    (num_v4 > 0)) {
+		struct in6pcb *inp6;
+
+		inp6 = (struct
in6pcb *)inp; + if (SCTP_IPV6_V6ONLY(inp6)) { + /* + * if IPV6_V6ONLY flag, ignore connections destined + * to a v4 addr or v4-mapped addr + */ + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto out_now; + } + } +#endif /* INET6 */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == + SCTP_PCB_FLAGS_UNBOUND) { + /* Bind a ephemeral port */ + error = sctp_inpcb_bind(so, NULL, NULL, p); + if (error) { + goto out_now; + } + } + /* FIX ME: do we want to pass in a vrf on the connect call? */ + vrf_id = inp->def_vrf_id; + + + /* We are GOOD to go */ + stcb = sctp_aloc_assoc(inp, sa, &error, 0, vrf_id, + (struct thread *)p + ); + if (stcb == NULL) { + /* Gak! no memory */ + goto out_now; + } + SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + /* move to second address */ + if (sa->sa_family == AF_INET) + sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in)); + else + sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in6)); + + error = 0; + added = sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error); + /* Fill in the return id */ + if (error) { + (void)sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6); + goto out_now; + } + a_id = (sctp_assoc_t *) optval; + *a_id = sctp_get_associd(stcb); + + /* initialize authentication parameters for the assoc */ + sctp_initialize_auth_params(inp, stcb); + + if (delay) { + /* doing delayed connection */ + stcb->asoc.delayed_connection = 1; + sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination); + } else { + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); + sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); + } + SCTP_TCB_UNLOCK(stcb); + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { + stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; + /* Set the connected flag so we can queue data */ + soisconnecting(so); + } +out_now: + if (creat_lock_on) { + SCTP_ASOC_CREATE_UNLOCK(inp); + } + SCTP_INP_DECR_REF(inp); + return error; +} + +#define SCTP_FIND_STCB(inp, stcb, assoc_id) { \ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||\ + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { \ + SCTP_INP_RLOCK(inp); \ + stcb = LIST_FIRST(&inp->sctp_asoc_list); \ + if (stcb) { \ + SCTP_TCB_LOCK(stcb); \ + } \ + SCTP_INP_RUNLOCK(inp); \ + } else if (assoc_id != 0) { \ + stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \ + if (stcb == NULL) { \ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \ + error = ENOENT; \ + break; \ + } \ + } else { \ + stcb = NULL; \ + } \ + } + + +#define SCTP_CHECK_AND_CAST(destp, srcp, type, size) {\ + if (size < sizeof(type)) { \ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); \ + error = EINVAL; \ + break; \ + } else { \ + destp = (type *)srcp; \ + } \ + } + +static int +sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize, + void *p) +{ + struct sctp_inpcb *inp = NULL; + int error, val = 0; + struct sctp_tcb *stcb = NULL; + + if (optval == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + error = 0; + + switch (optname) { + case SCTP_NODELAY: + case SCTP_AUTOCLOSE: + case SCTP_EXPLICIT_EOR: + case SCTP_AUTO_ASCONF: + case SCTP_DISABLE_FRAGMENTS: + case SCTP_I_WANT_MAPPED_V4_ADDR: + case SCTP_USE_EXT_RCVINFO: + 
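+		/*
+		 * These names are boolean-style endpoint features that share
+		 * one code path: the inner switch below maps each option to
+		 * its SCTP_PCB_FLAGS_* feature bit, and every result except
+		 * SCTP_AUTOCLOSE (which reports a time in seconds) is
+		 * normalized to a plain on/off value, so userland can treat
+		 * them uniformly, e.g. (illustrative sketch only):
+		 *
+		 *	int on;
+		 *	socklen_t len = sizeof(on);
+		 *	getsockopt(fd, IPPROTO_SCTP, SCTP_NODELAY, &on, &len);
+		 */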
SCTP_INP_RLOCK(inp); + switch (optname) { + case SCTP_DISABLE_FRAGMENTS: + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT); + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4); + break; + case SCTP_AUTO_ASCONF: + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* only valid for bound all sockets */ + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto flags_out; + } + break; + case SCTP_EXPLICIT_EOR: + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); + break; + case SCTP_NODELAY: + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY); + break; + case SCTP_USE_EXT_RCVINFO: + val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO); + break; + case SCTP_AUTOCLOSE: + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) + val = TICKS_TO_SEC(inp->sctp_ep.auto_close_time); + else + val = 0; + break; + + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); + error = ENOPROTOOPT; + } /* end switch (sopt->sopt_name) */ + if (optname != SCTP_AUTOCLOSE) { + /* make it an "on/off" value */ + val = (val != 0); + } + if (*optsize < sizeof(val)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } +flags_out: + SCTP_INP_RUNLOCK(inp); + if (error == 0) { + /* return the option value */ + *(int *)optval = val; + *optsize = sizeof(val); + } + break; + case SCTP_GET_PACKET_LOG: + { +#ifdef SCTP_PACKET_LOGGING + uint8_t *target; + int ret; + + SCTP_CHECK_AND_CAST(target, optval, uint8_t, *optsize); + ret = sctp_copy_out_packet_log(target, (int)*optsize); + *optsize = ret; +#else + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + error = EOPNOTSUPP; +#endif + break; + } + case SCTP_REUSE_PORT: + { + uint32_t *value; + + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) { + /* Can't do this for a 1-m socket */ + error = EINVAL; + break; + } + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + *value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE); + *optsize = sizeof(uint32_t); + } + break; + case SCTP_PARTIAL_DELIVERY_POINT: + { + uint32_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + *value = inp->partial_delivery_point; + *optsize = sizeof(uint32_t); + } + break; + case SCTP_FRAGMENT_INTERLEAVE: + { + uint32_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) { + *value = SCTP_FRAG_LEVEL_2; + } else { + *value = SCTP_FRAG_LEVEL_1; + } + } else { + *value = SCTP_FRAG_LEVEL_0; + } + *optsize = sizeof(uint32_t); + } + break; + case SCTP_CMT_ON_OFF: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + if (stcb) { + av->assoc_value = stcb->asoc.sctp_cmt_on_off; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_cmt_on_off; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*av); + } + break; + /* JRS - Get socket option for pluggable congestion control */ + case SCTP_PLUGGABLE_CC: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + if (stcb) { + av->assoc_value = stcb->asoc.congestion_control_module; + 
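+				/*
+				 * assoc_value now names the congestion
+				 * control module in use (SCTP_CC_RFC2581,
+				 * SCTP_CC_HSTCP or SCTP_CC_HTCP; see the
+				 * matching SCTP_PLUGGABLE_CC setter in
+				 * sctp_setopt() below).
+				 */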
SCTP_TCB_UNLOCK(stcb); + } else { + av->assoc_value = inp->sctp_ep.sctp_default_cc_module; + } + *optsize = sizeof(*av); + } + break; + case SCTP_GET_ADDR_LEN: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + error = EINVAL; +#ifdef INET + if (av->assoc_value == AF_INET) { + av->assoc_value = sizeof(struct sockaddr_in); + error = 0; + } +#endif +#ifdef INET6 + if (av->assoc_value == AF_INET6) { + av->assoc_value = sizeof(struct sockaddr_in6); + error = 0; + } +#endif + if (error) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } + *optsize = sizeof(*av); + } + break; + case SCTP_GET_ASSOC_NUMBER: + { + uint32_t *value, cnt; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + cnt = 0; + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + cnt++; + } + SCTP_INP_RUNLOCK(inp); + *value = cnt; + *optsize = sizeof(uint32_t); + } + break; + + case SCTP_GET_ASSOC_ID_LIST: + { + struct sctp_assoc_ids *ids; + unsigned int at, limit; + + SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize); + at = 0; + limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t); + SCTP_INP_RLOCK(inp); + LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { + if (at < limit) { + ids->gaids_assoc_id[at++] = sctp_get_associd(stcb); + } else { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } + } + SCTP_INP_RUNLOCK(inp); + ids->gaids_number_of_ids = at; + *optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t)); + } + break; + case SCTP_CONTEXT: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = stcb->asoc.context; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + av->assoc_value = inp->sctp_context; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*av); + } + break; + case SCTP_VRF_ID: + { + uint32_t *default_vrfid; + + SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize); + *default_vrfid = inp->def_vrf_id; + break; + } + case SCTP_GET_ASOC_VRF: + { + struct sctp_assoc_value *id; + + SCTP_CHECK_AND_CAST(id, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, id->assoc_id); + if (stcb == NULL) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } + id->assoc_value = stcb->asoc.vrf_id; + break; + } + case SCTP_GET_VRF_IDS: + { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + error = EOPNOTSUPP; + break; + } + case SCTP_GET_NONCE_VALUES: + { + struct sctp_get_nonce_values *gnv; + + SCTP_CHECK_AND_CAST(gnv, optval, struct sctp_get_nonce_values, *optsize); + SCTP_FIND_STCB(inp, stcb, gnv->gn_assoc_id); + + if (stcb) { + gnv->gn_peers_tag = stcb->asoc.peer_vtag; + gnv->gn_local_tag = stcb->asoc.my_vtag; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); + error = ENOTCONN; + } + *optsize = sizeof(*gnv); + } + break; + case SCTP_DELAYED_SACK: + { + struct sctp_sack_info *sack; + + SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, *optsize); + SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); + if (stcb) { + sack->sack_delay = stcb->asoc.delayed_ack; + sack->sack_freq = stcb->asoc.sack_freq; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + sack->sack_delay = 
TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); + sack->sack_freq = inp->sctp_ep.sctp_sack_freq; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*sack); + } + break; + + case SCTP_GET_SNDBUF_USE: + { + struct sctp_sockstat *ss; + + SCTP_CHECK_AND_CAST(ss, optval, struct sctp_sockstat, *optsize); + SCTP_FIND_STCB(inp, stcb, ss->ss_assoc_id); + + if (stcb) { + ss->ss_total_sndbuf = stcb->asoc.total_output_queue_size; + ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue + + stcb->asoc.size_on_all_streams); + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); + error = ENOTCONN; + } + *optsize = sizeof(struct sctp_sockstat); + } + break; + case SCTP_MAX_BURST: + { + uint8_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint8_t, *optsize); + + SCTP_INP_RLOCK(inp); + *value = inp->sctp_ep.max_burst; + SCTP_INP_RUNLOCK(inp); + *optsize = sizeof(uint8_t); + } + break; + case SCTP_MAXSEG: + { + struct sctp_assoc_value *av; + int ovh; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + av->assoc_value = sctp_get_frag_point(stcb, &stcb->asoc); + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + if (inp->sctp_frag_point >= SCTP_DEFAULT_MAXSEGMENT) + av->assoc_value = 0; + else + av->assoc_value = inp->sctp_frag_point - ovh; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(struct sctp_assoc_value); + } + break; + case SCTP_GET_STAT_LOG: + error = sctp_fill_stat_log(optval, optsize); + break; + case SCTP_EVENTS: + { + struct sctp_event_subscribe *events; + + SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize); + memset(events, 0, sizeof(*events)); + SCTP_INP_RLOCK(inp); + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT)) + events->sctp_data_io_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT)) + events->sctp_association_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT)) + events->sctp_address_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) + events->sctp_send_failure_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR)) + events->sctp_peer_error_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) + events->sctp_shutdown_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT)) + events->sctp_partial_delivery_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) + events->sctp_adaptation_layer_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT)) + events->sctp_authentication_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT)) + events->sctp_sender_dry_event = 1; + + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) + events->sctp_stream_reset_event = 1; + SCTP_INP_RUNLOCK(inp); + *optsize = sizeof(struct sctp_event_subscribe); + } + break; + + case SCTP_ADAPTATION_LAYER: + { + uint32_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + + SCTP_INP_RLOCK(inp); + *value = inp->sctp_ep.adaptation_layer_indicator; + SCTP_INP_RUNLOCK(inp); + *optsize = sizeof(uint32_t); + } + break; + case SCTP_SET_INITIAL_DBG_SEQ: + { + uint32_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + SCTP_INP_RLOCK(inp); + *value = inp->sctp_ep.initial_sequence_debug; + 
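+			/*
+			 * Debug aid: when this endpoint value is non-zero,
+			 * new associations appear to draw their initial TSN
+			 * from this counter rather than from a random value
+			 * (see sctp_select_initial_TSN() in sctp_pcb.c).
+			 */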
SCTP_INP_RUNLOCK(inp); + *optsize = sizeof(uint32_t); + } + break; + case SCTP_GET_LOCAL_ADDR_SIZE: + { + uint32_t *value; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + SCTP_INP_RLOCK(inp); + *value = sctp_count_max_addresses(inp); + SCTP_INP_RUNLOCK(inp); + *optsize = sizeof(uint32_t); + } + break; + case SCTP_GET_REMOTE_ADDR_SIZE: + { + uint32_t *value; + size_t size; + struct sctp_nets *net; + + SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize); + /* FIXME MT: change to sctp_assoc_value? */ + SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t) * value); + + if (stcb) { + size = 0; + /* Count the sizes */ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) || + (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET6)) { + size += sizeof(struct sockaddr_in6); + } else if (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET) { + size += sizeof(struct sockaddr_in); + } else { + /* huh */ + break; + } + } + SCTP_TCB_UNLOCK(stcb); + *value = (uint32_t) size; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); + error = ENOTCONN; + } + *optsize = sizeof(uint32_t); + } + break; + case SCTP_GET_PEER_ADDRESSES: + /* + * Get the address information, an array is passed in to + * fill up we pack it. + */ + { + size_t cpsz, left; + struct sockaddr_storage *sas; + struct sctp_nets *net; + struct sctp_getaddresses *saddr; + + SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize); + SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id); + + if (stcb) { + left = (*optsize) - sizeof(struct sctp_getaddresses); + *optsize = sizeof(struct sctp_getaddresses); + sas = (struct sockaddr_storage *)&saddr->addr[0]; + + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) || + (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET6)) { + cpsz = sizeof(struct sockaddr_in6); + } else if (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET) { + cpsz = sizeof(struct sockaddr_in); + } else { + /* huh */ + break; + } + if (left < cpsz) { + /* not enough room. 
+					 */
+					break;
+				}
+#ifdef INET6
+				if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
+				    (((struct sockaddr *)&net->ro._l_addr)->sa_family == AF_INET)) {
+					/* Must map the address */
+					in6_sin_2_v4mapsin6((struct sockaddr_in *)&net->ro._l_addr,
+					    (struct sockaddr_in6 *)sas);
+				} else {
+#endif
+					memcpy(sas, &net->ro._l_addr, cpsz);
+#ifdef INET6
+				}
+#endif
+				((struct sockaddr_in *)sas)->sin_port = stcb->rport;
+
+				sas = (struct sockaddr_storage *)((caddr_t)sas + cpsz);
+				left -= cpsz;
+				*optsize += cpsz;
+			}
+			SCTP_TCB_UNLOCK(stcb);
+		} else {
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
+			error = ENOENT;
+		}
+		}
+		break;
+	case SCTP_GET_LOCAL_ADDRESSES:
+		{
+			size_t limit, actual;
+			struct sockaddr_storage *sas;
+			struct sctp_getaddresses *saddr;
+
+			SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize);
+			SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id);
+
+			sas = (struct sockaddr_storage *)&saddr->addr[0];
+			limit = *optsize - sizeof(sctp_assoc_t);
+			actual = sctp_fill_up_addresses(inp, stcb, limit, sas);
+			if (stcb) {
+				SCTP_TCB_UNLOCK(stcb);
+			}
+			*optsize = sizeof(struct sockaddr_storage) + actual;
+		}
+		break;
+	case SCTP_PEER_ADDR_PARAMS:
+		{
+			struct sctp_paddrparams *paddrp;
+			struct sctp_nets *net;
+
+			SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize);
+			SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
+
+			net = NULL;
+			if (stcb) {
+				net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address);
+			} else {
+				/*
+				 * We increment here since
+				 * sctp_findassociation_ep_addr() will do a
+				 * decrement if it finds the stcb as long as
+				 * the locked tcb (last argument) is NOT a
+				 * TCB.. aka NULL.
+				 */
+				SCTP_INP_INCR_REF(inp);
+				stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddrp->spp_address, &net, NULL, NULL);
+				if (stcb == NULL) {
+					SCTP_INP_DECR_REF(inp);
+				}
+			}
+			if (stcb && (net == NULL)) {
+				struct sockaddr *sa;
+
+				sa = (struct sockaddr *)&paddrp->spp_address;
+				if (sa->sa_family == AF_INET) {
+					struct sockaddr_in *sin;
+
+					sin = (struct sockaddr_in *)sa;
+					if (sin->sin_addr.s_addr) {
+						error = EINVAL;
+						SCTP_TCB_UNLOCK(stcb);
+						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+						break;
+					}
+				} else if (sa->sa_family == AF_INET6) {
+					struct sockaddr_in6 *sin6;
+
+					sin6 = (struct sockaddr_in6 *)sa;
+					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+						error = EINVAL;
+						SCTP_TCB_UNLOCK(stcb);
+						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+						break;
+					}
+				} else {
+					error = EAFNOSUPPORT;
+					SCTP_TCB_UNLOCK(stcb);
+					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
+					break;
+				}
+			}
+			if (stcb) {
+				/* Applies to the specific association */
+				paddrp->spp_flags = 0;
+				if (net) {
+					int ovh;
+
+					if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+						ovh = SCTP_MED_OVERHEAD;
+					} else {
+						ovh = SCTP_MED_V4_OVERHEAD;
+					}
+
+
+					paddrp->spp_pathmaxrxt = net->failure_threshold;
+					paddrp->spp_pathmtu = net->mtu - ovh;
+					/* get flags for HB */
+					if (net->dest_state & SCTP_ADDR_NOHB)
+						paddrp->spp_flags |= SPP_HB_DISABLE;
+					else
+						paddrp->spp_flags |= SPP_HB_ENABLE;
+					/* get flags for PMTU */
+					if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
+						paddrp->spp_flags |= SPP_PMTUD_ENABLE;
+					} else {
+						paddrp->spp_flags |= SPP_PMTUD_DISABLE;
+					}
+#ifdef INET
+					if (net->ro._l_addr.sin.sin_family == AF_INET) {
+						paddrp->spp_ipv4_tos = net->tos_flowlabel & 0x000000fc;
+						paddrp->spp_flags |= SPP_IPV4_TOS;
+					}
+#endif
+#ifdef INET6
+					if
(net->ro._l_addr.sin6.sin6_family == AF_INET6) { + paddrp->spp_ipv6_flowlabel = net->tos_flowlabel; + paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; + } +#endif + } else { + /* + * No destination so return default + * value + */ + int cnt = 0; + + paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure; + paddrp->spp_pathmtu = sctp_get_frag_point(stcb, &stcb->asoc); +#ifdef INET + paddrp->spp_ipv4_tos = stcb->asoc.default_tos & 0x000000fc; + paddrp->spp_flags |= SPP_IPV4_TOS; +#endif +#ifdef INET6 + paddrp->spp_ipv6_flowlabel = stcb->asoc.default_flowlabel; + paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; +#endif + /* default settings should be these */ + if (stcb->asoc.hb_is_disabled == 0) { + paddrp->spp_flags |= SPP_HB_ENABLE; + } else { + paddrp->spp_flags |= SPP_HB_DISABLE; + } + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + cnt++; + } + } + if (cnt) { + paddrp->spp_flags |= SPP_PMTUD_ENABLE; + } + } + paddrp->spp_hbinterval = stcb->asoc.heart_beat_delay; + paddrp->spp_assoc_id = sctp_get_associd(stcb); + SCTP_TCB_UNLOCK(stcb); + } else { + /* Use endpoint defaults */ + SCTP_INP_RLOCK(inp); + paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure; + paddrp->spp_hbinterval = TICKS_TO_MSEC(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); + paddrp->spp_assoc_id = (sctp_assoc_t) 0; + /* get inp's default */ +#ifdef INET + paddrp->spp_ipv4_tos = inp->ip_inp.inp.inp_ip_tos; + paddrp->spp_flags |= SPP_IPV4_TOS; +#endif +#ifdef INET6 + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + paddrp->spp_ipv6_flowlabel = ((struct in6pcb *)inp)->in6p_flowinfo; + paddrp->spp_flags |= SPP_IPV6_FLOWLABEL; + } +#endif + /* can't return this */ + paddrp->spp_pathmtu = 0; + + /* default behavior, no stcb */ + paddrp->spp_flags = SPP_PMTUD_ENABLE; + + if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { + paddrp->spp_flags |= SPP_HB_ENABLE; + } else { + paddrp->spp_flags |= SPP_HB_DISABLE; + } + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(struct sctp_paddrparams); + } + break; + case SCTP_GET_PEER_ADDR_INFO: + { + struct sctp_paddrinfo *paddri; + struct sctp_nets *net; + + SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id); + + net = NULL; + if (stcb) { + net = sctp_findnet(stcb, (struct sockaddr *)&paddri->spinfo_address); + } else { + /* + * We increment here since + * sctp_findassociation_ep_addr() wil do a + * decrement if it finds the stcb as long as + * the locked tcb (last argument) is NOT a + * TCB.. aka NULL. 
+ */ + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, (struct sockaddr *)&paddri->spinfo_address, &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_DECR_REF(inp); + } + } + + if ((stcb) && (net)) { + paddri->spinfo_state = net->dest_state & (SCTP_REACHABLE_MASK | SCTP_ADDR_NOHB); + paddri->spinfo_cwnd = net->cwnd; + paddri->spinfo_srtt = ((net->lastsa >> 2) + net->lastsv) >> 1; + paddri->spinfo_rto = net->RTO; + paddri->spinfo_assoc_id = sctp_get_associd(stcb); + SCTP_TCB_UNLOCK(stcb); + } else { + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + error = ENOENT; + } + *optsize = sizeof(struct sctp_paddrinfo); + } + break; + case SCTP_PCB_STATUS: + { + struct sctp_pcbinfo *spcb; + + SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize); + sctp_fill_pcbinfo(spcb); + *optsize = sizeof(struct sctp_pcbinfo); + } + break; + + case SCTP_STATUS: + { + struct sctp_nets *net; + struct sctp_status *sstat; + + SCTP_CHECK_AND_CAST(sstat, optval, struct sctp_status, *optsize); + SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id); + + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + error = EINVAL; + break; + } + /* + * I think passing the state is fine since + * sctp_constants.h will be available to the user + * land. + */ + sstat->sstat_state = stcb->asoc.state; + sstat->sstat_assoc_id = sctp_get_associd(stcb); + sstat->sstat_rwnd = stcb->asoc.peers_rwnd; + sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt; + /* + * We can't include chunks that have been passed to + * the socket layer. Only things in queue. + */ + sstat->sstat_penddata = (stcb->asoc.cnt_on_reasm_queue + + stcb->asoc.cnt_on_all_streams); + + + sstat->sstat_instrms = stcb->asoc.streamincnt; + sstat->sstat_outstrms = stcb->asoc.streamoutcnt; + sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb, &stcb->asoc); + memcpy(&sstat->sstat_primary.spinfo_address, + &stcb->asoc.primary_destination->ro._l_addr, + ((struct sockaddr *)(&stcb->asoc.primary_destination->ro._l_addr))->sa_len); + net = stcb->asoc.primary_destination; + ((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport; + /* + * Again the user can get info from sctp_constants.h + * for what the state of the network is. 
+ */ + sstat->sstat_primary.spinfo_state = net->dest_state & SCTP_REACHABLE_MASK; + sstat->sstat_primary.spinfo_cwnd = net->cwnd; + sstat->sstat_primary.spinfo_srtt = net->lastsa; + sstat->sstat_primary.spinfo_rto = net->RTO; + sstat->sstat_primary.spinfo_mtu = net->mtu; + sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb); + SCTP_TCB_UNLOCK(stcb); + *optsize = sizeof(*sstat); + } + break; + case SCTP_RTOINFO: + { + struct sctp_rtoinfo *srto; + + SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); + + if (stcb) { + srto->srto_initial = stcb->asoc.initial_rto; + srto->srto_max = stcb->asoc.maxrto; + srto->srto_min = stcb->asoc.minrto; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + srto->srto_initial = inp->sctp_ep.initial_rto; + srto->srto_max = inp->sctp_ep.sctp_maxrto; + srto->srto_min = inp->sctp_ep.sctp_minrto; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*srto); + } + break; + case SCTP_TIMEOUTS: + { + struct sctp_timeouts *stimo; + + SCTP_CHECK_AND_CAST(stimo, optval, struct sctp_timeouts, *optsize); + SCTP_FIND_STCB(inp, stcb, stimo->stimo_assoc_id); + + if (stcb) { + stimo->stimo_init = stcb->asoc.timoinit; + stimo->stimo_data = stcb->asoc.timodata; + stimo->stimo_sack = stcb->asoc.timosack; + stimo->stimo_shutdown = stcb->asoc.timoshutdown; + stimo->stimo_heartbeat = stcb->asoc.timoheartbeat; + stimo->stimo_cookie = stcb->asoc.timocookie; + stimo->stimo_shutdownack = stcb->asoc.timoshutdownack; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + error = EINVAL; + } + *optsize = sizeof(*stimo); + } + break; + case SCTP_ASSOCINFO: + { + struct sctp_assocparams *sasoc; + uint32_t oldval; + + SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize); + SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); + + if (stcb) { + oldval = sasoc->sasoc_cookie_life; + sasoc->sasoc_cookie_life = TICKS_TO_MSEC(stcb->asoc.cookie_life); + sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times; + sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; + sasoc->sasoc_peer_rwnd = stcb->asoc.peers_rwnd; + sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + sasoc->sasoc_cookie_life = TICKS_TO_MSEC(inp->sctp_ep.def_cookie_life); + sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times; + sasoc->sasoc_number_peer_destinations = 0; + sasoc->sasoc_peer_rwnd = 0; + sasoc->sasoc_local_rwnd = sbspace(&inp->sctp_socket->so_rcv); + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*sasoc); + } + break; + case SCTP_DEFAULT_SEND_PARAM: + { + struct sctp_sndrcvinfo *s_info; + + SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, *optsize); + SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); + + if (stcb) { + memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send)); + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_RLOCK(inp); + memcpy(s_info, &inp->def_send, sizeof(inp->def_send)); + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*s_info); + } + break; + case SCTP_INITMSG: + { + struct sctp_initmsg *sinit; + + SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, *optsize); + SCTP_INP_RLOCK(inp); + sinit->sinit_num_ostreams = inp->sctp_ep.pre_open_stream_count; + sinit->sinit_max_instreams = inp->sctp_ep.max_open_streams_intome; + sinit->sinit_max_attempts = inp->sctp_ep.max_init_times; + sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max; + SCTP_INP_RUNLOCK(inp); + *optsize = 
sizeof(*sinit); + } + break; + case SCTP_PRIMARY_ADDR: + /* we allow a "get" operation on this */ + { + struct sctp_setprim *ssp; + + SCTP_CHECK_AND_CAST(ssp, optval, struct sctp_setprim, *optsize); + SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id); + + if (stcb) { + /* simply copy out the sockaddr_storage... */ + int len; + + len = *optsize; + if (len > stcb->asoc.primary_destination->ro._l_addr.sa.sa_len) + len = stcb->asoc.primary_destination->ro._l_addr.sa.sa_len; + + memcpy(&ssp->ssp_addr, + &stcb->asoc.primary_destination->ro._l_addr, + len); + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + error = EINVAL; + } + *optsize = sizeof(*ssp); + } + break; + + case SCTP_HMAC_IDENT: + { + struct sctp_hmacalgo *shmac; + sctp_hmaclist_t *hmaclist; + uint32_t size; + int i; + + SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, *optsize); + + SCTP_INP_RLOCK(inp); + hmaclist = inp->sctp_ep.local_hmacs; + if (hmaclist == NULL) { + /* no HMACs to return */ + *optsize = sizeof(*shmac); + SCTP_INP_RUNLOCK(inp); + break; + } + /* is there room for all of the hmac ids? */ + size = sizeof(*shmac) + (hmaclist->num_algo * + sizeof(shmac->shmac_idents[0])); + if ((size_t)(*optsize) < size) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + error = EINVAL; + SCTP_INP_RUNLOCK(inp); + break; + } + /* copy in the list */ + shmac->shmac_number_of_idents = hmaclist->num_algo; + for (i = 0; i < hmaclist->num_algo; i++) { + shmac->shmac_idents[i] = hmaclist->hmac[i]; + } + SCTP_INP_RUNLOCK(inp); + *optsize = size; + break; + } + case SCTP_AUTH_ACTIVE_KEY: + { + struct sctp_authkeyid *scact; + + SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, *optsize); + SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); + + if (stcb) { + /* get the active key on the assoc */ + scact->scact_keynumber = stcb->asoc.authinfo.active_keyid; + SCTP_TCB_UNLOCK(stcb); + } else { + /* get the endpoint active key */ + SCTP_INP_RLOCK(inp); + scact->scact_keynumber = inp->sctp_ep.default_keyid; + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(*scact); + break; + } + case SCTP_LOCAL_AUTH_CHUNKS: + { + struct sctp_authchunks *sac; + sctp_auth_chklist_t *chklist = NULL; + size_t size = 0; + + SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); + SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); + + if (stcb) { + /* get off the assoc */ + chklist = stcb->asoc.local_auth_chunks; + /* is there enough space? */ + size = sctp_auth_get_chklist_size(chklist); + if (*optsize < (sizeof(struct sctp_authchunks) + size)) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + /* copy in the chunks */ + (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + } + SCTP_TCB_UNLOCK(stcb); + } else { + /* get off the endpoint */ + SCTP_INP_RLOCK(inp); + chklist = inp->sctp_ep.local_auth_chunks; + /* is there enough space? 
*/ + size = sctp_auth_get_chklist_size(chklist); + if (*optsize < (sizeof(struct sctp_authchunks) + size)) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + /* copy in the chunks */ + (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + } + SCTP_INP_RUNLOCK(inp); + } + *optsize = sizeof(struct sctp_authchunks) + size; + break; + } + case SCTP_PEER_AUTH_CHUNKS: + { + struct sctp_authchunks *sac; + sctp_auth_chklist_t *chklist = NULL; + size_t size = 0; + + SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize); + SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id); + + if (stcb) { + /* get off the assoc */ + chklist = stcb->asoc.peer_auth_chunks; + /* is there enough space? */ + size = sctp_auth_get_chklist_size(chklist); + if (*optsize < (sizeof(struct sctp_authchunks) + size)) { + error = EINVAL; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + } else { + /* copy in the chunks */ + (void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks); + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + error = ENOENT; + } + *optsize = sizeof(struct sctp_authchunks) + size; + break; + } + + + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); + error = ENOPROTOOPT; + *optsize = 0; + break; + } /* end switch (sopt->sopt_name) */ + return (error); +} + +static int +sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize, + void *p) +{ + int error, set_opt; + uint32_t *mopt; + struct sctp_tcb *stcb = NULL; + struct sctp_inpcb *inp = NULL; + uint32_t vrf_id; + + if (optval == NULL) { + SCTP_PRINTF("optval is NULL\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + SCTP_PRINTF("inp is NULL?\n"); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } + vrf_id = inp->def_vrf_id; + + error = 0; + switch (optname) { + case SCTP_NODELAY: + case SCTP_AUTOCLOSE: + case SCTP_AUTO_ASCONF: + case SCTP_EXPLICIT_EOR: + case SCTP_DISABLE_FRAGMENTS: + case SCTP_USE_EXT_RCVINFO: + case SCTP_I_WANT_MAPPED_V4_ADDR: + /* copy in the option value */ + SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize); + set_opt = 0; + if (error) + break; + switch (optname) { + case SCTP_DISABLE_FRAGMENTS: + set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT; + break; + case SCTP_AUTO_ASCONF: + /* + * NOTE: we don't really support this flag + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + /* only valid for bound all sockets */ + set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + break; + case SCTP_EXPLICIT_EOR: + set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR; + break; + case SCTP_USE_EXT_RCVINFO: + set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO; + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + break; + case SCTP_NODELAY: + set_opt = SCTP_PCB_FLAGS_NODELAY; + break; + case SCTP_AUTOCLOSE: + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + set_opt = SCTP_PCB_FLAGS_AUTOCLOSE; + /* + * 
The value is in ticks.  Note this does not affect
+			 * old associations, only new ones.
+			 */
+			inp->sctp_ep.auto_close_time = SEC_TO_TICKS(*mopt);
+			break;
+		}
+		SCTP_INP_WLOCK(inp);
+		if (*mopt != 0) {
+			sctp_feature_on(inp, set_opt);
+		} else {
+			sctp_feature_off(inp, set_opt);
+		}
+		SCTP_INP_WUNLOCK(inp);
+		break;
+	case SCTP_REUSE_PORT:
+		{
+			SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize);
+			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) {
+				/* Can't set it after we are bound */
+				error = EINVAL;
+				break;
+			}
+			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
+				/* Can't do this for a 1-m socket */
+				error = EINVAL;
+				break;
+			}
+			if (optval)
+				sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE);
+			else
+				sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE);
+		}
+		break;
+	case SCTP_PARTIAL_DELIVERY_POINT:
+		{
+			uint32_t *value;
+
+			SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize);
+			if (*value > SCTP_SB_LIMIT_RCV(so)) {
+				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+				error = EINVAL;
+				break;
+			}
+			inp->partial_delivery_point = *value;
+		}
+		break;
+	case SCTP_FRAGMENT_INTERLEAVE:
+		/* not yet until we re-write sctp_recvmsg() */
+		{
+			uint32_t *level;
+
+			SCTP_CHECK_AND_CAST(level, optval, uint32_t, optsize);
+			if (*level == SCTP_FRAG_LEVEL_2) {
+				sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+				sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+			} else if (*level == SCTP_FRAG_LEVEL_1) {
+				sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+				sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+			} else if (*level == SCTP_FRAG_LEVEL_0) {
+				sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
+				sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
+
+			} else {
+				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+				error = EINVAL;
+			}
+		}
+		break;
+	case SCTP_CMT_ON_OFF:
+		if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) {
+			struct sctp_assoc_value *av;
+
+			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+			if (stcb) {
+				if (av->assoc_value != 0)
+					stcb->asoc.sctp_cmt_on_off = 1;
+				else
+					stcb->asoc.sctp_cmt_on_off = 0;
+				SCTP_TCB_UNLOCK(stcb);
+			} else {
+				SCTP_INP_WLOCK(inp);
+				if (av->assoc_value != 0)
+					inp->sctp_cmt_on_off = 1;
+				else
+					inp->sctp_cmt_on_off = 0;
+				SCTP_INP_WUNLOCK(inp);
+			}
+		} else {
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
+			error = ENOPROTOOPT;
+		}
+		break;
+		/* JRS - Set socket option for pluggable congestion control */
+	case SCTP_PLUGGABLE_CC:
+		{
+			struct sctp_assoc_value *av;
+
+			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
+			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
+			if (stcb) {
+				switch (av->assoc_value) {
+					/*
+					 * JRS - Standard TCP congestion
+					 * control
+					 */
+				case SCTP_CC_RFC2581:
+					{
+						stcb->asoc.congestion_control_module = SCTP_CC_RFC2581;
+						stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped;
+						stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output;
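+						/*
+						 * Together with the fr_timer
+						 * entry set just below, this
+						 * fills the whole
+						 * cc_functions vtable, so
+						 * every congestion-window
+						 * event now dispatches to
+						 * the RFC 2581 routines.
+						 */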
stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer; + SCTP_TCB_UNLOCK(stcb); + break; + } + /* + * JRS - High Speed TCP congestion + * control (Floyd) + */ + case SCTP_CC_HSTCP: + { + stcb->asoc.congestion_control_module = SCTP_CC_HSTCP; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_hs_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_hs_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer; + SCTP_TCB_UNLOCK(stcb); + break; + } + /* JRS - HTCP congestion control */ + case SCTP_CC_HTCP: + { + stcb->asoc.congestion_control_module = SCTP_CC_HTCP; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_htcp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_htcp_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_htcp_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_htcp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_htcp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_htcp_cwnd_update_after_fr_timer; + SCTP_TCB_UNLOCK(stcb); + break; + } + /* + * JRS - All other values are + * invalid + */ + default: + { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + break; + } + } + } else { + switch (av->assoc_value) { + case SCTP_CC_RFC2581: + case SCTP_CC_HSTCP: + case SCTP_CC_HTCP: + inp->sctp_ep.sctp_default_cc_module = av->assoc_value; + break; + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + }; + } + } + break; + case SCTP_CLR_STAT_LOG: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + error = EOPNOTSUPP; + break; + case SCTP_CONTEXT: + { + struct sctp_assoc_value *av; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (stcb) { + stcb->asoc.context = av->assoc_value; + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + inp->sctp_context = av->assoc_value; + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_VRF_ID: + { + uint32_t *default_vrfid; + + SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, optsize); + if (*default_vrfid > SCTP_MAX_VRF_ID) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + inp->def_vrf_id = *default_vrfid; + break; + } + case SCTP_DEL_VRF_ID: + { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + error = EOPNOTSUPP; + break; + } + case SCTP_ADD_VRF_ID: + { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP); + error = EOPNOTSUPP; + break; + } + case 
SCTP_DELAYED_SACK: + { + struct sctp_sack_info *sack; + + SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, optsize); + SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id); + if (sack->sack_delay) { + if (sack->sack_delay > SCTP_MAX_SACK_DELAY) + sack->sack_delay = SCTP_MAX_SACK_DELAY; + } + if (stcb) { + if (sack->sack_delay) { + if (MSEC_TO_TICKS(sack->sack_delay) < 1) { + sack->sack_delay = TICKS_TO_MSEC(1); + } + stcb->asoc.delayed_ack = sack->sack_delay; + } + if (sack->sack_freq) { + stcb->asoc.sack_freq = sack->sack_freq; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + if (sack->sack_delay) { + if (MSEC_TO_TICKS(sack->sack_delay) < 1) { + sack->sack_delay = TICKS_TO_MSEC(1); + } + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = MSEC_TO_TICKS(sack->sack_delay); + } + if (sack->sack_freq) { + inp->sctp_ep.sctp_sack_freq = sack->sack_freq; + } + SCTP_INP_WUNLOCK(inp); + } + break; + } + case SCTP_AUTH_CHUNK: + { + struct sctp_authchunk *sauth; + + SCTP_CHECK_AND_CAST(sauth, optval, struct sctp_authchunk, optsize); + + SCTP_INP_WLOCK(inp); + if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + break; + } + case SCTP_AUTH_KEY: + { + struct sctp_authkey *sca; + struct sctp_keyhead *shared_keys; + sctp_sharedkey_t *shared_key; + sctp_key_t *key = NULL; + size_t size; + + SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize); + SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id); + size = optsize - sizeof(*sca); + + if (stcb) { + /* set it on the assoc */ + shared_keys = &stcb->asoc.shared_keys; + /* clear the cached keys for this key id */ + sctp_clear_cachedkeys(stcb, sca->sca_keynumber); + /* + * create the new shared key and + * insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_TCB_UNLOCK(stcb); + break; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_TCB_UNLOCK(stcb); + break; + } + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); + SCTP_TCB_UNLOCK(stcb); + } else { + /* set it on the endpoint */ + SCTP_INP_WLOCK(inp); + shared_keys = &inp->sctp_ep.shared_keys; + /* + * clear the cached keys on all assocs for + * this key id + */ + sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber); + /* + * create the new shared key and + * insert/replace it + */ + if (size > 0) { + key = sctp_set_key(sca->sca_key, (uint32_t) size); + if (key == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_INP_WUNLOCK(inp); + break; + } + } + shared_key = sctp_alloc_sharedkey(); + if (shared_key == NULL) { + sctp_free_key(key); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + SCTP_INP_WUNLOCK(inp); + break; + } + shared_key->key = key; + shared_key->keyid = sca->sca_keynumber; + error = sctp_insert_sharedkey(shared_keys, shared_key); + SCTP_INP_WUNLOCK(inp); + } + break; + } + case SCTP_HMAC_IDENT: + { + struct sctp_hmacalgo *shmac; + sctp_hmaclist_t *hmaclist; + uint16_t hmacid; + uint32_t i; + + size_t found; + + SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize); 
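+			/*
+			 * The caller supplies the complete, ordered list of
+			 * HMAC ids it wants, e.g. (illustrative userland
+			 * sketch; variable names are hypothetical):
+			 *
+			 *	struct sctp_hmacalgo *algo = alloca(
+			 *	    sizeof(*algo) + 2 * sizeof(uint16_t));
+			 *	algo->shmac_number_of_idents = 2;
+			 *	algo->shmac_idents[0] = SCTP_AUTH_HMAC_ID_SHA256;
+			 *	algo->shmac_idents[1] = SCTP_AUTH_HMAC_ID_SHA1;
+			 *	setsockopt(fd, IPPROTO_SCTP, SCTP_HMAC_IDENT,
+			 *	    algo, sizeof(*algo) + 2 * sizeof(uint16_t));
+			 *
+			 * The list is validated below and rejected unless it
+			 * includes SCTP_AUTH_HMAC_ID_SHA1, which RFC 4895
+			 * makes mandatory to implement.
+			 */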
+ if (optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + hmaclist = sctp_alloc_hmaclist(shmac->shmac_number_of_idents); + if (hmaclist == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM); + error = ENOMEM; + break; + } + for (i = 0; i < shmac->shmac_number_of_idents; i++) { + hmacid = shmac->shmac_idents[i]; + if (sctp_auth_add_hmacid(hmaclist, hmacid)) { + /* invalid HMACs were found */ ; + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + sctp_free_hmaclist(hmaclist); + goto sctp_set_hmac_done; + } + } + found = 0; + for (i = 0; i < hmaclist->num_algo; i++) { + if (hmaclist->hmac[i] == SCTP_AUTH_HMAC_ID_SHA1) { + /* already in list */ + found = 1; + } + } + if (!found) { + sctp_free_hmaclist(hmaclist); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + /* set it on the endpoint */ + SCTP_INP_WLOCK(inp); + if (inp->sctp_ep.local_hmacs) + sctp_free_hmaclist(inp->sctp_ep.local_hmacs); + inp->sctp_ep.local_hmacs = hmaclist; + SCTP_INP_WUNLOCK(inp); + sctp_set_hmac_done: + break; + } + case SCTP_AUTH_ACTIVE_KEY: + { + struct sctp_authkeyid *scact; + + SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, + optsize); + SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id); + + /* set the active key on the right place */ + if (stcb) { + /* set the active key on the assoc */ + if (sctp_auth_setactivekey(stcb, + scact->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + /* set the active key on the endpoint */ + SCTP_INP_WLOCK(inp); + if (sctp_auth_setactivekey_ep(inp, + scact->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + break; + } + case SCTP_AUTH_DELETE_KEY: + { + struct sctp_authkeyid *scdel; + + SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, + optsize); + SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id); + + /* delete the key from the right place */ + if (stcb) { + if (sctp_delete_sharedkey(stcb, + scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + if (sctp_delete_sharedkey_ep(inp, + scdel->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + break; + } + case SCTP_AUTH_DEACTIVATE_KEY: + { + struct sctp_authkeyid *keyid; + + SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, + optsize); + SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id); + + /* deactivate the key from the right place */ + if (stcb) { + if (sctp_deact_sharedkey(stcb, + keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + if (sctp_deact_sharedkey_ep(inp, + keyid->scact_keynumber)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, + SCTP_FROM_SCTP_USRREQ, + EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + break; + } + + case SCTP_RESET_STREAMS: + { + struct sctp_stream_reset *strrst; + uint8_t send_in = 0, send_tsn = 0, send_out = 0, + addstream = 0; + uint16_t addstrmcnt = 0; + int i; + + 
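+			/*
+			 * strrst_flags selects what is reset: inbound
+			 * streams, outbound streams, both, the TSN, or the
+			 * addition of new streams.  Adding streams may force
+			 * the out-stream array to be reallocated below while
+			 * preserving anything already queued.
+			 */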
SCTP_CHECK_AND_CAST(strrst, optval, struct sctp_stream_reset, optsize); + SCTP_FIND_STCB(inp, stcb, strrst->strrst_assoc_id); + + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + error = ENOENT; + break; + } + if (stcb->asoc.peer_supports_strreset == 0) { + /* + * Peer does not support it, we return + * protocol not supported since this is true + * for this feature and this peer, not the + * socket request in general. + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPROTONOSUPPORT); + error = EPROTONOSUPPORT; + SCTP_TCB_UNLOCK(stcb); + break; + } + if (stcb->asoc.stream_reset_outstanding) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); + error = EALREADY; + SCTP_TCB_UNLOCK(stcb); + break; + } + if (strrst->strrst_flags == SCTP_RESET_LOCAL_RECV) { + send_in = 1; + } else if (strrst->strrst_flags == SCTP_RESET_LOCAL_SEND) { + send_out = 1; + } else if (strrst->strrst_flags == SCTP_RESET_BOTH) { + send_in = 1; + send_out = 1; + } else if (strrst->strrst_flags == SCTP_RESET_TSN) { + send_tsn = 1; + } else if (strrst->strrst_flags == SCTP_RESET_ADD_STREAMS) { + if (send_tsn || + send_in || + send_out) { + /* We can't do that and add streams */ + error = EINVAL; + goto skip_stuff; + } + if (stcb->asoc.stream_reset_outstanding) { + error = EBUSY; + goto skip_stuff; + } + addstream = 1; + /* We allocate here */ + addstrmcnt = strrst->strrst_num_streams; + if ((int)(addstrmcnt + stcb->asoc.streamoutcnt) > 0xffff) { + /* You can't have more than 64k */ + error = EINVAL; + goto skip_stuff; + } + if ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < addstrmcnt) { + /* Need to allocate more */ + struct sctp_stream_out *oldstream; + struct sctp_stream_queue_pending *sp; + int removed; + + oldstream = stcb->asoc.strmout; + /* get some more */ + SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *, + ((stcb->asoc.streamoutcnt + addstrmcnt) * sizeof(struct sctp_stream_out)), + SCTP_M_STRMO); + if (stcb->asoc.strmout == NULL) { + stcb->asoc.strmout = oldstream; + error = ENOMEM; + goto skip_stuff; + } + /* + * Ok now we proceed with copying + * the old out stuff and + * initializing the new stuff. + */ + SCTP_TCB_SEND_LOCK(stcb); + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); + stcb->asoc.strmout[i].next_sequence_sent = oldstream[i].next_sequence_sent; + stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete; + stcb->asoc.strmout[i].stream_no = i; + if (oldstream[i].next_spoke.tqe_next) { + sctp_remove_from_wheel(stcb, &stcb->asoc, &oldstream[i], 1); + stcb->asoc.strmout[i].next_spoke.tqe_next = NULL; + stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL; + removed = 1; + } else { + /* not on out wheel */ + stcb->asoc.strmout[i].next_spoke.tqe_next = NULL; + stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL; + removed = 0; + } + /* + * now anything on those + * queues? + */ + while (TAILQ_EMPTY(&oldstream[i].outqueue) == 0) { + sp = TAILQ_FIRST(&oldstream[i].outqueue); + TAILQ_REMOVE(&oldstream[i].outqueue, sp, next); + TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next); + } + /* Did we disrupt the wheel? 
*/ + if (removed) { + sctp_insert_on_wheel(stcb, + &stcb->asoc, + &stcb->asoc.strmout[i], + 1); + } + /* + * Now move assoc pointers + * too + */ + if (stcb->asoc.last_out_stream == &oldstream[i]) { + stcb->asoc.last_out_stream = &stcb->asoc.strmout[i]; + } + if (stcb->asoc.locked_on_sending == &oldstream[i]) { + stcb->asoc.locked_on_sending = &stcb->asoc.strmout[i]; + } + } + /* now the new streams */ + for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + addstrmcnt); i++) { + stcb->asoc.strmout[i].next_sequence_sent = 0x0; + TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); + stcb->asoc.strmout[i].stream_no = i; + stcb->asoc.strmout[i].last_msg_incomplete = 0; + stcb->asoc.strmout[i].next_spoke.tqe_next = NULL; + stcb->asoc.strmout[i].next_spoke.tqe_prev = NULL; + } + stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + addstrmcnt; + SCTP_FREE(oldstream, SCTP_M_STRMO); + } + SCTP_TCB_SEND_UNLOCK(stcb); + goto skip_stuff; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + SCTP_TCB_UNLOCK(stcb); + break; + } + for (i = 0; i < strrst->strrst_num_streams; i++) { + if ((send_in) && + + (strrst->strrst_list[i] > stcb->asoc.streamincnt)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto get_out; + } + if ((send_out) && + (strrst->strrst_list[i] > stcb->asoc.streamoutcnt)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto get_out; + } + } + skip_stuff: + if (error) { + get_out: + SCTP_TCB_UNLOCK(stcb); + break; + } + error = sctp_send_str_reset_req(stcb, strrst->strrst_num_streams, + strrst->strrst_list, + send_out, (stcb->asoc.str_reset_seq_in - 3), + send_in, send_tsn, addstream, addstrmcnt); + + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED); + SCTP_TCB_UNLOCK(stcb); + } + break; + + case SCTP_CONNECT_X: + if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + error = sctp_do_connect_x(so, inp, optval, optsize, p, 0); + break; + + case SCTP_CONNECT_X_DELAYED: + if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + error = sctp_do_connect_x(so, inp, optval, optsize, p, 1); + break; + + case SCTP_CONNECT_X_COMPLETE: + { + struct sockaddr *sa; + struct sctp_nets *net; + + /* FIXME MT: check correct? */ + SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize); + + /* find tcb */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { + SCTP_INP_RLOCK(inp); + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb) { + SCTP_TCB_LOCK(stcb); + net = sctp_findnet(stcb, sa); + } + SCTP_INP_RUNLOCK(inp); + } else { + /* + * We increment here since + * sctp_findassociation_ep_addr() wil do a + * decrement if it finds the stcb as long as + * the locked tcb (last argument) is NOT a + * TCB.. aka NULL. 
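Note: SCTP_RESET_STREAMS above doubles as the add-outbound-streams entry point: with SCTP_RESET_ADD_STREAMS the stream array is reallocated under the send lock, queued data and wheel membership migrate to the new array, and only then is the reset request sent. (The send_in/send_out/send_tsn guard at the top of that branch can never fire, since the flag tests are mutually exclusive else-ifs.) A sketch against this tree's pre-RFC 6525 option, with the stream count illustrative:

#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp_uio.h>	/* struct sctp_stream_reset, SCTP_RESET_STREAMS */

/* Ask the kernel to grow the association by `extra` outbound streams. */
static int
add_out_streams(int sd, sctp_assoc_t assoc_id, uint16_t extra)
{
	struct sctp_stream_reset srs;

	memset(&srs, 0, sizeof(srs));
	srs.strrst_assoc_id = assoc_id;
	srs.strrst_flags = SCTP_RESET_ADD_STREAMS;
	srs.strrst_num_streams = extra;	/* reused as the add count above */
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_RESET_STREAMS,
	    &srs, sizeof(srs)));
}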
+ */ + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, sa, &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_DECR_REF(inp); + } + } + + if (stcb == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + error = ENOENT; + break; + } + if (stcb->asoc.delayed_connection == 1) { + stcb->asoc.delayed_connection = 0; + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); + sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, + stcb->asoc.primary_destination, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9); + sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); + } else { + /* + * already expired or did not use delayed + * connectx + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); + error = EALREADY; + } + SCTP_TCB_UNLOCK(stcb); + } + break; + case SCTP_MAX_BURST: + { + uint8_t *burst; + + SCTP_CHECK_AND_CAST(burst, optval, uint8_t, optsize); + + SCTP_INP_WLOCK(inp); + if (*burst) { + inp->sctp_ep.max_burst = *burst; + } + SCTP_INP_WUNLOCK(inp); + } + break; + case SCTP_MAXSEG: + { + struct sctp_assoc_value *av; + int ovh; + + SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize); + SCTP_FIND_STCB(inp, stcb, av->assoc_id); + + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + if (stcb) { + if (av->assoc_value) { + stcb->asoc.sctp_frag_point = (av->assoc_value + ovh); + } else { + stcb->asoc.sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + /* + * FIXME MT: I think this is not in tune + * with the API ID + */ + if (av->assoc_value) { + inp->sctp_frag_point = (av->assoc_value + ovh); + } else { + inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; + } + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_EVENTS: + { + struct sctp_event_subscribe *events; + + SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, optsize); + + SCTP_INP_WLOCK(inp); + if (events->sctp_data_io_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT); + } + + if (events->sctp_association_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT); + } + + if (events->sctp_address_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPADDREVNT); + } + + if (events->sctp_send_failure_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT); + } + + if (events->sctp_peer_error_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPEERERR); + } + + if (events->sctp_shutdown_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT); + } + + if (events->sctp_partial_delivery_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_PDAPIEVNT); + } + + if (events->sctp_adaptation_layer_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT); + } + + if (events->sctp_authentication_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTHEVNT); + } + + if (events->sctp_sender_dry_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT); + 
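Note: the SCTP_EVENTS handling here has one subtle branch, visible just below: when a TCP-model socket turns on sender-dry events while both the send and sent queues are already empty, sctp_ulp_notify() delivers the notification immediately instead of waiting for a future drain. Subscribing from user space is a plain event mask; a minimal sketch, assuming the RFC 6458 struct used above:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp_uio.h>	/* struct sctp_event_subscribe, SCTP_EVENTS */

/* Enable sender-dry notifications (plus per-message sndrcvinfo). */
static int
subscribe_sender_dry(int sd)
{
	struct sctp_event_subscribe ev;

	memset(&ev, 0, sizeof(ev));	/* all other events stay off */
	ev.sctp_data_io_event = 1;
	ev.sctp_sender_dry_event = 1;
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev)));
}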
if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb) { + SCTP_TCB_LOCK(stcb); + } + if (stcb && + TAILQ_EMPTY(&stcb->asoc.send_queue) && + TAILQ_EMPTY(&stcb->asoc.sent_queue) && + (stcb->asoc.stream_queue_cnt == 0)) { + sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED); + } + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + } + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT); + } + + if (events->sctp_stream_reset_event) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); + } else { + sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT); + } + SCTP_INP_WUNLOCK(inp); + } + break; + + case SCTP_ADAPTATION_LAYER: + { + struct sctp_setadaptation *adap_bits; + + SCTP_CHECK_AND_CAST(adap_bits, optval, struct sctp_setadaptation, optsize); + SCTP_INP_WLOCK(inp); + inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind; + SCTP_INP_WUNLOCK(inp); + } + break; +#ifdef SCTP_DEBUG + case SCTP_SET_INITIAL_DBG_SEQ: + { + uint32_t *vvv; + + SCTP_CHECK_AND_CAST(vvv, optval, uint32_t, optsize); + SCTP_INP_WLOCK(inp); + inp->sctp_ep.initial_sequence_debug = *vvv; + SCTP_INP_WUNLOCK(inp); + } + break; +#endif + case SCTP_DEFAULT_SEND_PARAM: + { + struct sctp_sndrcvinfo *s_info; + + SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, optsize); + SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id); + + if (stcb) { + if (s_info->sinfo_stream <= stcb->asoc.streamoutcnt) { + memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send))); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send))); + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_PEER_ADDR_PARAMS: + /* Applys to the specific association */ + { + struct sctp_paddrparams *paddrp; + struct sctp_nets *net; + + SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize); + SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id); + net = NULL; + if (stcb) { + net = sctp_findnet(stcb, (struct sockaddr *)&paddrp->spp_address); + } else { + /* + * We increment here since + * sctp_findassociation_ep_addr() wil do a + * decrement if it finds the stcb as long as + * the locked tcb (last argument) is NOT a + * TCB.. aka NULL. 
+ */ + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, + (struct sockaddr *)&paddrp->spp_address, + &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_DECR_REF(inp); + } + } + if (stcb && (net == NULL)) { + struct sockaddr *sa; + + sa = (struct sockaddr *)&paddrp->spp_address; + if (sa->sa_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)sa; + if (sin->sin_addr.s_addr) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + SCTP_TCB_UNLOCK(stcb); + error = EINVAL; + break; + } + } else if (sa->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)sa; + if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + SCTP_TCB_UNLOCK(stcb); + error = EINVAL; + break; + } + } else { + error = EAFNOSUPPORT; + SCTP_TCB_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } + } + /* sanity checks */ + if ((paddrp->spp_flags & SPP_HB_ENABLE) && (paddrp->spp_flags & SPP_HB_DISABLE)) { + if (stcb) + SCTP_TCB_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) { + if (stcb) + SCTP_TCB_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + if (stcb) { + /************************TCB SPECIFIC SET ******************/ + /* + * do we change the timer for HB, we run + * only one? + */ + int ovh = 0; + + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MED_OVERHEAD; + } else { + ovh = SCTP_MED_V4_OVERHEAD; + } + + if (paddrp->spp_hbinterval) + stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval; + else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) + stcb->asoc.heart_beat_delay = 0; + + /* network sets ? 
*/ + if (net) { + /************************NET SPECIFIC SET ******************/ + if (paddrp->spp_flags & SPP_HB_DEMAND) { + /* on demand HB */ + if (sctp_send_hb(stcb, 1, net) < 0) { + /* asoc destroyed */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + } + if (paddrp->spp_flags & SPP_HB_DISABLE) { + net->dest_state |= SCTP_ADDR_NOHB; + } + if (paddrp->spp_flags & SPP_HB_ENABLE) { + net->dest_state &= ~SCTP_ADDR_NOHB; + } + if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + } + if (paddrp->spp_pathmtu > SCTP_DEFAULT_MINSEGMENT) { + net->mtu = paddrp->spp_pathmtu + ovh; + if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(inp, stcb, net, net->mtu); + } + } + } + if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); + } + } + if (paddrp->spp_pathmaxrxt) + net->failure_threshold = paddrp->spp_pathmaxrxt; +#ifdef INET + if (paddrp->spp_flags & SPP_IPV4_TOS) { + if (net->ro._l_addr.sin.sin_family == AF_INET) { + net->tos_flowlabel = paddrp->spp_ipv4_tos & 0x000000fc; + } + } +#endif +#ifdef INET6 + if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) { + if (net->ro._l_addr.sin6.sin6_family == AF_INET6) { + net->tos_flowlabel = paddrp->spp_ipv6_flowlabel; + } + } +#endif + } else { + /************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/ + if (paddrp->spp_pathmaxrxt) + stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt; + + if (paddrp->spp_flags & SPP_HB_ENABLE) { + /* Turn back on the timer */ + stcb->asoc.hb_is_disabled = 0; + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); + } + if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) && (paddrp->spp_pathmtu >= SCTP_SMALLEST_PMTU)) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10); + } + if (paddrp->spp_pathmtu > SCTP_DEFAULT_MINSEGMENT) { + net->mtu = paddrp->spp_pathmtu + ovh; + if (net->mtu < stcb->asoc.smallest_mtu) { + sctp_pathmtu_adjustment(inp, stcb, net, net->mtu); + } + } + } + } + if (paddrp->spp_flags & SPP_PMTUD_ENABLE) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) { + sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net); + } + } + } + if (paddrp->spp_flags & SPP_HB_DISABLE) { + int cnt_of_unconf = 0; + struct sctp_nets *lnet; + + stcb->asoc.hb_is_disabled = 1; + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + if (lnet->dest_state & SCTP_ADDR_UNCONFIRMED) { + cnt_of_unconf++; + } + } + /* + * stop the timer ONLY if we + * have no unconfirmed + * addresses + */ + if (cnt_of_unconf == 0) { + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, + SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11); + } + } + } + if (paddrp->spp_flags & SPP_HB_ENABLE) { + /* start up the timer. 
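Note: this SPP_HB_ENABLE branch, like the SPP_PMTUD_* ones above it, runs when no single net was selected, so the TAILQ_FOREACH below retunes every path of the association in one call. From user space the same option targets one path by filling spp_address, or the whole association by leaving it zeroed; a sketch, with the interval illustrative and in milliseconds:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp_uio.h>	/* struct sctp_paddrparams, SCTP_PEER_ADDR_PARAMS */

/* Enable heartbeats with a 5 s interval on all paths of an association. */
static int
set_heartbeat(int sd, sctp_assoc_t assoc_id)
{
	struct sctp_paddrparams pp;

	memset(&pp, 0, sizeof(pp));	/* zeroed spp_address == every net */
	pp.spp_assoc_id = assoc_id;
	pp.spp_flags = SPP_HB_ENABLE;
	pp.spp_hbinterval = 5000;	/* milliseconds */
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_PEER_ADDR_PARAMS,
	    &pp, sizeof(pp)));
}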
*/ + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); + } + } +#ifdef INET + if (paddrp->spp_flags & SPP_IPV4_TOS) + stcb->asoc.default_tos = paddrp->spp_ipv4_tos & 0x000000fc; +#endif +#ifdef INET6 + if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) + stcb->asoc.default_flowlabel = paddrp->spp_ipv6_flowlabel; +#endif + + } + SCTP_TCB_UNLOCK(stcb); + } else { + /************************NO TCB, SET TO default stuff ******************/ + SCTP_INP_WLOCK(inp); + /* + * For the TOS/FLOWLABEL stuff you set it + * with the options on the socket + */ + if (paddrp->spp_pathmaxrxt) { + inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt; + } + if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0; + else if (paddrp->spp_hbinterval) { + if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL) + paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL; + inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = MSEC_TO_TICKS(paddrp->spp_hbinterval); + } + if (paddrp->spp_flags & SPP_HB_ENABLE) { + sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + + } else if (paddrp->spp_flags & SPP_HB_DISABLE) { + sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT); + } + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_RTOINFO: + { + struct sctp_rtoinfo *srto; + uint32_t new_init, new_min, new_max; + + SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, optsize); + SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id); + + if (stcb) { + if (srto->srto_initial) + new_init = srto->srto_initial; + else + new_init = stcb->asoc.initial_rto; + if (srto->srto_max) + new_max = srto->srto_max; + else + new_max = stcb->asoc.maxrto; + if (srto->srto_min) + new_min = srto->srto_min; + else + new_min = stcb->asoc.minrto; + if ((new_min <= new_init) && (new_init <= new_max)) { + stcb->asoc.initial_rto = new_init; + stcb->asoc.maxrto = new_max; + stcb->asoc.minrto = new_min; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_INP_WLOCK(inp); + if (srto->srto_initial) + new_init = srto->srto_initial; + else + new_init = inp->sctp_ep.initial_rto; + if (srto->srto_max) + new_max = srto->srto_max; + else + new_max = inp->sctp_ep.sctp_maxrto; + if (srto->srto_min) + new_min = srto->srto_min; + else + new_min = inp->sctp_ep.sctp_minrto; + if ((new_min <= new_init) && (new_init <= new_max)) { + inp->sctp_ep.initial_rto = new_init; + inp->sctp_ep.sctp_maxrto = new_max; + inp->sctp_ep.sctp_minrto = new_min; + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_ASSOCINFO: + { + struct sctp_assocparams *sasoc; + + SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, optsize); + SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id); + if (sasoc->sasoc_cookie_life) { + /* boundary check the cookie life */ + if (sasoc->sasoc_cookie_life < 1000) + sasoc->sasoc_cookie_life = 1000; + if (sasoc->sasoc_cookie_life > SCTP_MAX_COOKIE_LIFE) { + sasoc->sasoc_cookie_life = SCTP_MAX_COOKIE_LIFE; + } + } + if (stcb) { + if (sasoc->sasoc_asocmaxrxt) + stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt; + sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets; + sasoc->sasoc_peer_rwnd = 0; + sasoc->sasoc_local_rwnd = 0; + if (sasoc->sasoc_cookie_life) { + stcb->asoc.cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + } + SCTP_TCB_UNLOCK(stcb); + } else { + 
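Note: the SCTP_RTOINFO case above is transactional: any zero field inherits the current setting before the min <= initial <= max check, so a partial update either applies cleanly or fails with EINVAL as a whole. A user-space sketch; the bounds are illustrative, in milliseconds per RFC 6458:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp_uio.h>	/* struct sctp_rtoinfo, SCTP_RTOINFO */

/* Tighten the RTO bounds; zero fields keep their current values. */
static int
set_rto_bounds(int sd, sctp_assoc_t assoc_id)
{
	struct sctp_rtoinfo rto;

	memset(&rto, 0, sizeof(rto));
	rto.srto_assoc_id = assoc_id;
	rto.srto_initial = 1000;	/* 1 s */
	rto.srto_min = 500;
	rto.srto_max = 10000;
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_RTOINFO, &rto, sizeof(rto)));
}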
SCTP_INP_WLOCK(inp); + if (sasoc->sasoc_asocmaxrxt) + inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt; + sasoc->sasoc_number_peer_destinations = 0; + sasoc->sasoc_peer_rwnd = 0; + sasoc->sasoc_local_rwnd = 0; + if (sasoc->sasoc_cookie_life) { + inp->sctp_ep.def_cookie_life = MSEC_TO_TICKS(sasoc->sasoc_cookie_life); + } + SCTP_INP_WUNLOCK(inp); + } + } + break; + case SCTP_INITMSG: + { + struct sctp_initmsg *sinit; + + SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, optsize); + SCTP_INP_WLOCK(inp); + if (sinit->sinit_num_ostreams) + inp->sctp_ep.pre_open_stream_count = sinit->sinit_num_ostreams; + + if (sinit->sinit_max_instreams) + inp->sctp_ep.max_open_streams_intome = sinit->sinit_max_instreams; + + if (sinit->sinit_max_attempts) + inp->sctp_ep.max_init_times = sinit->sinit_max_attempts; + + if (sinit->sinit_max_init_timeo) + inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo; + SCTP_INP_WUNLOCK(inp); + } + break; + case SCTP_PRIMARY_ADDR: + { + struct sctp_setprim *spa; + struct sctp_nets *net, *lnet; + + SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize); + SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id); + + net = NULL; + if (stcb) { + net = sctp_findnet(stcb, (struct sockaddr *)&spa->ssp_addr); + } else { + /* + * We increment here since + * sctp_findassociation_ep_addr() wil do a + * decrement if it finds the stcb as long as + * the locked tcb (last argument) is NOT a + * TCB.. aka NULL. + */ + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, + (struct sockaddr *)&spa->ssp_addr, + &net, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_DECR_REF(inp); + } + } + + if ((stcb) && (net)) { + if ((net != stcb->asoc.primary_destination) && + (!(net->dest_state & SCTP_ADDR_UNCONFIRMED))) { + /* Ok we need to set it */ + lnet = stcb->asoc.primary_destination; + if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) { + if (net->dest_state & SCTP_ADDR_SWITCH_PRIMARY) { + net->dest_state |= SCTP_ADDR_DOUBLE_SWITCH; + } + net->dest_state |= SCTP_ADDR_SWITCH_PRIMARY; + } + } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } + } + break; + case SCTP_SET_DYNAMIC_PRIMARY: + { + union sctp_sockstore *ss; + + error = priv_check(curthread, + PRIV_NETINET_RESERVEDPORT); + if (error) + break; + + SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize); + /* SUPER USER CHECK? 
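Note: SCTP_PRIMARY_ADDR above only performs the switch when the requested address resolves to an existing, confirmed net of the association, and it takes the EINVAL path when no (stcb, net) pair is found. A sketch of selecting a peer address as primary; the address must be one the peer already advertised:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp_uio.h>	/* struct sctp_setprim, SCTP_PRIMARY_ADDR */

/* Prefer `peer` (a known peer address) as the primary destination. */
static int
set_primary_path(int sd, sctp_assoc_t assoc_id, const struct sockaddr_in *peer)
{
	struct sctp_setprim prim;

	memset(&prim, 0, sizeof(prim));
	prim.ssp_assoc_id = assoc_id;
	memcpy(&prim.ssp_addr, peer, sizeof(*peer));
	return (setsockopt(sd, IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
	    &prim, sizeof(prim)));
}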
*/ + error = sctp_dynamic_set_primary(&ss->sa, vrf_id); + } + break; + case SCTP_SET_PEER_PRIMARY_ADDR: + { + struct sctp_setpeerprim *sspp; + + SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize); + SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id); + if (stcb != NULL) { + struct sctp_ifa *ifa; + + ifa = sctp_find_ifa_by_addr((struct sockaddr *)&sspp->sspp_addr, + stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED); + if (ifa == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto out_of_it; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { + /* + * Must validate the ifa found is in + * our ep + */ + struct sctp_laddr *laddr; + int found = 0; + + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) { + SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", + __FUNCTION__); + continue; + } + if (laddr->ifa == ifa) { + found = 1; + break; + } + } + if (!found) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto out_of_it; + } + } + if (sctp_set_primary_ip_address_sa(stcb, + (struct sockaddr *)&sspp->sspp_addr) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + out_of_it: + SCTP_TCB_UNLOCK(stcb); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + + } + break; + case SCTP_BINDX_ADD_ADDR: + { + struct sctp_getaddresses *addrs; + size_t sz; + struct thread *td; + + td = (struct thread *)p; + SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, + optsize); + if (addrs->addr->sa_family == AF_INET) { + sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in); + if (optsize < sz) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } +#ifdef INET6 + } else if (addrs->addr->sa_family == AF_INET6) { + sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in6); + if (optsize < sz) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), + (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } +#endif + } else { + error = EAFNOSUPPORT; + break; + } + sctp_bindx_add_address(so, inp, addrs->addr, + addrs->sget_assoc_id, vrf_id, + &error, p); + } + break; + case SCTP_BINDX_REM_ADDR: + { + struct sctp_getaddresses *addrs; + size_t sz; + struct thread *td; + + td = (struct thread *)p; + + SCTP_CHECK_AND_CAST(addrs, optval, struct sctp_getaddresses, optsize); + if (addrs->addr->sa_family == AF_INET) { + sz = sizeof(struct sctp_getaddresses) - sizeof(struct sockaddr) + sizeof(struct sockaddr_in); + if (optsize < sz) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (td != NULL && (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)(addrs->addr))->sin_addr)))) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } +#ifdef INET6 + } else if (addrs->addr->sa_family == AF_INET6) { + sz = sizeof(struct sctp_getaddresses) - 
sizeof(struct sockaddr) + sizeof(struct sockaddr_in6); + if (optsize < sz) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + break; + } + if (td != NULL && (error = prison_local_ip6(td->td_ucred, &(((struct sockaddr_in6 *)(addrs->addr))->sin6_addr), + (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error); + break; + } +#endif + } else { + error = EAFNOSUPPORT; + break; + } + sctp_bindx_delete_address(so, inp, addrs->addr, + addrs->sget_assoc_id, vrf_id, + &error); + } + break; + default: + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT); + error = ENOPROTOOPT; + break; + } /* end switch (opt) */ + return (error); +} + +int +sctp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + void *optval = NULL; + size_t optsize = 0; + struct sctp_inpcb *inp; + void *p; + int error = 0; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + /* I made the same as TCP since we are not setup? */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (ECONNRESET); + } + if (sopt->sopt_level != IPPROTO_SCTP) { + /* wrong proto level... send back up to IP */ +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) + error = ip6_ctloutput(so, sopt); + else +#endif /* INET6 */ + error = ip_ctloutput(so, sopt); + return (error); + } + optsize = sopt->sopt_valsize; + if (optsize) { + SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT); + if (optval == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS); + return (ENOBUFS); + } + error = sooptcopyin(sopt, optval, optsize, optsize); + if (error) { + SCTP_FREE(optval, SCTP_M_SOCKOPT); + goto out; + } + } + p = (void *)sopt->sopt_td; + if (sopt->sopt_dir == SOPT_SET) { + error = sctp_setopt(so, sopt->sopt_name, optval, optsize, p); + } else if (sopt->sopt_dir == SOPT_GET) { + error = sctp_getopt(so, sopt->sopt_name, optval, &optsize, p); + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + } + if ((error == 0) && (optval != NULL)) { + error = sooptcopyout(sopt, optval, optsize); + SCTP_FREE(optval, SCTP_M_SOCKOPT); + } else if (optval != NULL) { + SCTP_FREE(optval, SCTP_M_SOCKOPT); + } +out: + return (error); +} + + +static int +sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p) +{ + int error = 0; + int create_lock_on = 0; + uint32_t vrf_id; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb = NULL; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + /* I made the same as TCP since we are not setup? 
*/ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (ECONNRESET); + } + if (addr == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return EINVAL; + } +#ifdef INET6 + if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6p; + + if (addr->sa_len != sizeof(struct sockaddr_in6)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + sin6p = (struct sockaddr_in6 *)addr; + if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6p->sin6_addr)) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + return (error); + } + } else +#endif + if (addr->sa_family == AF_INET) { + struct sockaddr_in *sinp; + + if (addr->sa_len != sizeof(struct sockaddr_in)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (EINVAL); + } + sinp = (struct sockaddr_in *)addr; + if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sinp->sin_addr)) != 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error); + return (error); + } + } else { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT); + return (EAFNOSUPPORT); + } + SCTP_INP_INCR_REF(inp); + SCTP_ASOC_CREATE_LOCK(inp); + create_lock_on = 1; + + + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { + /* Should I really unlock ? */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT); + error = EFAULT; + goto out_now; + } +#ifdef INET6 + if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && + (addr->sa_family == AF_INET6)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto out_now; + } +#endif /* INET6 */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == + SCTP_PCB_FLAGS_UNBOUND) { + /* Bind a ephemeral port */ + error = sctp_inpcb_bind(so, NULL, NULL, p); + if (error) { + goto out_now; + } + } + /* Now do we connect? */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && + (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + error = EINVAL; + goto out_now; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && + (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) { + /* We are already connected AND the TCP model */ + SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE); + error = EADDRINUSE; + goto out_now; + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { + SCTP_INP_RLOCK(inp); + stcb = LIST_FIRST(&inp->sctp_asoc_list); + SCTP_INP_RUNLOCK(inp); + } else { + /* + * We increment here since sctp_findassociation_ep_addr() + * will do a decrement if it finds the stcb as long as the + * locked tcb (last argument) is NOT a TCB.. aka NULL. + */ + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL); + if (stcb == NULL) { + SCTP_INP_DECR_REF(inp); + } else { + SCTP_TCB_UNLOCK(stcb); + } + } + if (stcb != NULL) { + /* Already have or am bring up an association */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY); + error = EALREADY; + goto out_now; + } + vrf_id = inp->def_vrf_id; + /* We are GOOD to go */ + stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, p); + if (stcb == NULL) { + /* Gak! 
no memory */ + goto out_now; + } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { + stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; + /* Set the connected flag so we can queue data */ + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_state &= ~SBS_CANTRCVMORE; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_state &= ~SBS_CANTSENDMORE; + SOCKBUF_UNLOCK(&so->so_snd); + SOCK_LOCK(so); + so->so_state &= ~SS_ISDISCONNECTING; + SOCK_UNLOCK(so); + soisconnecting(so); + } + SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_COOKIE_WAIT); + (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); + + /* initialize authentication parameters for the assoc */ + sctp_initialize_auth_params(inp, stcb); + + sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); + SCTP_TCB_UNLOCK(stcb); +out_now: + if (create_lock_on) { + SCTP_ASOC_CREATE_UNLOCK(inp); + } + SCTP_INP_DECR_REF(inp); + return error; +} + +int +sctp_listen(struct socket *so, int backlog, struct thread *p) +{ + /* + * Note this module depends on the protocol processing being called + * AFTER any socket level flags and backlog are applied to the + * socket. The traditional way that the socket flags are applied is + * AFTER protocol processing. We have made a change to the + * sys/kern/uipc_socket.c module to reverse this but this MUST be in + * place if the socket API for SCTP is to work properly. + */ + + int error = 0; + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == 0) { + /* I made the same as TCP since we are not setup? */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return (ECONNRESET); + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) { + /* See if we have a listener */ + struct sctp_inpcb *tinp; + union sctp_sockstore store, *sp; + + sp = &store; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { + /* not bound all */ + struct sctp_laddr *laddr; + + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + memcpy(&store, &laddr->ifa->address, sizeof(store)); + sp->sin.sin_port = inp->sctp_lport; + tinp = sctp_pcb_findep(&sp->sa, 0, 0, inp->def_vrf_id); + if (tinp && (tinp != inp) && + ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && + ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && + (tinp->sctp_socket->so_qlimit)) { + /* + * we have a listener already and + * its not this inp. + */ + SCTP_INP_DECR_REF(tinp); + return (EADDRINUSE); + } else if (tinp) { + SCTP_INP_DECR_REF(tinp); + } + } + } else { + /* Setup a local addr bound all */ + memset(&store, 0, sizeof(store)); + store.sin.sin_port = inp->sctp_lport; +#ifdef INET6 + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + store.sa.sa_family = AF_INET6; + store.sa.sa_len = sizeof(struct sockaddr_in6); + } +#endif + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { + store.sa.sa_family = AF_INET; + store.sa.sa_len = sizeof(struct sockaddr_in); + } + tinp = sctp_pcb_findep(&sp->sa, 0, 0, inp->def_vrf_id); + if (tinp && (tinp != inp) && + ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && + ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && + (tinp->sctp_socket->so_qlimit)) { + /* + * we have a listener already and its not + * this inp. 
+				 */
+				SCTP_INP_DECR_REF(tinp);
+				return (EADDRINUSE);
+			} else if (tinp) {
+				SCTP_INP_DECR_REF(tinp);
+			}
+		}
+	}
+	SCTP_INP_RLOCK(inp);
+#ifdef SCTP_LOCK_LOGGING
+	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) {
+		sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK);
+	}
+#endif
+	SOCK_LOCK(so);
+	error = solisten_proto_check(so);
+	if (error) {
+		SOCK_UNLOCK(so);
+		SCTP_INP_RUNLOCK(inp);
+		return (error);
+	}
+	if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
+	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
+		/*
+		 * The unlucky case - We are in the tcp pool with this guy.
+		 * - Someone else is in the main inp slot. - We must move
+		 * this guy (the listener) to the main slot - We must then
+		 * move the guy that was listener to the TCP Pool.
+		 */
+		if (sctp_swap_inpcb_for_listen(inp)) {
+			goto in_use;
+		}
+	}
+	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
+	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
+		/* We are already connected AND the TCP model */
+in_use:
+		SCTP_INP_RUNLOCK(inp);
+		SOCK_UNLOCK(so);
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
+		return (EADDRINUSE);
+	}
+	SCTP_INP_RUNLOCK(inp);
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
+		/* We must do a bind. */
+		SOCK_UNLOCK(so);
+		if ((error = sctp_inpcb_bind(so, NULL, NULL, p))) {
+			/* bind error, probably perm */
+			return (error);
+		}
+		SOCK_LOCK(so);
+	}
+	/* It appears for 7.0 and on, we must always call this. */
+	solisten_proto(so, backlog);
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+		/* remove the ACCEPTCONN flag for one-to-many sockets */
+		so->so_options &= ~SO_ACCEPTCONN;
+	}
+	if (backlog == 0) {
+		/* turning off listen */
+		so->so_options &= ~SO_ACCEPTCONN;
+	}
+	SOCK_UNLOCK(so);
+	return (error);
+}
+
+static int sctp_defered_wakeup_cnt = 0;
+
+int
+sctp_accept(struct socket *so, struct sockaddr **addr)
+{
+	struct sctp_tcb *stcb;
+	struct sctp_inpcb *inp;
+	union sctp_sockstore store;
+
+#ifdef INET6
+	int error;
+
+#endif
+	inp = (struct sctp_inpcb *)so->so_pcb;
+
+	if (inp == 0) {
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+		return (ECONNRESET);
+	}
+	SCTP_INP_RLOCK(inp);
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
+		SCTP_INP_RUNLOCK(inp);
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
+		return (EOPNOTSUPP);
+	}
+	if (so->so_state & SS_ISDISCONNECTED) {
+		SCTP_INP_RUNLOCK(inp);
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED);
+		return (ECONNABORTED);
+	}
+	stcb = LIST_FIRST(&inp->sctp_asoc_list);
+	if (stcb == NULL) {
+		SCTP_INP_RUNLOCK(inp);
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
+		return (ECONNRESET);
+	}
+	SCTP_TCB_LOCK(stcb);
+	SCTP_INP_RUNLOCK(inp);
+	store = stcb->asoc.primary_destination->ro._l_addr;
+	stcb->asoc.state &= ~SCTP_STATE_IN_ACCEPT_QUEUE;
+	SCTP_TCB_UNLOCK(stcb);
+	switch (store.sa.sa_family) {
+	case AF_INET:
+		{
+			struct sockaddr_in *sin;
+
+			SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
+			if (sin == NULL)
+				return (ENOMEM);
+			sin->sin_family = AF_INET;
+			sin->sin_len = sizeof(*sin);
+			sin->sin_port = ((struct sockaddr_in *)&store)->sin_port;
+			sin->sin_addr = ((struct sockaddr_in *)&store)->sin_addr;
+			*addr = (struct sockaddr *)sin;
+			break;
+		}
+#ifdef INET6
+	case AF_INET6:
+		{
+			struct sockaddr_in6 *sin6;
+
+			SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
+			if (sin6 == NULL)
+				return (ENOMEM);
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_len =
sizeof(*sin6); + sin6->sin6_port = ((struct sockaddr_in6 *)&store)->sin6_port; + + sin6->sin6_addr = ((struct sockaddr_in6 *)&store)->sin6_addr; + if ((error = sa6_recoverscope(sin6)) != 0) { + SCTP_FREE_SONAME(sin6); + return (error); + } + *addr = (struct sockaddr *)sin6; + break; + } +#endif + default: + /* TSNH */ + break; + } + /* Wake any delayed sleep action */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) { + SCTP_INP_WLOCK(inp); + inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE; + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; + SCTP_INP_WUNLOCK(inp); + SOCKBUF_LOCK(&inp->sctp_socket->so_snd); + if (sowriteable(inp->sctp_socket)) { + sowwakeup_locked(inp->sctp_socket); + } else { + SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd); + } + SCTP_INP_WLOCK(inp); + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; + SCTP_INP_WUNLOCK(inp); + SOCKBUF_LOCK(&inp->sctp_socket->so_rcv); + if (soreadable(inp->sctp_socket)) { + sctp_defered_wakeup_cnt++; + sorwakeup_locked(inp->sctp_socket); + } else { + SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv); + } + SCTP_INP_WLOCK(inp); + } + SCTP_INP_WUNLOCK(inp); + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + SCTP_TCB_LOCK(stcb); + sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); + } + return (0); +} + +int +sctp_ingetaddr(struct socket *so, struct sockaddr **addr) +{ + struct sockaddr_in *sin; + uint32_t vrf_id; + struct sctp_inpcb *inp; + struct sctp_ifa *sctp_ifa; + + /* + * Do the malloc first in case it blocks. + */ + SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); + if (sin == NULL) + return (ENOMEM); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + inp = (struct sctp_inpcb *)so->so_pcb; + if (!inp) { + SCTP_FREE_SONAME(sin); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return ECONNRESET; + } + SCTP_INP_RLOCK(inp); + sin->sin_port = inp->sctp_lport; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { + struct sctp_tcb *stcb; + struct sockaddr_in *sin_a; + struct sctp_nets *net; + int fnd; + + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb == NULL) { + goto notConn; + } + fnd = 0; + sin_a = NULL; + SCTP_TCB_LOCK(stcb); + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sin_a = (struct sockaddr_in *)&net->ro._l_addr; + if (sin_a == NULL) + /* this will make coverity happy */ + continue; + + if (sin_a->sin_family == AF_INET) { + fnd = 1; + break; + } + } + if ((!fnd) || (sin_a == NULL)) { + /* punt */ + SCTP_TCB_UNLOCK(stcb); + goto notConn; + } + vrf_id = inp->def_vrf_id; + sctp_ifa = sctp_source_address_selection(inp, + stcb, + (sctp_route_t *) & net->ro, + net, 0, vrf_id); + if (sctp_ifa) { + sin->sin_addr = sctp_ifa->address.sin.sin_addr; + sctp_free_ifa(sctp_ifa); + } + SCTP_TCB_UNLOCK(stcb); + } else { + /* For the bound all case you get back 0 */ + notConn: + sin->sin_addr.s_addr = 0; + } + + } else { + /* Take the first IPv4 address in the list */ + struct sctp_laddr *laddr; + int fnd = 0; + + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa->address.sa.sa_family == AF_INET) { + struct sockaddr_in *sin_a; + + sin_a = (struct sockaddr_in *)&laddr->ifa->address.sa; + sin->sin_addr = sin_a->sin_addr; + fnd = 1; + break; + } + } + if (!fnd) { + SCTP_FREE_SONAME(sin); + SCTP_INP_RUNLOCK(inp); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + 
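Note: sctp_accept() above serves the one-to-one (SOCK_STREAM) model only; one-to-many sockets return EOPNOTSUPP, and the peer address handed back is the association's primary destination, which is not necessarily the source address of the INIT. A bare-bones accepting side for comparison; the port and address family are illustrative, and sin_len is BSD-specific:

#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Accept a single one-to-one SCTP association on `port`. */
static int
sctp_server_accept_once(uint16_t port)
{
	struct sockaddr_in sin, peer;
	socklen_t len = sizeof(peer);
	int lsd, sd;

	lsd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (lsd < 0)
		return (-1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);	/* BSD-style sockaddr */
	sin.sin_port = htons(port);
	if (bind(lsd, (struct sockaddr *)&sin, sizeof(sin)) < 0 ||
	    listen(lsd, 1) < 0)
		return (-1);
	sd = accept(lsd, (struct sockaddr *)&peer, &len);
	return (sd);	/* caller closes lsd and sd */
}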
return ENOENT; + } + } + SCTP_INP_RUNLOCK(inp); + (*addr) = (struct sockaddr *)sin; + return (0); +} + +int +sctp_peeraddr(struct socket *so, struct sockaddr **addr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)*addr; + int fnd; + struct sockaddr_in *sin_a; + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + + /* Do the malloc first in case it blocks. */ + inp = (struct sctp_inpcb *)so->so_pcb; + if ((inp == NULL) || + ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { + /* UDP type and listeners will drop out here */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN); + return (ENOTCONN); + } + SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin); + if (sin == NULL) + return (ENOMEM); + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + + /* We must recapture incase we blocked */ + inp = (struct sctp_inpcb *)so->so_pcb; + if (!inp) { + SCTP_FREE_SONAME(sin); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return ECONNRESET; + } + SCTP_INP_RLOCK(inp); + stcb = LIST_FIRST(&inp->sctp_asoc_list); + if (stcb) { + SCTP_TCB_LOCK(stcb); + } + SCTP_INP_RUNLOCK(inp); + if (stcb == NULL) { + SCTP_FREE_SONAME(sin); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); + return ECONNRESET; + } + fnd = 0; + TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { + sin_a = (struct sockaddr_in *)&net->ro._l_addr; + if (sin_a->sin_family == AF_INET) { + fnd = 1; + sin->sin_port = stcb->rport; + sin->sin_addr = sin_a->sin_addr; + break; + } + } + SCTP_TCB_UNLOCK(stcb); + if (!fnd) { + /* No IPv4 address */ + SCTP_FREE_SONAME(sin); + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); + return ENOENT; + } + (*addr) = (struct sockaddr *)sin; + return (0); +} + +struct pr_usrreqs sctp_usrreqs = { + .pru_abort = sctp_abort, + .pru_accept = sctp_accept, + .pru_attach = sctp_attach, + .pru_bind = sctp_bind, + .pru_connect = sctp_connect, + .pru_control = in_control, + .pru_close = sctp_close, + .pru_detach = sctp_close, + .pru_sopoll = sopoll_generic, + .pru_flush = sctp_flush, + .pru_disconnect = sctp_disconnect, + .pru_listen = sctp_listen, + .pru_peeraddr = sctp_peeraddr, + .pru_send = sctp_sendm, + .pru_shutdown = sctp_shutdown, + .pru_sockaddr = sctp_ingetaddr, + .pru_sosend = sctp_sosend, + .pru_soreceive = sctp_soreceive +}; diff --git a/freebsd/sys/netinet/sctp_var.h b/freebsd/sys/netinet/sctp_var.h new file mode 100644 index 00000000..93b92038 --- /dev/null +++ b/freebsd/sys/netinet/sctp_var.h @@ -0,0 +1,336 @@ +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* $KAME: sctp_var.h,v 1.24 2005/03/06 16:04:19 itojun Exp $ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifndef _NETINET_SCTP_VAR_HH_
+#define _NETINET_SCTP_VAR_HH_
+
+#include <netinet/sctp_uio.h>
+
+#if defined(_KERNEL) || defined(__Userspace__)
+
+extern struct pr_usrreqs sctp_usrreqs;
+
+
+#define sctp_feature_on(inp, feature) (inp->sctp_features |= feature)
+#define sctp_feature_off(inp, feature) (inp->sctp_features &= ~feature)
+#define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature)
+#define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0)
+
+
+/* managing mobility_feature in inpcb (by micchie) */
+#define sctp_mobility_feature_on(inp, feature) (inp->sctp_mobility_features |= feature)
+#define sctp_mobility_feature_off(inp, feature) (inp->sctp_mobility_features &= ~feature)
+#define sctp_is_mobility_feature_on(inp, feature) (inp->sctp_mobility_features & feature)
+#define sctp_is_mobility_feature_off(inp, feature) ((inp->sctp_mobility_features & feature) == 0)
+
+#define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND))
+
+#define sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0))
+
+#define sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > (sb)->sb_cc) ? (sctp_maxspace(sb) - (sb)->sb_cc) : 0))
+
+#define sctp_sbspace_sub(a,b) ((a > b) ? (a - b) : 0)
+
+/*
+ * I tried to cache the readq entries at one point. But the reality
+ * is that it did not add any performance since this meant we had to
+ * lock the STCB on read. And at that point once you have to do an
+ * extra lock, it really does not matter if the lock is in the ZONE
+ * stuff or in our code.
Note that this same problem would occur with + * an mbuf cache as well so it is not really worth doing, at least + * right now :-D + */ + +#define sctp_free_a_readq(_stcb, _readq) { \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \ + SCTP_DECR_READQ_COUNT(); \ +} + +#define sctp_alloc_a_readq(_stcb, _readq) { \ + (_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \ + if ((_readq)) { \ + SCTP_INCR_READQ_COUNT(); \ + } \ +} + +#define sctp_free_a_strmoq(_stcb, _strmoq) { \ + if ((_strmoq)->holds_key_ref) { \ + sctp_auth_key_release(stcb, sp->auth_keyid); \ + (_strmoq)->holds_key_ref = 0; \ + } \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), (_strmoq)); \ + SCTP_DECR_STRMOQ_COUNT(); \ +} + +#define sctp_alloc_a_strmoq(_stcb, _strmoq) { \ + (_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \ + if ((_strmoq)) { \ + memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \ + SCTP_INCR_STRMOQ_COUNT(); \ + (_strmoq)->holds_key_ref = 0; \ + } \ +} + +#define sctp_free_a_chunk(_stcb, _chk) { \ + if ((_chk)->holds_key_ref) {\ + sctp_auth_key_release((_stcb), (_chk)->auth_keyid); \ + (_chk)->holds_key_ref = 0; \ + } \ + if (_stcb) { \ + SCTP_TCB_LOCK_ASSERT((_stcb)); \ + if ((_chk)->whoTo) { \ + sctp_free_remote_addr((_chk)->whoTo); \ + (_chk)->whoTo = NULL; \ + } \ + if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \ + (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ + SCTP_DECR_CHK_COUNT(); \ + } else { \ + TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ + (_stcb)->asoc.free_chunk_cnt++; \ + atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ + } \ + } else { \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \ + SCTP_DECR_CHK_COUNT(); \ + } \ +} + +#define sctp_alloc_a_chunk(_stcb, _chk) { \ + if (TAILQ_EMPTY(&(_stcb)->asoc.free_chunks)) { \ + (_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \ + if ((_chk)) { \ + SCTP_INCR_CHK_COUNT(); \ + (_chk)->whoTo = NULL; \ + (_chk)->holds_key_ref = 0; \ + } \ + } else { \ + (_chk) = TAILQ_FIRST(&(_stcb)->asoc.free_chunks); \ + TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \ + atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \ + (_chk)->holds_key_ref = 0; \ + SCTP_STAT_INCR(sctps_cached_chk); \ + (_stcb)->asoc.free_chunk_cnt--; \ + } \ +} + + +#define sctp_free_remote_addr(__net) { \ + if ((__net)) { \ + if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \ + (void)SCTP_OS_TIMER_STOP(&(__net)->rxt_timer.timer); \ + (void)SCTP_OS_TIMER_STOP(&(__net)->pmtu_timer.timer); \ + (void)SCTP_OS_TIMER_STOP(&(__net)->fr_timer.timer); \ + if ((__net)->ro.ro_rt) { \ + RTFREE((__net)->ro.ro_rt); \ + (__net)->ro.ro_rt = NULL; \ + } \ + if ((__net)->src_addr_selected) { \ + sctp_free_ifa((__net)->ro._s_addr); \ + (__net)->ro._s_addr = NULL; \ + } \ + (__net)->src_addr_selected = 0; \ + (__net)->dest_state = SCTP_ADDR_NOT_REACHABLE; \ + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \ + SCTP_DECR_RADDR_COUNT(); \ + } \ + } \ +} + +#define sctp_sbfree(ctl, stcb, sb, m) { \ + SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \ + SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \ + if (((ctl)->do_not_ref_stcb == 0) && stcb) {\ + SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \ + 
SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ + } \ + if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ + SCTP_BUF_TYPE(m) != MT_OOBDATA) \ + atomic_subtract_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ +} + +#define sctp_sballoc(stcb, sb, m) { \ + atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \ + atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \ + if (stcb) { \ + atomic_add_int(&(stcb)->asoc.sb_cc,SCTP_BUF_LEN((m))); \ + atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \ + } \ + if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \ + SCTP_BUF_TYPE(m) != MT_OOBDATA) \ + atomic_add_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \ +} + + +#define sctp_ucount_incr(val) { \ + val++; \ +} + +#define sctp_ucount_decr(val) { \ + if (val > 0) { \ + val--; \ + } else { \ + val = 0; \ + } \ +} + +#define sctp_mbuf_crush(data) do { \ + struct mbuf *_m; \ + _m = (data); \ + while(_m && (SCTP_BUF_LEN(_m) == 0)) { \ + (data) = SCTP_BUF_NEXT(_m); \ + SCTP_BUF_NEXT(_m) = NULL; \ + sctp_m_free(_m); \ + _m = (data); \ + } \ +} while (0) + +#define sctp_flight_size_decrease(tp1) do { \ + if (tp1->whoTo->flight_size >= tp1->book_size) \ + tp1->whoTo->flight_size -= tp1->book_size; \ + else \ + tp1->whoTo->flight_size = 0; \ +} while (0) + +#define sctp_flight_size_increase(tp1) do { \ + (tp1)->whoTo->flight_size += (tp1)->book_size; \ +} while (0) + +#ifdef SCTP_FS_SPEC_LOG +#define sctp_total_flight_decrease(stcb, tp1) do { \ + if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ + stcb->asoc.fs_index = 0;\ + stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ + stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ + stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ + stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ + stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \ + stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \ + stcb->asoc.fs_index++; \ + tp1->window_probe = 0; \ + if (stcb->asoc.total_flight >= tp1->book_size) { \ + stcb->asoc.total_flight -= tp1->book_size; \ + if (stcb->asoc.total_flight_count > 0) \ + stcb->asoc.total_flight_count--; \ + } else { \ + stcb->asoc.total_flight = 0; \ + stcb->asoc.total_flight_count = 0; \ + } \ +} while (0) + +#define sctp_total_flight_increase(stcb, tp1) do { \ + if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \ + stcb->asoc.fs_index = 0;\ + stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \ + stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.TSN_seq; \ + stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \ + stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \ + stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \ + stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \ + stcb->asoc.fs_index++; \ + (stcb)->asoc.total_flight_count++; \ + (stcb)->asoc.total_flight += (tp1)->book_size; \ +} while (0) + +#else + +#define sctp_total_flight_decrease(stcb, tp1) do { \ + tp1->window_probe = 0; \ + if (stcb->asoc.total_flight >= tp1->book_size) { \ + stcb->asoc.total_flight -= tp1->book_size; \ + if (stcb->asoc.total_flight_count > 0) \ + stcb->asoc.total_flight_count--; \ + } else { \ + stcb->asoc.total_flight = 0; \ + stcb->asoc.total_flight_count = 0; \ + } \ +} while (0) + +#define sctp_total_flight_increase(stcb, tp1) do { \ + (stcb)->asoc.total_flight_count++; \ + (stcb)->asoc.total_flight += (tp1)->book_size; \ +} while (0) + +#endif + + +struct sctp_nets; +struct sctp_inpcb; +struct sctp_tcb; +struct sctphdr; 
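Note: the sctp_alloc_a_chunk()/sctp_free_a_chunk() macros earlier in this header keep a bounded per-association free list (asoc.free_chunks) in front of the zone allocator, capped by the sctp_asoc_free_resc_limit and sctp_system_free_resc_limit sysctls. The same pattern in stand-alone, portable C; malloc() stands in for the UMA zone and the cap is illustrative:

#include <stdlib.h>
#include <sys/queue.h>

#define CHUNK_CACHE_LIMIT	512	/* illustrative; the kernel uses sysctls */

struct chunk {
	TAILQ_ENTRY(chunk) link;
	char payload[128];
};

struct chunk_cache {
	TAILQ_HEAD(, chunk) free_chunks;
	unsigned int free_cnt;
};

static void
chunk_cache_init(struct chunk_cache *cc)
{
	TAILQ_INIT(&cc->free_chunks);
	cc->free_cnt = 0;
}

/* Reuse a cached chunk when possible, else fall back to the allocator. */
static struct chunk *
chunk_alloc(struct chunk_cache *cc)
{
	struct chunk *c;

	if ((c = TAILQ_FIRST(&cc->free_chunks)) != NULL) {
		TAILQ_REMOVE(&cc->free_chunks, c, link);
		cc->free_cnt--;
		return (c);
	}
	return (malloc(sizeof(*c)));	/* may return NULL */
}

/* Park the chunk on the free list unless the cache is already full. */
static void
chunk_free(struct chunk_cache *cc, struct chunk *c)
{
	if (cc->free_cnt >= CHUNK_CACHE_LIMIT) {
		free(c);
		return;
	}
	TAILQ_INSERT_TAIL(&cc->free_chunks, c, link);
	cc->free_cnt++;
}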
+ + +void sctp_close(struct socket *so); +int sctp_disconnect(struct socket *so); + +void sctp_ctlinput __P((int, struct sockaddr *, void *)); +int sctp_ctloutput __P((struct socket *, struct sockopt *)); +void sctp_input_with_port __P((struct mbuf *, int, uint16_t)); +void sctp_input __P((struct mbuf *, int)); +void sctp_pathmtu_adjustment __P((struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, uint16_t)); +void sctp_drain __P((void)); +void sctp_init __P((void)); + +void sctp_finish(void); + +int sctp_flush(struct socket *, int); +int sctp_shutdown __P((struct socket *)); +void sctp_notify +__P((struct sctp_inpcb *, struct ip *ip, struct sctphdr *, + struct sockaddr *, struct sctp_tcb *, + struct sctp_nets *)); + + int sctp_bindx(struct socket *, int, struct sockaddr_storage *, + int, int, struct proc *); + +/* can't use sctp_assoc_t here */ + int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *); + + int sctp_ingetaddr(struct socket *, + struct sockaddr ** +); + + int sctp_peeraddr(struct socket *, + struct sockaddr ** +); + + int sctp_listen(struct socket *, int, struct thread *); + + int sctp_accept(struct socket *, struct sockaddr **); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_SCTP_VAR_HH_ */ diff --git a/freebsd/sys/netinet/sctputil.c b/freebsd/sys/netinet/sctputil.c new file mode 100644 index 00000000..7e8ac1ea --- /dev/null +++ b/freebsd/sys/netinet/sctputil.c @@ -0,0 +1,6977 @@ +#include + +/*- + * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* $KAME: sctputil.c,v 1.37 2005/03/07 23:26:09 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#ifdef INET6 +#endif +#include +#include +#include +#include +#include /* for sctp_deliver_data() */ +#include +#include +#include +#include + + +#ifndef KTR_SCTP +#define KTR_SCTP KTR_SUBSYS +#endif + +void +sctp_sblog(struct sockbuf *sb, + struct sctp_tcb *stcb, int from, int incr) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.sb.stcb = stcb; + sctp_clog.x.sb.so_sbcc = sb->sb_cc; + if (stcb) + sctp_clog.x.sb.stcb_sbcc = stcb->asoc.sb_cc; + else + sctp_clog.x.sb.stcb_sbcc = 0; + sctp_clog.x.sb.incr = incr; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_SB, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + +void +sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.close.inp = (void *)inp; + sctp_clog.x.close.sctp_flags = inp->sctp_flags; + if (stcb) { + sctp_clog.x.close.stcb = (void *)stcb; + sctp_clog.x.close.state = (uint16_t) stcb->asoc.state; + } else { + sctp_clog.x.close.stcb = 0; + sctp_clog.x.close.state = 0; + } + sctp_clog.x.close.loc = loc; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_CLOSE, + 0, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + + +void +rto_logging(struct sctp_nets *net, int from) +{ + struct sctp_cwnd_log sctp_clog; + + memset(&sctp_clog, 0, sizeof(sctp_clog)); + sctp_clog.x.rto.net = (void *)net; + sctp_clog.x.rto.rtt = net->prev_rtt; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_RTT, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.strlog.stcb = stcb; + sctp_clog.x.strlog.n_tsn = tsn; + sctp_clog.x.strlog.n_sseq = sseq; + sctp_clog.x.strlog.e_tsn = 0; + sctp_clog.x.strlog.e_sseq = 0; + sctp_clog.x.strlog.strm = stream; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_STRM, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_nagle_event(struct sctp_tcb *stcb, int action) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.nagle.stcb = (void *)stcb; + sctp_clog.x.nagle.total_flight = stcb->asoc.total_flight; + sctp_clog.x.nagle.total_in_queue = stcb->asoc.total_output_queue_size; + sctp_clog.x.nagle.count_in_queue = stcb->asoc.chunks_on_out_queue; + sctp_clog.x.nagle.count_in_flight = stcb->asoc.total_flight_count; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_NAGLE, + action, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + + +void +sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.sack.cumack = cumack; + sctp_clog.x.sack.oldcumack = old_cumack; + sctp_clog.x.sack.tsn = tsn; + sctp_clog.x.sack.numGaps = gaps; + sctp_clog.x.sack.numDups = dups; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_SACK, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + +void 
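The sctp_log_*() routines above, and those that follow, all share one pattern: fill a single arm of the sctp_cwnd_log union, then emit the entry through SCTP_CTR6() by reading the first four 32-bit words back via the misc view, so every trace record has the same fixed shape regardless of event type. A minimal userland model of that union trick (illustrative only; trace_emit() stands in for SCTP_CTR6()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct trace_rec {
	union {
		struct { uint32_t rtt; } rto;			  /* one arm per event */
		struct { uint32_t cumack, oldcumack, tsn; } sack;
		struct { uint32_t log1, log2, log3, log4; } misc; /* raw view */
	} x;
};

static void
trace_emit(int event, int from, const struct trace_rec *rec)
{
	/* stands in for SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", ...) */
	printf("SCTP:%d[%d]:%x-%x-%x-%x\n", event, from,
	    rec->x.misc.log1, rec->x.misc.log2,
	    rec->x.misc.log3, rec->x.misc.log4);
}

int
main(void)
{
	struct trace_rec rec;

	memset(&rec, 0, sizeof(rec));	/* as several of the kernel loggers do */
	rec.x.sack.cumack = 0x1000;
	rec.x.sack.oldcumack = 0x0ff0;
	rec.x.sack.tsn = 0x1001;
	trace_emit(5 /* hypothetical event id */, 1, &rec);
	return (0);
}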
+sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from) +{ + struct sctp_cwnd_log sctp_clog; + + memset(&sctp_clog, 0, sizeof(sctp_clog)); + sctp_clog.x.map.base = map; + sctp_clog.x.map.cum = cum; + sctp_clog.x.map.high = high; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_MAP, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + +void +sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, + int from) +{ + struct sctp_cwnd_log sctp_clog; + + memset(&sctp_clog, 0, sizeof(sctp_clog)); + sctp_clog.x.fr.largest_tsn = biggest_tsn; + sctp_clog.x.fr.largest_new_tsn = biggest_new_tsn; + sctp_clog.x.fr.tsn = tsn; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_FR, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + + +void +sctp_log_mb(struct mbuf *m, int from) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.mb.mp = m; + sctp_clog.x.mb.mbuf_flags = (uint8_t) (SCTP_BUF_GET_FLAGS(m)); + sctp_clog.x.mb.size = (uint16_t) (SCTP_BUF_LEN(m)); + sctp_clog.x.mb.data = SCTP_BUF_AT(m, 0); + if (SCTP_BUF_IS_EXTENDED(m)) { + sctp_clog.x.mb.ext = SCTP_BUF_EXTEND_BASE(m); + sctp_clog.x.mb.refcnt = (uint8_t) (SCTP_BUF_EXTEND_REFCNT(m)); + } else { + sctp_clog.x.mb.ext = 0; + sctp_clog.x.mb.refcnt = 0; + } + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_MBUF, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + + +void +sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, + int from) +{ + struct sctp_cwnd_log sctp_clog; + + if (control == NULL) { + SCTP_PRINTF("Gak log of NULL?\n"); + return; + } + sctp_clog.x.strlog.stcb = control->stcb; + sctp_clog.x.strlog.n_tsn = control->sinfo_tsn; + sctp_clog.x.strlog.n_sseq = control->sinfo_ssn; + sctp_clog.x.strlog.strm = control->sinfo_stream; + if (poschk != NULL) { + sctp_clog.x.strlog.e_tsn = poschk->sinfo_tsn; + sctp_clog.x.strlog.e_sseq = poschk->sinfo_ssn; + } else { + sctp_clog.x.strlog.e_tsn = 0; + sctp_clog.x.strlog.e_sseq = 0; + } + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_STRM, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.cwnd.net = net; + if (stcb->asoc.send_queue_cnt > 255) + sctp_clog.x.cwnd.cnt_in_send = 255; + else + sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt; + if (stcb->asoc.stream_queue_cnt > 255) + sctp_clog.x.cwnd.cnt_in_str = 255; + else + sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt; + + if (net) { + sctp_clog.x.cwnd.cwnd_new_value = net->cwnd; + sctp_clog.x.cwnd.inflight = net->flight_size; + sctp_clog.x.cwnd.pseudo_cumack = net->pseudo_cumack; + sctp_clog.x.cwnd.meets_pseudo_cumack = net->new_pseudo_cumack; + sctp_clog.x.cwnd.need_new_pseudo_cumack = net->find_pseudo_cumack; + } + if (SCTP_CWNDLOG_PRESEND == from) { + sctp_clog.x.cwnd.meets_pseudo_cumack = stcb->asoc.peers_rwnd; + } + sctp_clog.x.cwnd.cwnd_augment = augment; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_CWND, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb 
*stcb, uint8_t from) +{ + struct sctp_cwnd_log sctp_clog; + + memset(&sctp_clog, 0, sizeof(sctp_clog)); + if (inp) { + sctp_clog.x.lock.sock = (void *)inp->sctp_socket; + + } else { + sctp_clog.x.lock.sock = (void *)NULL; + } + sctp_clog.x.lock.inp = (void *)inp; + if (stcb) { + sctp_clog.x.lock.tcb_lock = mtx_owned(&stcb->tcb_mtx); + } else { + sctp_clog.x.lock.tcb_lock = SCTP_LOCK_UNKNOWN; + } + if (inp) { + sctp_clog.x.lock.inp_lock = mtx_owned(&inp->inp_mtx); + sctp_clog.x.lock.create_lock = mtx_owned(&inp->inp_create_mtx); + } else { + sctp_clog.x.lock.inp_lock = SCTP_LOCK_UNKNOWN; + sctp_clog.x.lock.create_lock = SCTP_LOCK_UNKNOWN; + } + sctp_clog.x.lock.info_lock = rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx)); + if (inp && (inp->sctp_socket)) { + sctp_clog.x.lock.sock_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx)); + sctp_clog.x.lock.sockrcvbuf_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx)); + sctp_clog.x.lock.socksndbuf_lock = mtx_owned(&(inp->sctp_socket->so_snd.sb_mtx)); + } else { + sctp_clog.x.lock.sock_lock = SCTP_LOCK_UNKNOWN; + sctp_clog.x.lock.sockrcvbuf_lock = SCTP_LOCK_UNKNOWN; + sctp_clog.x.lock.socksndbuf_lock = SCTP_LOCK_UNKNOWN; + } + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_LOCK_EVENT, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from) +{ + struct sctp_cwnd_log sctp_clog; + + memset(&sctp_clog, 0, sizeof(sctp_clog)); + sctp_clog.x.cwnd.net = net; + sctp_clog.x.cwnd.cwnd_new_value = error; + sctp_clog.x.cwnd.inflight = net->flight_size; + sctp_clog.x.cwnd.cwnd_augment = burst; + if (stcb->asoc.send_queue_cnt > 255) + sctp_clog.x.cwnd.cnt_in_send = 255; + else + sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt; + if (stcb->asoc.stream_queue_cnt > 255) + sctp_clog.x.cwnd.cnt_in_str = 255; + else + sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_MAXBURST, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.rwnd.rwnd = peers_rwnd; + sctp_clog.x.rwnd.send_size = snd_size; + sctp_clog.x.rwnd.overhead = overhead; + sctp_clog.x.rwnd.new_rwnd = 0; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_RWND, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + +void +sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.rwnd.rwnd = peers_rwnd; + sctp_clog.x.rwnd.send_size = flight_size; + sctp_clog.x.rwnd.overhead = overhead; + sctp_clog.x.rwnd.new_rwnd = a_rwndval; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_RWND, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); +} + +void +sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.mbcnt.total_queue_size = total_oq; + sctp_clog.x.mbcnt.size_change = book; + sctp_clog.x.mbcnt.total_queue_mb_size = total_mbcnt_q; + sctp_clog.x.mbcnt.mbcnt_change = mbcnt; + SCTP_CTR6(KTR_SCTP, 
"SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_MBCNT, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_MISC_EVENT, + from, + a, b, c, d); +} + +void +sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t cumtsn, uint32_t wake_cnt, int from) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.wake.stcb = (void *)stcb; + sctp_clog.x.wake.wake_cnt = wake_cnt; + sctp_clog.x.wake.flight = stcb->asoc.total_flight_count; + sctp_clog.x.wake.send_q = stcb->asoc.send_queue_cnt; + sctp_clog.x.wake.sent_q = stcb->asoc.sent_queue_cnt; + + if (stcb->asoc.stream_queue_cnt < 0xff) + sctp_clog.x.wake.stream_qcnt = (uint8_t) stcb->asoc.stream_queue_cnt; + else + sctp_clog.x.wake.stream_qcnt = 0xff; + + if (stcb->asoc.chunks_on_out_queue < 0xff) + sctp_clog.x.wake.chunks_on_oque = (uint8_t) stcb->asoc.chunks_on_out_queue; + else + sctp_clog.x.wake.chunks_on_oque = 0xff; + + sctp_clog.x.wake.sctpflags = 0; + /* set in the defered mode stuff */ + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) + sctp_clog.x.wake.sctpflags |= 1; + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) + sctp_clog.x.wake.sctpflags |= 2; + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) + sctp_clog.x.wake.sctpflags |= 4; + /* what about the sb */ + if (stcb->sctp_socket) { + struct socket *so = stcb->sctp_socket; + + sctp_clog.x.wake.sbflags = (uint8_t) ((so->so_snd.sb_flags & 0x00ff)); + } else { + sctp_clog.x.wake.sbflags = 0xff; + } + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_WAKE, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +void +sctp_log_block(uint8_t from, struct socket *so, struct sctp_association *asoc, int sendlen) +{ + struct sctp_cwnd_log sctp_clog; + + sctp_clog.x.blk.onsb = asoc->total_output_queue_size; + sctp_clog.x.blk.send_sent_qcnt = (uint16_t) (asoc->send_queue_cnt + asoc->sent_queue_cnt); + sctp_clog.x.blk.peer_rwnd = asoc->peers_rwnd; + sctp_clog.x.blk.stream_qcnt = (uint16_t) asoc->stream_queue_cnt; + sctp_clog.x.blk.chunks_on_oque = (uint16_t) asoc->chunks_on_out_queue; + sctp_clog.x.blk.flight_size = (uint16_t) (asoc->total_flight / 1024); + sctp_clog.x.blk.sndlen = sendlen; + SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", + SCTP_LOG_EVENT_BLOCK, + from, + sctp_clog.x.misc.log1, + sctp_clog.x.misc.log2, + sctp_clog.x.misc.log3, + sctp_clog.x.misc.log4); + +} + +int +sctp_fill_stat_log(void *optval, size_t *optsize) +{ + /* May need to fix this if ktrdump does not work */ + return (0); +} + +#ifdef SCTP_AUDITING_ENABLED +uint8_t sctp_audit_data[SCTP_AUDIT_SIZE][2]; +static int sctp_audit_indx = 0; + +static +void +sctp_print_audit_report(void) +{ + int i; + int cnt; + + cnt = 0; + for (i = sctp_audit_indx; i < SCTP_AUDIT_SIZE; i++) { + if ((sctp_audit_data[i][0] == 0xe0) && + (sctp_audit_data[i][1] == 0x01)) { + cnt = 0; + SCTP_PRINTF("\n"); + } else if (sctp_audit_data[i][0] == 0xf0) { + cnt = 0; + SCTP_PRINTF("\n"); + } else if ((sctp_audit_data[i][0] == 0xc0) && + (sctp_audit_data[i][1] == 0x01)) { + SCTP_PRINTF("\n"); + cnt = 0; + } + SCTP_PRINTF("%2.2x%2.2x ", (uint32_t) sctp_audit_data[i][0], + (uint32_t) sctp_audit_data[i][1]); + cnt++; + if ((cnt % 14) == 0) + SCTP_PRINTF("\n"); + } + for (i = 0; i < sctp_audit_indx; i++) { + if ((sctp_audit_data[i][0] == 0xe0) && + 
(sctp_audit_data[i][1] == 0x01)) { + cnt = 0; + SCTP_PRINTF("\n"); + } else if (sctp_audit_data[i][0] == 0xf0) { + cnt = 0; + SCTP_PRINTF("\n"); + } else if ((sctp_audit_data[i][0] == 0xc0) && + (sctp_audit_data[i][1] == 0x01)) { + SCTP_PRINTF("\n"); + cnt = 0; + } + SCTP_PRINTF("%2.2x%2.2x ", (uint32_t) sctp_audit_data[i][0], + (uint32_t) sctp_audit_data[i][1]); + cnt++; + if ((cnt % 14) == 0) + SCTP_PRINTF("\n"); + } + SCTP_PRINTF("\n"); +} + +void +sctp_auditing(int from, struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + int resend_cnt, tot_out, rep, tot_book_cnt; + struct sctp_nets *lnet; + struct sctp_tmit_chunk *chk; + + sctp_audit_data[sctp_audit_indx][0] = 0xAA; + sctp_audit_data[sctp_audit_indx][1] = 0x000000ff & from; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + if (inp == NULL) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0x01; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + return; + } + if (stcb == NULL) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0x02; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + return; + } + sctp_audit_data[sctp_audit_indx][0] = 0xA1; + sctp_audit_data[sctp_audit_indx][1] = + (0x000000ff & stcb->asoc.sent_queue_retran_cnt); + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + rep = 0; + tot_book_cnt = 0; + resend_cnt = tot_out = 0; + TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { + if (chk->sent == SCTP_DATAGRAM_RESEND) { + resend_cnt++; + } else if (chk->sent < SCTP_DATAGRAM_RESEND) { + tot_out += chk->book_size; + tot_book_cnt++; + } + } + if (resend_cnt != stcb->asoc.sent_queue_retran_cnt) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0xA1; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + SCTP_PRINTF("resend_cnt:%d asoc-tot:%d\n", + resend_cnt, stcb->asoc.sent_queue_retran_cnt); + rep = 1; + stcb->asoc.sent_queue_retran_cnt = resend_cnt; + sctp_audit_data[sctp_audit_indx][0] = 0xA2; + sctp_audit_data[sctp_audit_indx][1] = + (0x000000ff & stcb->asoc.sent_queue_retran_cnt); + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + } + if (tot_out != stcb->asoc.total_flight) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0xA2; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + rep = 1; + SCTP_PRINTF("tot_flt:%d asoc_tot:%d\n", tot_out, + (int)stcb->asoc.total_flight); + stcb->asoc.total_flight = tot_out; + } + if (tot_book_cnt != stcb->asoc.total_flight_count) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0xA5; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + rep = 1; + SCTP_PRINTF("tot_flt_book:%d\n", tot_book_cnt); + + stcb->asoc.total_flight_count = tot_book_cnt; + } + tot_out = 0; + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + tot_out += lnet->flight_size; + } + if (tot_out != stcb->asoc.total_flight) { + sctp_audit_data[sctp_audit_indx][0] = 0xAF; + sctp_audit_data[sctp_audit_indx][1] = 0xA3; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } + rep = 1; + SCTP_PRINTF("real flight:%d net total was %d\n", + 
stcb->asoc.total_flight, tot_out); + /* now corrective action */ + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + + tot_out = 0; + TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { + if ((chk->whoTo == lnet) && + (chk->sent < SCTP_DATAGRAM_RESEND)) { + tot_out += chk->book_size; + } + } + if (lnet->flight_size != tot_out) { + SCTP_PRINTF("net:%p flight was %d corrected to %d\n", + lnet, lnet->flight_size, + tot_out); + lnet->flight_size = tot_out; + } + } + } + if (rep) { + sctp_print_audit_report(); + } +} + +void +sctp_audit_log(uint8_t ev, uint8_t fd) +{ + + sctp_audit_data[sctp_audit_indx][0] = ev; + sctp_audit_data[sctp_audit_indx][1] = fd; + sctp_audit_indx++; + if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { + sctp_audit_indx = 0; + } +} + +#endif + +/* + * sctp_stop_timers_for_shutdown() should be called + * when entering the SHUTDOWN_SENT or SHUTDOWN_ACK_SENT + * state to make sure that all timers are stopped. + */ +void +sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb) +{ + struct sctp_association *asoc; + struct sctp_nets *net; + + asoc = &stcb->asoc; + + (void)SCTP_OS_TIMER_STOP(&asoc->hb_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->dack_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->strreset_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->asconf_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->autoclose_timer.timer); + (void)SCTP_OS_TIMER_STOP(&asoc->delayed_event_timer.timer); + TAILQ_FOREACH(net, &asoc->nets, sctp_next) { + (void)SCTP_OS_TIMER_STOP(&net->fr_timer.timer); + (void)SCTP_OS_TIMER_STOP(&net->pmtu_timer.timer); + } +} + +/* + * a list of sizes based on typical mtu's, used only if next hop size not + * returned. + */ +static uint32_t sctp_mtu_sizes[] = { + 68, + 296, + 508, + 512, + 544, + 576, + 1006, + 1492, + 1500, + 1536, + 2002, + 2048, + 4352, + 4464, + 8166, + 17914, + 32000, + 65535 +}; + +/* + * Return the largest MTU smaller than val. If there is no + * entry, just return val. + */ +uint32_t +sctp_get_prev_mtu(uint32_t val) +{ + uint32_t i; + + if (val <= sctp_mtu_sizes[0]) { + return (val); + } + for (i = 1; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) { + if (val <= sctp_mtu_sizes[i]) { + break; + } + } + return (sctp_mtu_sizes[i - 1]); +} + +/* + * Return the smallest MTU larger than val. If there is no + * entry, just return val. + */ +uint32_t +sctp_get_next_mtu(struct sctp_inpcb *inp, uint32_t val) +{ + /* select another MTU that is just bigger than this one */ + uint32_t i; + + for (i = 0; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) { + if (val < sctp_mtu_sizes[i]) { + return (sctp_mtu_sizes[i]); + } + } + return (val); +} + +void +sctp_fill_random_store(struct sctp_pcb *m) +{ + /* + * Here we use the MD5/SHA-1 to hash with our good randomNumbers and + * our counter. The result becomes our good random numbers and we + * then setup to give these out. Note that we do no locking to + * protect this. This is ok, since if competing folks call this we + * will get more gobbled gook in the random store which is what we + * want. 
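The sctp_mtu_sizes[] table above is sorted ascending, and both lookups are linear scans over it: sctp_get_prev_mtu() returns the largest entry strictly smaller than val (or val itself when val is at or below the smallest entry), and sctp_get_next_mtu() returns the smallest entry strictly larger (or val when there is none). A standalone model of the two lookups (illustrative only):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static const uint32_t mtu_sizes[] = {
	68, 296, 508, 512, 544, 576, 1006, 1492, 1500, 1536,
	2002, 2048, 4352, 4464, 8166, 17914, 32000, 65535
};
#define	NELEM(a) (sizeof(a) / sizeof((a)[0]))

/* Largest table entry strictly below val; val itself at the low end. */
static uint32_t
prev_mtu(uint32_t val)
{
	size_t i;

	if (val <= mtu_sizes[0])
		return (val);
	for (i = 1; i < NELEM(mtu_sizes); i++) {
		if (val <= mtu_sizes[i])
			break;
	}
	return (mtu_sizes[i - 1]);
}

/* Smallest table entry strictly above val; val itself past the high end. */
static uint32_t
next_mtu(uint32_t val)
{
	size_t i;

	for (i = 0; i < NELEM(mtu_sizes); i++) {
		if (val < mtu_sizes[i])
			return (mtu_sizes[i]);
	}
	return (val);
}

int
main(void)
{
	printf("%u %u\n", prev_mtu(1500), next_mtu(1500));	/* 1492 1536 */
	return (0);
}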
There is a danger that two guys will use the same random + * numbers, but thats ok too since that is random as well :-> + */ + m->store_at = 0; + (void)sctp_hmac(SCTP_HMAC, (uint8_t *) m->random_numbers, + sizeof(m->random_numbers), (uint8_t *) & m->random_counter, + sizeof(m->random_counter), (uint8_t *) m->random_store); + m->random_counter++; +} + +uint32_t +sctp_select_initial_TSN(struct sctp_pcb *inp) +{ + /* + * A true implementation should use random selection process to get + * the initial stream sequence number, using RFC1750 as a good + * guideline + */ + uint32_t x, *xp; + uint8_t *p; + int store_at, new_store; + + if (inp->initial_sequence_debug != 0) { + uint32_t ret; + + ret = inp->initial_sequence_debug; + inp->initial_sequence_debug++; + return (ret); + } +retry: + store_at = inp->store_at; + new_store = store_at + sizeof(uint32_t); + if (new_store >= (SCTP_SIGNATURE_SIZE - 3)) { + new_store = 0; + } + if (!atomic_cmpset_int(&inp->store_at, store_at, new_store)) { + goto retry; + } + if (new_store == 0) { + /* Refill the random store */ + sctp_fill_random_store(inp); + } + p = &inp->random_store[store_at]; + xp = (uint32_t *) p; + x = *xp; + return (x); +} + +uint32_t +sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int save_in_twait) +{ + uint32_t x, not_done; + struct timeval now; + + (void)SCTP_GETTIME_TIMEVAL(&now); + not_done = 1; + while (not_done) { + x = sctp_select_initial_TSN(&inp->sctp_ep); + if (x == 0) { + /* we never use 0 */ + continue; + } + if (sctp_is_vtag_good(inp, x, lport, rport, &now, save_in_twait)) { + not_done = 0; + } + } + return (x); +} + +int +sctp_init_asoc(struct sctp_inpcb *m, struct sctp_tcb *stcb, + uint32_t override_tag, uint32_t vrf_id) +{ + struct sctp_association *asoc; + + /* + * Anything set to zero is taken care of by the allocation routine's + * bzero + */ + + /* + * Up front select what scoping to apply on addresses I tell my peer + * Not sure what to do with these right now, we will need to come up + * with a way to set them. We may need to pass them through from the + * caller in the sctp_aloc_assoc() function. + */ + int i; + + asoc = &stcb->asoc; + /* init all variables to a known value. 
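The store_at cursor in sctp_select_initial_TSN() above is advanced with atomic_cmpset_int() in a retry loop, so concurrent callers can each claim a distinct 4-byte slice of the shared random store without taking a lock; as the comment notes, racing refills only stir in more entropy. A minimal C11 model of that lock-free cursor (illustrative only; the kernel primitive and buffer sizes differ):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	STORE_BYTES 64			/* stands in for SCTP_SIGNATURE_SIZE */

static _Atomic int store_at;
static uint8_t random_store[STORE_BYTES];	/* SCTP refills this from an HMAC */

static uint32_t
draw_u32(void)
{
	int at, next;
	uint32_t x;

	do {
		at = atomic_load(&store_at);
		next = at + (int)sizeof(uint32_t);
		if (next >= STORE_BYTES - 3)
			next = 0;	/* wrap; the kernel refills the store here */
	} while (!atomic_compare_exchange_weak(&store_at, &at, next));
	memcpy(&x, &random_store[at], sizeof(x));	/* alignment-safe read */
	return (x);
}

int
main(void)
{
	printf("%08x %08x\n", draw_u32(), draw_u32());
	return (0);
}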
*/ + SCTP_SET_STATE(&stcb->asoc, SCTP_STATE_INUSE); + asoc->max_burst = m->sctp_ep.max_burst; + asoc->heart_beat_delay = TICKS_TO_MSEC(m->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); + asoc->cookie_life = m->sctp_ep.def_cookie_life; + asoc->sctp_cmt_on_off = m->sctp_cmt_on_off; + asoc->sctp_nr_sack_on_off = (uint8_t) SCTP_BASE_SYSCTL(sctp_nr_sack_on_off); + asoc->sctp_cmt_pf = (uint8_t) SCTP_BASE_SYSCTL(sctp_cmt_pf); + asoc->sctp_frag_point = m->sctp_frag_point; +#ifdef INET + asoc->default_tos = m->ip_inp.inp.inp_ip_tos; +#else + asoc->default_tos = 0; +#endif + +#ifdef INET6 + asoc->default_flowlabel = ((struct in6pcb *)m)->in6p_flowinfo; +#else + asoc->default_flowlabel = 0; +#endif + asoc->sb_send_resv = 0; + if (override_tag) { + asoc->my_vtag = override_tag; + } else { + asoc->my_vtag = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 1); + } + /* Get the nonce tags */ + asoc->my_vtag_nonce = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 0); + asoc->peer_vtag_nonce = sctp_select_a_tag(m, stcb->sctp_ep->sctp_lport, stcb->rport, 0); + asoc->vrf_id = vrf_id; + + if (sctp_is_feature_on(m, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) + asoc->hb_is_disabled = 1; + else + asoc->hb_is_disabled = 0; + +#ifdef SCTP_ASOCLOG_OF_TSNS + asoc->tsn_in_at = 0; + asoc->tsn_out_at = 0; + asoc->tsn_in_wrapped = 0; + asoc->tsn_out_wrapped = 0; + asoc->cumack_log_at = 0; + asoc->cumack_log_atsnt = 0; +#endif +#ifdef SCTP_FS_SPEC_LOG + asoc->fs_index = 0; +#endif + asoc->refcnt = 0; + asoc->assoc_up_sent = 0; + asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number = asoc->sending_seq = + sctp_select_initial_TSN(&m->sctp_ep); + asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; + /* we are optimisitic here */ + asoc->peer_supports_pktdrop = 1; + asoc->peer_supports_nat = 0; + asoc->sent_queue_retran_cnt = 0; + + /* for CMT */ + asoc->last_net_cmt_send_started = NULL; + + /* This will need to be adjusted */ + asoc->last_cwr_tsn = asoc->init_seq_number - 1; + asoc->last_acked_seq = asoc->init_seq_number - 1; + asoc->advanced_peer_ack_point = asoc->last_acked_seq; + asoc->asconf_seq_in = asoc->last_acked_seq; + + /* here we are different, we hold the next one we expect */ + asoc->str_reset_seq_in = asoc->last_acked_seq + 1; + + asoc->initial_init_rto_max = m->sctp_ep.initial_init_rto_max; + asoc->initial_rto = m->sctp_ep.initial_rto; + + asoc->max_init_times = m->sctp_ep.max_init_times; + asoc->max_send_times = m->sctp_ep.max_send_times; + asoc->def_net_failure = m->sctp_ep.def_net_failure; + asoc->free_chunk_cnt = 0; + + asoc->iam_blocking = 0; + /* ECN Nonce initialization */ + asoc->context = m->sctp_context; + asoc->def_send = m->def_send; + asoc->ecn_nonce_allowed = 0; + asoc->receiver_nonce_sum = 1; + asoc->nonce_sum_expect_base = 1; + asoc->nonce_sum_check = 1; + asoc->nonce_resync_tsn = 0; + asoc->nonce_wait_for_ecne = 0; + asoc->nonce_wait_tsn = 0; + asoc->delayed_ack = TICKS_TO_MSEC(m->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); + asoc->sack_freq = m->sctp_ep.sctp_sack_freq; + asoc->pr_sctp_cnt = 0; + asoc->total_output_queue_size = 0; + + if (m->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + struct in6pcb *inp6; + + /* Its a V6 socket */ + inp6 = (struct in6pcb *)m; + asoc->ipv6_addr_legal = 1; + /* Now look at the binding flag to see if V4 will be legal */ + if (SCTP_IPV6_V6ONLY(inp6) == 0) { + asoc->ipv4_addr_legal = 1; + } else { + /* V4 addresses are NOT legal on the association */ + asoc->ipv4_addr_legal = 0; + } + } else { + /* Its a V4 socket, no - V6 */ + 
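sctp_select_a_tag() above is rejection sampling: it keeps drawing 32-bit values, skips zero (never used as a verification tag), and accepts the first draw that passes the vtag/time-wait collision check. A sketch of the loop (illustrative only; is_vtag_good() is a stand-in for sctp_is_vtag_good()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for sctp_is_vtag_good(): the real check consults the
 * time-wait vtag table for collisions on this port pair. */
static int
is_vtag_good(uint32_t tag)
{
	return (tag % 7 != 0);	/* arbitrary predicate for the demo */
}

static uint32_t
select_a_tag(void)
{
	uint32_t x;

	for (;;) {
		x = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
		if (x == 0)
			continue;	/* zero is never used as a vtag */
		if (is_vtag_good(x))
			return (x);
	}
}

int
main(void)
{
	printf("vtag=%08x\n", select_a_tag());
	return (0);
}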
asoc->ipv4_addr_legal = 1; + asoc->ipv6_addr_legal = 0; + } + + asoc->my_rwnd = max(SCTP_SB_LIMIT_RCV(m->sctp_socket), SCTP_MINIMAL_RWND); + asoc->peers_rwnd = SCTP_SB_LIMIT_RCV(m->sctp_socket); + + asoc->smallest_mtu = m->sctp_frag_point; + asoc->minrto = m->sctp_ep.sctp_minrto; + asoc->maxrto = m->sctp_ep.sctp_maxrto; + + asoc->locked_on_sending = NULL; + asoc->stream_locked_on = 0; + asoc->ecn_echo_cnt_onq = 0; + asoc->stream_locked = 0; + + asoc->send_sack = 1; + + LIST_INIT(&asoc->sctp_restricted_addrs); + + TAILQ_INIT(&asoc->nets); + TAILQ_INIT(&asoc->pending_reply_queue); + TAILQ_INIT(&asoc->asconf_ack_sent); + /* Setup to fill the hb random cache at first HB */ + asoc->hb_random_idx = 4; + + asoc->sctp_autoclose_ticks = m->sctp_ep.auto_close_time; + + /* + * JRS - Pick the default congestion control module based on the + * sysctl. + */ + switch (m->sctp_ep.sctp_default_cc_module) { + /* JRS - Standard TCP congestion control */ + case SCTP_CC_RFC2581: + { + stcb->asoc.congestion_control_module = SCTP_CC_RFC2581; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer; + break; + } + /* JRS - High Speed TCP congestion control (Floyd) */ + case SCTP_CC_HSTCP: + { + stcb->asoc.congestion_control_module = SCTP_CC_HSTCP; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_hs_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_hs_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer; + break; + } + /* JRS - HTCP congestion control */ + case SCTP_CC_HTCP: + { + stcb->asoc.congestion_control_module = SCTP_CC_HTCP; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_htcp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_htcp_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_htcp_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_htcp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_htcp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_htcp_cwnd_update_after_fr_timer; + break; + } + /* JRS - 
By default, use RFC2581 */ + default: + { + stcb->asoc.congestion_control_module = SCTP_CC_RFC2581; + stcb->asoc.cc_functions.sctp_set_initial_cc_param = &sctp_set_initial_cc_param; + stcb->asoc.cc_functions.sctp_cwnd_update_after_sack = &sctp_cwnd_update_after_sack; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr = &sctp_cwnd_update_after_fr; + stcb->asoc.cc_functions.sctp_cwnd_update_after_timeout = &sctp_cwnd_update_after_timeout; + stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo = &sctp_cwnd_update_after_ecn_echo; + stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped = &sctp_cwnd_update_after_packet_dropped; + stcb->asoc.cc_functions.sctp_cwnd_update_after_output = &sctp_cwnd_update_after_output; + stcb->asoc.cc_functions.sctp_cwnd_update_after_fr_timer = &sctp_cwnd_update_after_fr_timer; + break; + } + } + + /* + * Now the stream parameters, here we allocate space for all streams + * that we request by default. + */ + asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams = + m->sctp_ep.pre_open_stream_count; + SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *, + asoc->streamoutcnt * sizeof(struct sctp_stream_out), + SCTP_M_STRMO); + if (asoc->strmout == NULL) { + /* big trouble no memory */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); + return (ENOMEM); + } + for (i = 0; i < asoc->streamoutcnt; i++) { + /* + * inbound side must be set to 0xffff, also NOTE when we get + * the INIT-ACK back (for INIT sender) we MUST reduce the + * count (streamoutcnt) but first check if we sent to any of + * the upper streams that were dropped (if some were). Those + * that were dropped must be notified to the upper layer as + * failed to send. + */ + asoc->strmout[i].next_sequence_sent = 0x0; + TAILQ_INIT(&asoc->strmout[i].outqueue); + asoc->strmout[i].stream_no = i; + asoc->strmout[i].last_msg_incomplete = 0; + asoc->strmout[i].next_spoke.tqe_next = 0; + asoc->strmout[i].next_spoke.tqe_prev = 0; + } + /* Now the mapping array */ + asoc->mapping_array_size = SCTP_INITIAL_MAPPING_ARRAY; + SCTP_MALLOC(asoc->mapping_array, uint8_t *, asoc->mapping_array_size, + SCTP_M_MAP); + if (asoc->mapping_array == NULL) { + SCTP_FREE(asoc->strmout, SCTP_M_STRMO); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); + return (ENOMEM); + } + memset(asoc->mapping_array, 0, asoc->mapping_array_size); + SCTP_MALLOC(asoc->nr_mapping_array, uint8_t *, asoc->mapping_array_size, + SCTP_M_MAP); + if (asoc->nr_mapping_array == NULL) { + SCTP_FREE(asoc->strmout, SCTP_M_STRMO); + SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); + return (ENOMEM); + } + memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); + + /* Now the init of the other outqueues */ + TAILQ_INIT(&asoc->free_chunks); + TAILQ_INIT(&asoc->out_wheel); + TAILQ_INIT(&asoc->control_send_queue); + TAILQ_INIT(&asoc->asconf_send_queue); + TAILQ_INIT(&asoc->send_queue); + TAILQ_INIT(&asoc->sent_queue); + TAILQ_INIT(&asoc->reasmqueue); + TAILQ_INIT(&asoc->resetHead); + asoc->max_inbound_streams = m->sctp_ep.max_open_streams_intome; + TAILQ_INIT(&asoc->asconf_queue); + /* authentication fields */ + asoc->authinfo.random = NULL; + asoc->authinfo.active_keyid = 0; + asoc->authinfo.assoc_key = NULL; + asoc->authinfo.assoc_keyid = 0; + asoc->authinfo.recv_key = NULL; + asoc->authinfo.recv_keyid = 0; + LIST_INIT(&asoc->shared_keys); + asoc->marked_retrans = 0; + asoc->timoinit = 0; + asoc->timodata = 0; + asoc->timosack = 0; + 
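The congestion-control switch above is a function-pointer vtable: each SCTP_CC_* module fills the same sctp_cc_functions slots, hooks a module does not specialize keep the RFC 2581 defaults (HS-TCP, for instance, only overrides the SACK and fast-retransmit responses), and callers always dispatch through stcb->asoc.cc_functions without knowing which module is active. A minimal model of the pattern (illustrative only; all names are stand-ins):

#include <stdio.h>

struct cc_functions {
	void (*after_sack)(void);
	void (*after_timeout)(void);
};

static void rfc2581_after_sack(void)    { puts("rfc2581: sack response"); }
static void rfc2581_after_timeout(void) { puts("rfc2581: t3 timeout"); }
static void hstcp_after_sack(void)      { puts("hstcp: sack response"); }

enum { CC_RFC2581, CC_HSTCP };

static struct cc_functions
select_cc(int module)
{
	struct cc_functions cc;

	switch (module) {
	case CC_HSTCP:
		cc.after_sack = hstcp_after_sack;		/* specialized hook */
		cc.after_timeout = rfc2581_after_timeout;	/* inherited default */
		break;
	default:	/* fall back to the standard TCP-friendly rules */
		cc.after_sack = rfc2581_after_sack;
		cc.after_timeout = rfc2581_after_timeout;
		break;
	}
	return (cc);
}

int
main(void)
{
	struct cc_functions cc = select_cc(CC_HSTCP);

	cc.after_sack();	/* dispatches to hstcp_after_sack() */
	cc.after_timeout();	/* falls back to the RFC 2581 handler */
	return (0);
}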
asoc->timoshutdown = 0; + asoc->timoheartbeat = 0; + asoc->timocookie = 0; + asoc->timoshutdownack = 0; + (void)SCTP_GETTIME_TIMEVAL(&asoc->start_time); + asoc->discontinuity_time = asoc->start_time; + /* + * sa_ignore MEMLEAK {memory is put in the assoc mapping array and + * freed later when the association is freed. + */ + return (0); +} + +void +sctp_print_mapping_array(struct sctp_association *asoc) +{ + unsigned int i, limit; + + printf("Mapping array size: %d, baseTSN: %8.8x, cumAck: %8.8x, highestTSN: (%8.8x, %8.8x).\n", + asoc->mapping_array_size, + asoc->mapping_array_base_tsn, + asoc->cumulative_tsn, + asoc->highest_tsn_inside_map, + asoc->highest_tsn_inside_nr_map); + for (limit = asoc->mapping_array_size; limit > 1; limit--) { + if (asoc->mapping_array[limit - 1]) { + break; + } + } + printf("Renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit); + for (i = 0; i < limit; i++) { + printf("%2.2x%c", asoc->mapping_array[i], ((i + 1) % 16) ? ' ' : '\n'); + } + if (limit % 16) + printf("\n"); + for (limit = asoc->mapping_array_size; limit > 1; limit--) { + if (asoc->nr_mapping_array[limit - 1]) { + break; + } + } + printf("Non renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit); + for (i = 0; i < limit; i++) { + printf("%2.2x%c", asoc->nr_mapping_array[i], ((i + 1) % 16) ? ' ' : '\n'); + } + if (limit % 16) + printf("\n"); +} + +int +sctp_expand_mapping_array(struct sctp_association *asoc, uint32_t needed) +{ + /* mapping array needs to grow */ + uint8_t *new_array1, *new_array2; + uint32_t new_size; + + new_size = asoc->mapping_array_size + ((needed + 7) / 8 + SCTP_MAPPING_ARRAY_INCR); + SCTP_MALLOC(new_array1, uint8_t *, new_size, SCTP_M_MAP); + SCTP_MALLOC(new_array2, uint8_t *, new_size, SCTP_M_MAP); + if ((new_array1 == NULL) || (new_array2 == NULL)) { + /* can't get more, forget it */ + SCTP_PRINTF("No memory for expansion of SCTP mapping array %d\n", new_size); + if (new_array1) { + SCTP_FREE(new_array1, SCTP_M_MAP); + } + if (new_array2) { + SCTP_FREE(new_array2, SCTP_M_MAP); + } + return (-1); + } + memset(new_array1, 0, new_size); + memset(new_array2, 0, new_size); + memcpy(new_array1, asoc->mapping_array, asoc->mapping_array_size); + memcpy(new_array2, asoc->nr_mapping_array, asoc->mapping_array_size); + SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); + SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); + asoc->mapping_array = new_array1; + asoc->nr_mapping_array = new_array2; + asoc->mapping_array_size = new_size; + return (0); +} + + +static void +sctp_iterator_work(struct sctp_iterator *it) +{ + int iteration_count = 0; + int inp_skip = 0; + int first_in = 1; + struct sctp_inpcb *tinp; + + SCTP_INP_INFO_RLOCK(); + SCTP_ITERATOR_LOCK(); + if (it->inp) { + SCTP_INP_RLOCK(it->inp); + SCTP_INP_DECR_REF(it->inp); + } + if (it->inp == NULL) { + /* iterator is complete */ +done_with_iterator: + SCTP_ITERATOR_UNLOCK(); + SCTP_INP_INFO_RUNLOCK(); + if (it->function_atend != NULL) { + (*it->function_atend) (it->pointer, it->val); + } + SCTP_FREE(it, SCTP_M_ITER); + return; + } +select_a_new_ep: + if (first_in) { + first_in = 0; + } else { + SCTP_INP_RLOCK(it->inp); + } + while (((it->pcb_flags) && + ((it->inp->sctp_flags & it->pcb_flags) != it->pcb_flags)) || + ((it->pcb_features) && + ((it->inp->sctp_features & it->pcb_features) != it->pcb_features))) { + /* endpoint flags or features don't match, so keep looking */ + if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { + SCTP_INP_RUNLOCK(it->inp); + goto 
done_with_iterator; + } + tinp = it->inp; + it->inp = LIST_NEXT(it->inp, sctp_list); + SCTP_INP_RUNLOCK(tinp); + if (it->inp == NULL) { + goto done_with_iterator; + } + SCTP_INP_RLOCK(it->inp); + } + /* now go through each assoc which is in the desired state */ + if (it->done_current_ep == 0) { + if (it->function_inp != NULL) + inp_skip = (*it->function_inp) (it->inp, it->pointer, it->val); + it->done_current_ep = 1; + } + if (it->stcb == NULL) { + /* run the per instance function */ + it->stcb = LIST_FIRST(&it->inp->sctp_asoc_list); + } + if ((inp_skip) || it->stcb == NULL) { + if (it->function_inp_end != NULL) { + inp_skip = (*it->function_inp_end) (it->inp, + it->pointer, + it->val); + } + SCTP_INP_RUNLOCK(it->inp); + goto no_stcb; + } + while (it->stcb) { + SCTP_TCB_LOCK(it->stcb); + if (it->asoc_state && ((it->stcb->asoc.state & it->asoc_state) != it->asoc_state)) { + /* not in the right state... keep looking */ + SCTP_TCB_UNLOCK(it->stcb); + goto next_assoc; + } + /* see if we have limited out the iterator loop */ + iteration_count++; + if (iteration_count > SCTP_ITERATOR_MAX_AT_ONCE) { + /* Pause to let others grab the lock */ + atomic_add_int(&it->stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(it->stcb); + SCTP_INP_INCR_REF(it->inp); + SCTP_INP_RUNLOCK(it->inp); + SCTP_ITERATOR_UNLOCK(); + SCTP_INP_INFO_RUNLOCK(); + SCTP_INP_INFO_RLOCK(); + SCTP_ITERATOR_LOCK(); + if (sctp_it_ctl.iterator_flags) { + /* We won't be staying here */ + SCTP_INP_DECR_REF(it->inp); + atomic_add_int(&it->stcb->asoc.refcnt, -1); + if (sctp_it_ctl.iterator_flags & + SCTP_ITERATOR_MUST_EXIT) { + goto done_with_iterator; + } + if (sctp_it_ctl.iterator_flags & + SCTP_ITERATOR_STOP_CUR_IT) { + sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_IT; + goto done_with_iterator; + } + if (sctp_it_ctl.iterator_flags & + SCTP_ITERATOR_STOP_CUR_INP) { + sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_INP; + goto no_stcb; + } + /* If we reach here huh? 
*/ + printf("Unknown it ctl flag %x\n", + sctp_it_ctl.iterator_flags); + sctp_it_ctl.iterator_flags = 0; + } + SCTP_INP_RLOCK(it->inp); + SCTP_INP_DECR_REF(it->inp); + SCTP_TCB_LOCK(it->stcb); + atomic_add_int(&it->stcb->asoc.refcnt, -1); + iteration_count = 0; + } + /* run function on this one */ + (*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val); + + /* + * we lie here, it really needs to have its own type but + * first I must verify that this won't effect things :-0 + */ + if (it->no_chunk_output == 0) + sctp_chunk_output(it->inp, it->stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); + + SCTP_TCB_UNLOCK(it->stcb); +next_assoc: + it->stcb = LIST_NEXT(it->stcb, sctp_tcblist); + if (it->stcb == NULL) { + /* Run last function */ + if (it->function_inp_end != NULL) { + inp_skip = (*it->function_inp_end) (it->inp, + it->pointer, + it->val); + } + } + } + SCTP_INP_RUNLOCK(it->inp); +no_stcb: + /* done with all assocs on this endpoint, move on to next endpoint */ + it->done_current_ep = 0; + if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { + it->inp = NULL; + } else { + it->inp = LIST_NEXT(it->inp, sctp_list); + } + if (it->inp == NULL) { + goto done_with_iterator; + } + goto select_a_new_ep; +} + +void +sctp_iterator_worker(void) +{ + struct sctp_iterator *it = NULL; + + /* This function is called with the WQ lock in place */ + + sctp_it_ctl.iterator_running = 1; + sctp_it_ctl.cur_it = it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead); + while (it) { + /* now lets work on this one */ + TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); + SCTP_IPI_ITERATOR_WQ_UNLOCK(); + CURVNET_SET(it->vn); + sctp_iterator_work(it); + + CURVNET_RESTORE(); + SCTP_IPI_ITERATOR_WQ_LOCK(); + if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_MUST_EXIT) { + sctp_it_ctl.cur_it = NULL; + break; + } + /* sa_ignore FREED_MEMORY */ + sctp_it_ctl.cur_it = it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead); + } + sctp_it_ctl.iterator_running = 0; + return; +} + + +static void +sctp_handle_addr_wq(void) +{ + /* deal with the ADDR wq from the rtsock calls */ + struct sctp_laddr *wi; + struct sctp_asconf_iterator *asc; + + SCTP_MALLOC(asc, struct sctp_asconf_iterator *, + sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT); + if (asc == NULL) { + /* Try later, no memory */ + sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, + (struct sctp_inpcb *)NULL, + (struct sctp_tcb *)NULL, + (struct sctp_nets *)NULL); + return; + } + LIST_INIT(&asc->list_of_work); + asc->cnt = 0; + + SCTP_WQ_ADDR_LOCK(); + wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq)); + while (wi != NULL) { + LIST_REMOVE(wi, sctp_nxt_addr); + LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); + asc->cnt++; + wi = LIST_FIRST(&SCTP_BASE_INFO(addr_wq)); + } + SCTP_WQ_ADDR_UNLOCK(); + + if (asc->cnt == 0) { + SCTP_FREE(asc, SCTP_M_ASC_IT); + } else { + (void)sctp_initiate_iterator(sctp_asconf_iterator_ep, + sctp_asconf_iterator_stcb, + NULL, /* No ep end for boundall */ + SCTP_PCB_FLAGS_BOUNDALL, + SCTP_PCB_ANY_FEATURES, + SCTP_ASOC_ANY_STATE, + (void *)asc, 0, + sctp_asconf_iterator_end, NULL, 0); + } +} + +int retcode = 0; +int cur_oerr = 0; + +void +sctp_timeout_handler(void *t) +{ + struct sctp_inpcb *inp; + struct sctp_tcb *stcb; + struct sctp_nets *net; + struct sctp_timer *tmr; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + int did_output, type; + + tmr = (struct sctp_timer *)t; + inp = (struct sctp_inpcb *)tmr->ep; + stcb = (struct sctp_tcb *)tmr->tcb; + net = (struct sctp_nets *)tmr->net; + CURVNET_SET((struct 
vnet *)tmr->vnet); + did_output = 1; + +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xF0, (uint8_t) tmr->type); + sctp_auditing(3, inp, stcb, net); +#endif + + /* sanity checks... */ + if (tmr->self != (void *)tmr) { + /* + * SCTP_PRINTF("Stale SCTP timer fired (%p), ignoring...\n", + * tmr); + */ + CURVNET_RESTORE(); + return; + } + tmr->stopped_from = 0xa001; + if (!SCTP_IS_TIMER_TYPE_VALID(tmr->type)) { + /* + * SCTP_PRINTF("SCTP timer fired with invalid type: 0x%x\n", + * tmr->type); + */ + CURVNET_RESTORE(); + return; + } + tmr->stopped_from = 0xa002; + if ((tmr->type != SCTP_TIMER_TYPE_ADDR_WQ) && (inp == NULL)) { + CURVNET_RESTORE(); + return; + } + /* if this is an iterator timeout, get the struct and clear inp */ + tmr->stopped_from = 0xa003; + type = tmr->type; + if (inp) { + SCTP_INP_INCR_REF(inp); + if ((inp->sctp_socket == 0) && + ((tmr->type != SCTP_TIMER_TYPE_INPKILL) && + (tmr->type != SCTP_TIMER_TYPE_INIT) && + (tmr->type != SCTP_TIMER_TYPE_SEND) && + (tmr->type != SCTP_TIMER_TYPE_RECV) && + (tmr->type != SCTP_TIMER_TYPE_HEARTBEAT) && + (tmr->type != SCTP_TIMER_TYPE_SHUTDOWN) && + (tmr->type != SCTP_TIMER_TYPE_SHUTDOWNACK) && + (tmr->type != SCTP_TIMER_TYPE_SHUTDOWNGUARD) && + (tmr->type != SCTP_TIMER_TYPE_ASOCKILL)) + ) { + SCTP_INP_DECR_REF(inp); + CURVNET_RESTORE(); + return; + } + } + tmr->stopped_from = 0xa004; + if (stcb) { + atomic_add_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state == 0) { + atomic_add_int(&stcb->asoc.refcnt, -1); + if (inp) { + SCTP_INP_DECR_REF(inp); + } + CURVNET_RESTORE(); + return; + } + } + tmr->stopped_from = 0xa005; + SCTPDBG(SCTP_DEBUG_TIMER1, "Timer type %d goes off\n", tmr->type); + if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) { + if (inp) { + SCTP_INP_DECR_REF(inp); + } + if (stcb) { + atomic_add_int(&stcb->asoc.refcnt, -1); + } + CURVNET_RESTORE(); + return; + } + tmr->stopped_from = 0xa006; + + if (stcb) { + SCTP_TCB_LOCK(stcb); + atomic_add_int(&stcb->asoc.refcnt, -1); + if ((tmr->type != SCTP_TIMER_TYPE_ASOCKILL) && + ((stcb->asoc.state == 0) || + (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) { + SCTP_TCB_UNLOCK(stcb); + if (inp) { + SCTP_INP_DECR_REF(inp); + } + CURVNET_RESTORE(); + return; + } + } + /* record in stopped what t-o occured */ + tmr->stopped_from = tmr->type; + + /* mark as being serviced now */ + if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { + /* + * Callout has been rescheduled. + */ + goto get_out; + } + if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) { + /* + * Not active, so no action. 
+ */ + goto get_out; + } + SCTP_OS_TIMER_DEACTIVATE(&tmr->timer); + + /* call the handler for the appropriate timer type */ + switch (tmr->type) { + case SCTP_TIMER_TYPE_ZERO_COPY: + if (inp == NULL) { + break; + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { + SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); + } + break; + case SCTP_TIMER_TYPE_ZCOPY_SENDQ: + if (inp == NULL) { + break; + } + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { + SCTP_ZERO_COPY_SENDQ_EVENT(inp, inp->sctp_socket); + } + break; + case SCTP_TIMER_TYPE_ADDR_WQ: + sctp_handle_addr_wq(); + break; + case SCTP_TIMER_TYPE_SEND: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timodata); + stcb->asoc.timodata++; + stcb->asoc.num_send_timers_up--; + if (stcb->asoc.num_send_timers_up < 0) { + stcb->asoc.num_send_timers_up = 0; + } + SCTP_TCB_LOCK_ASSERT(stcb); + cur_oerr = stcb->asoc.overall_error_count; + retcode = sctp_t3rxt_timer(inp, stcb, net); + if (retcode) { + /* no need to unlock on tcb its gone */ + + goto out_decr; + } + SCTP_TCB_LOCK_ASSERT(stcb); +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); + if ((stcb->asoc.num_send_timers_up == 0) && + (stcb->asoc.sent_queue_cnt > 0) + ) { + struct sctp_tmit_chunk *chk; + + /* + * safeguard. If there on some on the sent queue + * somewhere but no timers running something is + * wrong... so we start a timer on the first chunk + * on the send queue on whatever net it is sent to. + */ + chk = TAILQ_FIRST(&stcb->asoc.sent_queue); + sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, + chk->whoTo); + } + break; + case SCTP_TIMER_TYPE_INIT: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoinit); + stcb->asoc.timoinit++; + if (sctp_t1init_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + /* We do output but not here */ + did_output = 0; + break; + case SCTP_TIMER_TYPE_RECV: + if ((stcb == NULL) || (inp == NULL)) { + break; + } { + SCTP_STAT_INCR(sctps_timosack); + stcb->asoc.timosack++; + sctp_send_sack(stcb); + } +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SACK_TMR, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_SHUTDOWN: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + if (sctp_shutdown_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + SCTP_STAT_INCR(sctps_timoshutdown); + stcb->asoc.timoshutdown++; +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_HEARTBEAT: + { + struct sctp_nets *lnet; + int cnt_of_unconf = 0; + + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoheartbeat); + stcb->asoc.timoheartbeat++; + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) && + (lnet->dest_state & SCTP_ADDR_REACHABLE)) { + cnt_of_unconf++; + } + } + if (cnt_of_unconf == 0) { + if (sctp_heartbeat_timer(inp, stcb, lnet, + cnt_of_unconf)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + } +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, lnet); +#endif + sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, + stcb->sctp_ep, stcb, lnet); + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_HB_TMR, SCTP_SO_NOT_LOCKED); 
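sctp_timeout_handler() above runs a chain of guards before doing any work: the tmr->self back-pointer must still point at the timer itself (a stale callout fire is silently dropped), the type must be valid, references are taken on the inp and stcb before locking, and every early exit stamps tmr->stopped_from with a distinct breadcrumb (0xa001, 0xa002, ...) so a post-mortem shows how far the handler got. A minimal model of the self-pointer guard (illustrative only):

#include <stdio.h>

struct timer_ctx {
	void *self;		/* points at the ctx while armed, cleared on teardown */
	int   stopped_from;	/* breadcrumb recording where handling stopped */
};

static void
timeout_handler(void *arg)
{
	struct timer_ctx *t = arg;

	if (t->self != (void *)t)
		return;		/* stale fire: timer torn down after the callout queued */
	t->stopped_from = 0xa001;	/* mirrors the 0xa001.. breadcrumbs above */
	puts("timer serviced");
}

int
main(void)
{
	struct timer_ctx t = { .self = &t, .stopped_from = 0 };

	timeout_handler(&t);	/* valid: prints "timer serviced" */
	t.self = NULL;		/* simulate teardown racing the callout */
	timeout_handler(&t);	/* stale: silently ignored */
	return (0);
}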
+ } + break; + case SCTP_TIMER_TYPE_COOKIE: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + if (sctp_cookie_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + SCTP_STAT_INCR(sctps_timocookie); + stcb->asoc.timocookie++; +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + /* + * We consider T3 and Cookie timer pretty much the same with + * respect to where from in chunk_output. + */ + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_NEWCOOKIE: + { + struct timeval tv; + int i, secret; + + if (inp == NULL) { + break; + } + SCTP_STAT_INCR(sctps_timosecret); + (void)SCTP_GETTIME_TIMEVAL(&tv); + SCTP_INP_WLOCK(inp); + inp->sctp_ep.time_of_secret_change = tv.tv_sec; + inp->sctp_ep.last_secret_number = + inp->sctp_ep.current_secret_number; + inp->sctp_ep.current_secret_number++; + if (inp->sctp_ep.current_secret_number >= + SCTP_HOW_MANY_SECRETS) { + inp->sctp_ep.current_secret_number = 0; + } + secret = (int)inp->sctp_ep.current_secret_number; + for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { + inp->sctp_ep.secret_key[secret][i] = + sctp_select_initial_TSN(&inp->sctp_ep); + } + SCTP_INP_WUNLOCK(inp); + sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, stcb, net); + } + did_output = 0; + break; + case SCTP_TIMER_TYPE_PATHMTURAISE: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timopathmtu); + sctp_pathmtu_timer(inp, stcb, net); + did_output = 0; + break; + case SCTP_TIMER_TYPE_SHUTDOWNACK: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + if (sctp_shutdownack_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + SCTP_STAT_INCR(sctps_timoshutdownack); + stcb->asoc.timoshutdownack++; +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_ACK_TMR, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_SHUTDOWNGUARD: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoshutdownguard); + sctp_abort_an_association(inp, stcb, + SCTP_SHUTDOWN_GUARD_EXPIRES, NULL, SCTP_SO_NOT_LOCKED); + /* no need to unlock on tcb its gone */ + goto out_decr; + + case SCTP_TIMER_TYPE_STRRESET: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + if (sctp_strreset_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + SCTP_STAT_INCR(sctps_timostrmrst); + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_TMR, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_EARLYFR: + /* Need to do FR of things for net */ + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoearlyfr); + sctp_early_fr_timer(inp, stcb, net); + break; + case SCTP_TIMER_TYPE_ASCONF: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + if (sctp_asconf_timer(inp, stcb, net)) { + /* no need to unlock on tcb its gone */ + goto out_decr; + } + SCTP_STAT_INCR(sctps_timoasconf); +#ifdef SCTP_AUDITING_ENABLED + sctp_auditing(4, inp, stcb, net); +#endif + sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_ASCONF_TMR, SCTP_SO_NOT_LOCKED); + break; + case SCTP_TIMER_TYPE_PRIM_DELETED: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + sctp_delete_prim_timer(inp, stcb, net); + SCTP_STAT_INCR(sctps_timodelprim); + break; + + case SCTP_TIMER_TYPE_AUTOCLOSE: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoautoclose); + sctp_autoclose_timer(inp, stcb, net); + sctp_chunk_output(inp, 
stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED); + did_output = 0; + break; + case SCTP_TIMER_TYPE_ASOCKILL: + if ((stcb == NULL) || (inp == NULL)) { + break; + } + SCTP_STAT_INCR(sctps_timoassockill); + /* Can we free it yet? */ + SCTP_INP_DECR_REF(inp); + sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_1); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_2); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + /* + * free asoc, always unlocks (or destroy's) so prevent + * duplicate unlock or unlock of a free mtx :-0 + */ + stcb = NULL; + goto out_no_decr; + case SCTP_TIMER_TYPE_INPKILL: + SCTP_STAT_INCR(sctps_timoinpkill); + if (inp == NULL) { + break; + } + /* + * special case, take away our increment since WE are the + * killer + */ + SCTP_INP_DECR_REF(inp); + sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_3); + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_FROM_INPKILL_TIMER); + inp = NULL; + goto out_no_decr; + default: + SCTPDBG(SCTP_DEBUG_TIMER1, "sctp_timeout_handler:unknown timer %d\n", + tmr->type); + break; + }; +#ifdef SCTP_AUDITING_ENABLED + sctp_audit_log(0xF1, (uint8_t) tmr->type); + if (inp) + sctp_auditing(5, inp, stcb, net); +#endif + if ((did_output) && stcb) { + /* + * Now we need to clean up the control chunk chain if an + * ECNE is on it. It must be marked as UNSENT again so next + * call will continue to send it until such time that we get + * a CWR, to remove it. It is, however, less likely that we + * will find a ecn echo on the chain though. + */ + sctp_fix_ecn_echo(&stcb->asoc); + } +get_out: + if (stcb) { + SCTP_TCB_UNLOCK(stcb); + } +out_decr: + if (inp) { + SCTP_INP_DECR_REF(inp); + } +out_no_decr: + SCTPDBG(SCTP_DEBUG_TIMER1, "Timer now complete (type %d)\n", + type); + CURVNET_RESTORE(); +} + +void +sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net) +{ + int to_ticks; + struct sctp_timer *tmr; + + if ((t_type != SCTP_TIMER_TYPE_ADDR_WQ) && (inp == NULL)) + return; + + to_ticks = 0; + + tmr = NULL; + if (stcb) { + SCTP_TCB_LOCK_ASSERT(stcb); + } + switch (t_type) { + case SCTP_TIMER_TYPE_ZERO_COPY: + tmr = &inp->sctp_ep.zero_copy_timer; + to_ticks = SCTP_ZERO_COPY_TICK_DELAY; + break; + case SCTP_TIMER_TYPE_ZCOPY_SENDQ: + tmr = &inp->sctp_ep.zero_copy_sendq_timer; + to_ticks = SCTP_ZERO_COPY_SENDQ_TICK_DELAY; + break; + case SCTP_TIMER_TYPE_ADDR_WQ: + /* Only 1 tick away :-) */ + tmr = &SCTP_BASE_INFO(addr_wq_timer); + to_ticks = SCTP_ADDRESS_TICK_DELAY; + break; + case SCTP_TIMER_TYPE_SEND: + /* Here we use the RTO timer */ + { + int rto_val; + + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + if (net->RTO == 0) { + rto_val = stcb->asoc.initial_rto; + } else { + rto_val = net->RTO; + } + to_ticks = MSEC_TO_TICKS(rto_val); + } + break; + case SCTP_TIMER_TYPE_INIT: + /* + * Here we use the INIT timer default usually about 1 + * minute. 
+ */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + break; + case SCTP_TIMER_TYPE_RECV: + /* + * Here we use the Delayed-Ack timer value from the inp + * ususually about 200ms. + */ + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.dack_timer; + to_ticks = MSEC_TO_TICKS(stcb->asoc.delayed_ack); + break; + case SCTP_TIMER_TYPE_SHUTDOWN: + /* Here we use the RTO of the destination. */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_HEARTBEAT: + /* + * the net is used here so that we can add in the RTO. Even + * though we use a different timer. We also add the HB timer + * PLUS a random jitter. + */ + if ((inp == NULL) || (stcb == NULL)) { + return; + } else { + uint32_t rndval; + uint8_t this_random; + int cnt_of_unconf = 0; + struct sctp_nets *lnet; + + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) && + (lnet->dest_state & SCTP_ADDR_REACHABLE)) { + cnt_of_unconf++; + } + } + if (cnt_of_unconf) { + net = lnet = NULL; + (void)sctp_heartbeat_timer(inp, stcb, lnet, cnt_of_unconf); + } + if (stcb->asoc.hb_random_idx > 3) { + rndval = sctp_select_initial_TSN(&inp->sctp_ep); + memcpy(stcb->asoc.hb_random_values, &rndval, + sizeof(stcb->asoc.hb_random_values)); + stcb->asoc.hb_random_idx = 0; + } + this_random = stcb->asoc.hb_random_values[stcb->asoc.hb_random_idx]; + stcb->asoc.hb_random_idx++; + stcb->asoc.hb_ect_randombit = 0; + /* + * this_random will be 0 - 256 ms RTO is in ms. + */ + if ((stcb->asoc.hb_is_disabled) && + (cnt_of_unconf == 0)) { + return; + } + if (net) { + int delay; + + delay = stcb->asoc.heart_beat_delay; + TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { + if ((lnet->dest_state & SCTP_ADDR_UNCONFIRMED) && + ((lnet->dest_state & SCTP_ADDR_OUT_OF_SCOPE) == 0) && + (lnet->dest_state & SCTP_ADDR_REACHABLE)) { + delay = 0; + } + } + if (net->RTO == 0) { + /* Never been checked */ + to_ticks = this_random + stcb->asoc.initial_rto + delay; + } else { + /* set rto_val to the ms */ + to_ticks = delay + net->RTO + this_random; + } + } else { + if (cnt_of_unconf) { + to_ticks = this_random + stcb->asoc.initial_rto; + } else { + to_ticks = stcb->asoc.heart_beat_delay + this_random + stcb->asoc.initial_rto; + } + } + /* + * Now we must convert the to_ticks that are now in + * ms to ticks. + */ + to_ticks = MSEC_TO_TICKS(to_ticks); + tmr = &stcb->asoc.hb_timer; + } + break; + case SCTP_TIMER_TYPE_COOKIE: + /* + * Here we can use the RTO timer from the network since one + * RTT was compelete. If a retran happened then we will be + * using the RTO initial value. + */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_NEWCOOKIE: + /* + * nothing needed but the endpoint here ususually about 60 + * minutes. 
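The HEARTBEAT arm above schedules its timer at heartbeat delay + RTO + a random jitter of 0..255 ms (drawn from a small cache of random bytes that is refilled every fourth use), then converts the millisecond sum to ticks. A minimal model (illustrative only; the tick rate and jitter source are stand-ins):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	MSEC_TO_TICKS(ms) ((ms) / 10)	/* assume a 100 Hz tick for the demo */

static unsigned int
hb_timeout_ticks(unsigned int hb_delay_ms, unsigned int rto_ms)
{
	uint8_t jitter = (uint8_t)(rand() & 0xff);	/* 0..255 ms, as above */

	return (MSEC_TO_TICKS(hb_delay_ms + rto_ms + jitter));
}

int
main(void)
{
	/* e.g. a 30 s HB delay plus a 200 ms RTO plus jitter */
	printf("to_ticks=%u\n", hb_timeout_ticks(30000, 200));
	return (0);
}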
+ */ + if (inp == NULL) { + return; + } + tmr = &inp->sctp_ep.signature_change; + to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE]; + break; + case SCTP_TIMER_TYPE_ASOCKILL: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.strreset_timer; + to_ticks = MSEC_TO_TICKS(SCTP_ASOC_KILL_TIMEOUT); + break; + case SCTP_TIMER_TYPE_INPKILL: + /* + * The inp is setup to die. We re-use the signature_chage + * timer since that has stopped and we are in the GONE + * state. + */ + if (inp == NULL) { + return; + } + tmr = &inp->sctp_ep.signature_change; + to_ticks = MSEC_TO_TICKS(SCTP_INP_KILL_TIMEOUT); + break; + case SCTP_TIMER_TYPE_PATHMTURAISE: + /* + * Here we use the value found in the EP for PMTU ususually + * about 10 minutes. + */ + if ((stcb == NULL) || (inp == NULL)) { + return; + } + if (net == NULL) { + return; + } + to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_PMTU]; + tmr = &net->pmtu_timer; + break; + case SCTP_TIMER_TYPE_SHUTDOWNACK: + /* Here we use the RTO of the destination */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_SHUTDOWNGUARD: + /* + * Here we use the endpoints shutdown guard timer usually + * about 3 minutes. + */ + if ((inp == NULL) || (stcb == NULL)) { + return; + } + to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN]; + tmr = &stcb->asoc.shut_guard_timer; + break; + case SCTP_TIMER_TYPE_STRRESET: + /* + * Here the timer comes from the stcb but its value is from + * the net's RTO. + */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + tmr = &stcb->asoc.strreset_timer; + break; + + case SCTP_TIMER_TYPE_EARLYFR: + { + unsigned int msec; + + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->flight_size > net->cwnd) { + /* no need to start */ + return; + } + SCTP_STAT_INCR(sctps_earlyfrstart); + if (net->lastsa == 0) { + /* Hmm no rtt estimate yet? */ + msec = stcb->asoc.initial_rto >> 2; + } else { + msec = ((net->lastsa >> 2) + net->lastsv) >> 1; + } + if (msec < SCTP_BASE_SYSCTL(sctp_early_fr_msec)) { + msec = SCTP_BASE_SYSCTL(sctp_early_fr_msec); + if (msec < SCTP_MINFR_MSEC_FLOOR) { + msec = SCTP_MINFR_MSEC_FLOOR; + } + } + to_ticks = MSEC_TO_TICKS(msec); + tmr = &net->fr_timer; + } + break; + case SCTP_TIMER_TYPE_ASCONF: + /* + * Here the timer comes from the stcb but its value is from + * the net's RTO. 
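+		 * Contrast this with SCTP_TIMER_TYPE_EARLYFR above: there
+		 * the deadline is ((lastsa >> 2) + lastsv) >> 1 ms which,
+		 * assuming lastsa carries 8*SRTT and lastsv carries
+		 * 4*RTTVAR (see sctp_calculate_rto() below), works out to
+		 *
+		 *   SRTT + 2 * RTTVAR
+		 *
+		 * i.e. deliberately earlier than the full RTO of
+		 * SRTT + 4 * RTTVAR used here, so retransmitted ASCONFs
+		 * back off with the path.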
+ */ + if ((stcb == NULL) || (net == NULL)) { + return; + } + if (net->RTO == 0) { + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + } else { + to_ticks = MSEC_TO_TICKS(net->RTO); + } + tmr = &stcb->asoc.asconf_timer; + break; + case SCTP_TIMER_TYPE_PRIM_DELETED: + if ((stcb == NULL) || (net != NULL)) { + return; + } + to_ticks = MSEC_TO_TICKS(stcb->asoc.initial_rto); + tmr = &stcb->asoc.delete_prim_timer; + break; + case SCTP_TIMER_TYPE_AUTOCLOSE: + if (stcb == NULL) { + return; + } + if (stcb->asoc.sctp_autoclose_ticks == 0) { + /* + * Really an error since stcb is NOT set to + * autoclose + */ + return; + } + to_ticks = stcb->asoc.sctp_autoclose_ticks; + tmr = &stcb->asoc.autoclose_timer; + break; + default: + SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n", + __FUNCTION__, t_type); + return; + break; + }; + if ((to_ticks <= 0) || (tmr == NULL)) { + SCTPDBG(SCTP_DEBUG_TIMER1, "%s: %d:software error to_ticks:%d tmr:%p not set ??\n", + __FUNCTION__, t_type, to_ticks, tmr); + return; + } + if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { + /* + * we do NOT allow you to have it already running. if it is + * we leave the current one up unchanged + */ + return; + } + /* At this point we can proceed */ + if (t_type == SCTP_TIMER_TYPE_SEND) { + stcb->asoc.num_send_timers_up++; + } + tmr->stopped_from = 0; + tmr->type = t_type; + tmr->ep = (void *)inp; + tmr->tcb = (void *)stcb; + tmr->net = (void *)net; + tmr->self = (void *)tmr; + tmr->vnet = (void *)curvnet; + tmr->ticks = sctp_get_tick_count(); + (void)SCTP_OS_TIMER_START(&tmr->timer, to_ticks, sctp_timeout_handler, tmr); + return; +} + +void +sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct sctp_nets *net, uint32_t from) +{ + struct sctp_timer *tmr; + + if ((t_type != SCTP_TIMER_TYPE_ADDR_WQ) && + (inp == NULL)) + return; + + tmr = NULL; + if (stcb) { + SCTP_TCB_LOCK_ASSERT(stcb); + } + switch (t_type) { + case SCTP_TIMER_TYPE_ZERO_COPY: + tmr = &inp->sctp_ep.zero_copy_timer; + break; + case SCTP_TIMER_TYPE_ZCOPY_SENDQ: + tmr = &inp->sctp_ep.zero_copy_sendq_timer; + break; + case SCTP_TIMER_TYPE_ADDR_WQ: + tmr = &SCTP_BASE_INFO(addr_wq_timer); + break; + case SCTP_TIMER_TYPE_EARLYFR: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->fr_timer; + SCTP_STAT_INCR(sctps_earlyfrstop); + break; + case SCTP_TIMER_TYPE_SEND: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_INIT: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_RECV: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.dack_timer; + break; + case SCTP_TIMER_TYPE_SHUTDOWN: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_HEARTBEAT: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.hb_timer; + break; + case SCTP_TIMER_TYPE_COOKIE: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_NEWCOOKIE: + /* nothing needed but the endpoint here */ + tmr = &inp->sctp_ep.signature_change; + /* + * We re-use the newcookie timer for the INP kill timer. We + * must assure that we do not kill it by accident. + */ + break; + case SCTP_TIMER_TYPE_ASOCKILL: + /* + * Stop the asoc kill timer. + */ + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.strreset_timer; + break; + + case SCTP_TIMER_TYPE_INPKILL: + /* + * The inp is setup to die. 
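+		 * Only the inp matters here: the stcb and net arguments
+		 * are unused for this type, so the caller in
+		 * sctp_timeout_handler() stops it as
+		 * sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL,
+		 * NULL, from).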
We re-use the signature_chage + * timer since that has stopped and we are in the GONE + * state. + */ + tmr = &inp->sctp_ep.signature_change; + break; + case SCTP_TIMER_TYPE_PATHMTURAISE: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->pmtu_timer; + break; + case SCTP_TIMER_TYPE_SHUTDOWNACK: + if ((stcb == NULL) || (net == NULL)) { + return; + } + tmr = &net->rxt_timer; + break; + case SCTP_TIMER_TYPE_SHUTDOWNGUARD: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.shut_guard_timer; + break; + case SCTP_TIMER_TYPE_STRRESET: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.strreset_timer; + break; + case SCTP_TIMER_TYPE_ASCONF: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.asconf_timer; + break; + case SCTP_TIMER_TYPE_PRIM_DELETED: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.delete_prim_timer; + break; + case SCTP_TIMER_TYPE_AUTOCLOSE: + if (stcb == NULL) { + return; + } + tmr = &stcb->asoc.autoclose_timer; + break; + default: + SCTPDBG(SCTP_DEBUG_TIMER1, "%s: Unknown timer type %d\n", + __FUNCTION__, t_type); + break; + }; + if (tmr == NULL) { + return; + } + if ((tmr->type != t_type) && tmr->type) { + /* + * Ok we have a timer that is under joint use. Cookie timer + * per chance with the SEND timer. We therefore are NOT + * running the timer that the caller wants stopped. So just + * return. + */ + return; + } + if ((t_type == SCTP_TIMER_TYPE_SEND) && (stcb != NULL)) { + stcb->asoc.num_send_timers_up--; + if (stcb->asoc.num_send_timers_up < 0) { + stcb->asoc.num_send_timers_up = 0; + } + } + tmr->self = NULL; + tmr->stopped_from = from; + (void)SCTP_OS_TIMER_STOP(&tmr->timer); + return; +} + +uint32_t +sctp_calculate_len(struct mbuf *m) +{ + uint32_t tlen = 0; + struct mbuf *at; + + at = m; + while (at) { + tlen += SCTP_BUF_LEN(at); + at = SCTP_BUF_NEXT(at); + } + return (tlen); +} + +void +sctp_mtu_size_reset(struct sctp_inpcb *inp, + struct sctp_association *asoc, uint32_t mtu) +{ + /* + * Reset the P-MTU size on this association, this involves changing + * the asoc MTU, going through ANY chunk+overhead larger than mtu to + * allow the DF flag to be cleared. + */ + struct sctp_tmit_chunk *chk; + unsigned int eff_mtu, ovh; + + asoc->smallest_mtu = mtu; + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { + ovh = SCTP_MIN_OVERHEAD; + } else { + ovh = SCTP_MIN_V4_OVERHEAD; + } + eff_mtu = mtu - ovh; + TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { + if (chk->send_size > eff_mtu) { + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; + } + } + TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { + if (chk->send_size > eff_mtu) { + chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; + } + } +} + + +/* + * given an association and starting time of the current RTT period return + * RTO in number of msecs net should point to the current network + */ +uint32_t +sctp_calculate_rto(struct sctp_tcb *stcb, + struct sctp_association *asoc, + struct sctp_nets *net, + struct timeval *told, + int safe) +{ + /*- + * given an association and the starting time of the current RTT + * period (in value1/value2) return RTO in number of msecs. 
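+	 * This is Van Jacobson's integer algorithm (cf. RFC 4960,
+	 * Section 6.3.1), kept scaled to avoid floating point.
+	 * Assuming SCTP_RTT_SHIFT == 3 and SCTP_RTT_VAR_SHIFT == 2
+	 * (so lastsa carries 8*SRTT and lastsv carries 4*RTTVAR),
+	 * the code below implements, for each new measurement R:
+	 *
+	 *   SRTT   += (R - SRTT) / 8
+	 *   RTTVAR += (|R - SRTT| - RTTVAR) / 4
+	 *   RTO     = SRTT + 4 * RTTVAR
+	 *
+	 * with the RTO then clamped to [minrto, maxrto].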
+ */ + int calc_time = 0; + int o_calctime; + uint32_t new_rto = 0; + int first_measure = 0; + struct timeval now, then, *old; + + /* Copy it out for sparc64 */ + if (safe == sctp_align_unsafe_makecopy) { + old = &then; + memcpy(&then, told, sizeof(struct timeval)); + } else if (safe == sctp_align_safe_nocopy) { + old = told; + } else { + /* error */ + SCTP_PRINTF("Huh, bad rto calc call\n"); + return (0); + } + /************************/ + /* 1. calculate new RTT */ + /************************/ + /* get the current time */ + (void)SCTP_GETTIME_TIMEVAL(&now); + /* compute the RTT value */ + if ((u_long)now.tv_sec > (u_long)old->tv_sec) { + calc_time = ((u_long)now.tv_sec - (u_long)old->tv_sec) * 1000; + if ((u_long)now.tv_usec > (u_long)old->tv_usec) { + calc_time += (((u_long)now.tv_usec - + (u_long)old->tv_usec) / 1000); + } else if ((u_long)now.tv_usec < (u_long)old->tv_usec) { + /* Borrow 1,000ms from current calculation */ + calc_time -= 1000; + /* Add in the slop over */ + calc_time += ((int)now.tv_usec / 1000); + /* Add in the pre-second ms's */ + calc_time += (((int)1000000 - (int)old->tv_usec) / 1000); + } + } else if ((u_long)now.tv_sec == (u_long)old->tv_sec) { + if ((u_long)now.tv_usec > (u_long)old->tv_usec) { + calc_time = ((u_long)now.tv_usec - + (u_long)old->tv_usec) / 1000; + } else if ((u_long)now.tv_usec < (u_long)old->tv_usec) { + /* impossible .. garbage in nothing out */ + goto calc_rto; + } else if ((u_long)now.tv_usec == (u_long)old->tv_usec) { + /* + * We have to have 1 usec :-D this must be the + * loopback. + */ + calc_time = 1; + } else { + /* impossible .. garbage in nothing out */ + goto calc_rto; + } + } else { + /* Clock wrapped? */ + goto calc_rto; + } + /***************************/ + /* 2. update RTTVAR & SRTT */ + /***************************/ + net->rtt = o_calctime = calc_time; + /* this is Van Jacobson's integer version */ + if (net->RTO_measured) { + calc_time -= (net->lastsa >> SCTP_RTT_SHIFT); /* take away 1/8th when + * shift=3 */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) { + rto_logging(net, SCTP_LOG_RTTVAR); + } + net->prev_rtt = o_calctime; + net->lastsa += calc_time; /* add 7/8th into sa when + * shift=3 */ + if (calc_time < 0) { + calc_time = -calc_time; + } + calc_time -= (net->lastsv >> SCTP_RTT_VAR_SHIFT); /* take away 1/4 when + * VAR shift=2 */ + net->lastsv += calc_time; + if (net->lastsv == 0) { + net->lastsv = SCTP_CLOCK_GRANULARITY; + } + } else { + /* First RTO measurment */ + net->RTO_measured = 1; + net->lastsa = calc_time << SCTP_RTT_SHIFT; /* Multiply by 8 when + * shift=3 */ + net->lastsv = calc_time; + if (net->lastsv == 0) { + net->lastsv = SCTP_CLOCK_GRANULARITY; + } + first_measure = 1; + net->prev_rtt = o_calctime; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) { + rto_logging(net, SCTP_LOG_INITIAL_RTT); + } + } +calc_rto: + new_rto = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv; + if ((new_rto > SCTP_SAT_NETWORK_MIN) && + (stcb->asoc.sat_network_lockout == 0)) { + stcb->asoc.sat_network = 1; + } else if ((!first_measure) && stcb->asoc.sat_network) { + stcb->asoc.sat_network = 0; + stcb->asoc.sat_network_lockout = 1; + } + /* bound it, per C6/C7 in Section 5.3.1 */ + if (new_rto < stcb->asoc.minrto) { + new_rto = stcb->asoc.minrto; + } + if (new_rto > stcb->asoc.maxrto) { + new_rto = stcb->asoc.maxrto; + } + /* we are now returning the RTO */ + return (new_rto); +} + +/* + * return a pointer to a contiguous piece of data from the given mbuf chain + * starting at 
'off' for 'len' bytes. If the desired piece spans more than + * one mbuf, a copy is made at 'ptr'. caller must ensure that the buffer size + * is >= 'len' returns NULL if there there isn't 'len' bytes in the chain. + */ +caddr_t +sctp_m_getptr(struct mbuf *m, int off, int len, uint8_t * in_ptr) +{ + uint32_t count; + uint8_t *ptr; + + ptr = in_ptr; + if ((off < 0) || (len <= 0)) + return (NULL); + + /* find the desired start location */ + while ((m != NULL) && (off > 0)) { + if (off < SCTP_BUF_LEN(m)) + break; + off -= SCTP_BUF_LEN(m); + m = SCTP_BUF_NEXT(m); + } + if (m == NULL) + return (NULL); + + /* is the current mbuf large enough (eg. contiguous)? */ + if ((SCTP_BUF_LEN(m) - off) >= len) { + return (mtod(m, caddr_t)+off); + } else { + /* else, it spans more than one mbuf, so save a temp copy... */ + while ((m != NULL) && (len > 0)) { + count = min(SCTP_BUF_LEN(m) - off, len); + bcopy(mtod(m, caddr_t)+off, ptr, count); + len -= count; + ptr += count; + off = 0; + m = SCTP_BUF_NEXT(m); + } + if ((m == NULL) && (len > 0)) + return (NULL); + else + return ((caddr_t)in_ptr); + } +} + + + +struct sctp_paramhdr * +sctp_get_next_param(struct mbuf *m, + int offset, + struct sctp_paramhdr *pull, + int pull_limit) +{ + /* This just provides a typed signature to Peter's Pull routine */ + return ((struct sctp_paramhdr *)sctp_m_getptr(m, offset, pull_limit, + (uint8_t *) pull)); +} + + +int +sctp_add_pad_tombuf(struct mbuf *m, int padlen) +{ + /* + * add padlen bytes of 0 filled padding to the end of the mbuf. If + * padlen is > 3 this routine will fail. + */ + uint8_t *dp; + int i; + + if (padlen > 3) { + SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); + return (ENOBUFS); + } + if (padlen <= M_TRAILINGSPACE(m)) { + /* + * The easy way. We hope the majority of the time we hit + * here :) + */ + dp = (uint8_t *) (mtod(m, caddr_t)+SCTP_BUF_LEN(m)); + SCTP_BUF_LEN(m) += padlen; + } else { + /* Hard way we must grow the mbuf */ + struct mbuf *tmp; + + tmp = sctp_get_mbuf_for_msg(padlen, 0, M_DONTWAIT, 1, MT_DATA); + if (tmp == NULL) { + /* Out of space GAK! we are in big trouble. */ + SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + return (ENOSPC); + } + /* setup and insert in middle */ + SCTP_BUF_LEN(tmp) = padlen; + SCTP_BUF_NEXT(tmp) = NULL; + SCTP_BUF_NEXT(m) = tmp; + dp = mtod(tmp, uint8_t *); + } + /* zero out the pad */ + for (i = 0; i < padlen; i++) { + *dp = 0; + dp++; + } + return (0); +} + +int +sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf) +{ + /* find the last mbuf in chain and pad it */ + struct mbuf *m_at; + + m_at = m; + if (last_mbuf) { + return (sctp_add_pad_tombuf(last_mbuf, padval)); + } else { + while (m_at) { + if (SCTP_BUF_NEXT(m_at) == NULL) { + return (sctp_add_pad_tombuf(m_at, padval)); + } + m_at = SCTP_BUF_NEXT(m_at); + } + } + SCTP_LTRACE_ERR_RET_PKT(m, NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT); + return (EFAULT); +} + +static void +sctp_notify_assoc_change(uint32_t event, struct sctp_tcb *stcb, + uint32_t error, void *data, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_assoc_change *sac; + struct sctp_queued_to_read *control; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + /* + * For TCP model AND UDP connected sockets we will send an error up + * when an ABORT comes in. 
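+	 * The errno mirrors TCP: an abort while still in COOKIE_WAIT
+	 * maps to ECONNREFUSED (the peer never accepted the setup,
+	 * much like a refused connect(2)); anything later maps to
+	 * ECONNRESET. A hypothetical one-to-one style client would
+	 * observe it on its next socket call, e.g.:
+	 *
+	 *   if (recv(sd, buf, sizeof(buf), 0) < 0 &&
+	 *       errno == ECONNRESET) {
+	 *           ... peer aborted the association ...
+	 *   }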
+ */ + if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && + ((event == SCTP_COMM_LOST) || (event == SCTP_CANT_STR_ASSOC))) { + if (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_COOKIE_WAIT) { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNREFUSED); + stcb->sctp_socket->so_error = ECONNREFUSED; + } else { + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); + stcb->sctp_socket->so_error = ECONNRESET; + } + /* Wake ANY sleepers */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } + } +#endif + socantrcvmore(stcb->sctp_socket); + sorwakeup(stcb->sctp_socket); + sowwakeup(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_change), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + + sac = mtod(m_notify, struct sctp_assoc_change *); + sac->sac_type = SCTP_ASSOC_CHANGE; + sac->sac_flags = 0; + sac->sac_length = sizeof(struct sctp_assoc_change); + sac->sac_state = event; + sac->sac_error = error; + /* XXX verify these stream counts */ + sac->sac_outbound_streams = stcb->asoc.streamoutcnt; + sac->sac_inbound_streams = stcb->asoc.streamincnt; + sac->sac_assoc_id = sctp_get_associd(stcb); + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_assoc_change); + SCTP_BUF_NEXT(m_notify) = NULL; + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + control->spec_flags = M_NOTIFICATION; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, + so_locked); + if (event == SCTP_COMM_LOST) { + /* Wake up any sleeper */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(stcb->sctp_ep); + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } + } +#endif + sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } +} + +static void +sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, + struct sockaddr *sa, uint32_t error) +{ + struct mbuf *m_notify; + struct sctp_paddr_change *spc; + struct sctp_queued_to_read *control; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVPADDREVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + return; + SCTP_BUF_LEN(m_notify) = 0; + spc = mtod(m_notify, 
struct sctp_paddr_change *); + spc->spc_type = SCTP_PEER_ADDR_CHANGE; + spc->spc_flags = 0; + spc->spc_length = sizeof(struct sctp_paddr_change); + switch (sa->sa_family) { + case AF_INET: + memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in)); + break; +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in6)); + + sin6 = (struct sockaddr_in6 *)&spc->spc_aaddr; + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { + if (sin6->sin6_scope_id == 0) { + /* recover scope_id for user */ + (void)sa6_recoverscope(sin6); + } else { + /* clear embedded scope_id for user */ + in6_clearscope(&sin6->sin6_addr); + } + } + break; + } +#endif + default: + /* TSNH */ + break; + } + spc->spc_state = state; + spc->spc_error = error; + spc->spc_assoc_id = sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_paddr_change); + SCTP_BUF_NEXT(m_notify) = NULL; + + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->length = SCTP_BUF_LEN(m_notify); + control->spec_flags = M_NOTIFICATION; + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, + SCTP_SO_NOT_LOCKED); +} + + +static void +sctp_notify_send_failed(struct sctp_tcb *stcb, uint32_t error, + struct sctp_tmit_chunk *chk, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_send_failed *ssf; + struct sctp_queued_to_read *control; + int length; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_send_failed), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + length = sizeof(struct sctp_send_failed) + chk->send_size; + length -= sizeof(struct sctp_data_chunk); + SCTP_BUF_LEN(m_notify) = 0; + ssf = mtod(m_notify, struct sctp_send_failed *); + ssf->ssf_type = SCTP_SEND_FAILED; + if (error == SCTP_NOTIFY_DATAGRAM_UNSENT) + ssf->ssf_flags = SCTP_DATA_UNSENT; + else + ssf->ssf_flags = SCTP_DATA_SENT; + ssf->ssf_length = length; + ssf->ssf_error = error; + /* not exactly what the user sent in, but should be close :) */ + bzero(&ssf->ssf_info, sizeof(ssf->ssf_info)); + ssf->ssf_info.sinfo_stream = chk->rec.data.stream_number; + ssf->ssf_info.sinfo_ssn = chk->rec.data.stream_seq; + ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags; + ssf->ssf_info.sinfo_ppid = chk->rec.data.payloadtype; + ssf->ssf_info.sinfo_context = chk->rec.data.context; + ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); + ssf->ssf_assoc_id = sctp_get_associd(stcb); + + if (chk->data) { + /* + * trim off the sctp chunk header(it should be there) + */ + if (chk->send_size >= sizeof(struct sctp_data_chunk)) { + m_adj(chk->data, sizeof(struct sctp_data_chunk)); + sctp_mbuf_crush(chk->data); + chk->send_size -= sizeof(struct sctp_data_chunk); + } + } + SCTP_BUF_NEXT(m_notify) = chk->data; + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed); + /* Steal off the mbuf */ + chk->data = NULL; + /* + * For this case, we check the actual socket buffer, since the assoc + * is going away we don't want to overfill the socket buffer for a + * non-reader + */ + if 
(sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { + sctp_m_freem(m_notify); + return; + } + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, + SCTP_READ_LOCK_NOT_HELD, + so_locked); +} + + +static void +sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, + struct sctp_stream_queue_pending *sp, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_send_failed *ssf; + struct sctp_queued_to_read *control; + int length; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSENDFAILEVNT)) { + /* event not enabled */ + return; + } + length = sizeof(struct sctp_send_failed) + sp->length; + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_send_failed), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + ssf = mtod(m_notify, struct sctp_send_failed *); + ssf->ssf_type = SCTP_SEND_FAILED; + if (error == SCTP_NOTIFY_DATAGRAM_UNSENT) + ssf->ssf_flags = SCTP_DATA_UNSENT; + else + ssf->ssf_flags = SCTP_DATA_SENT; + ssf->ssf_length = length; + ssf->ssf_error = error; + /* not exactly what the user sent in, but should be close :) */ + bzero(&ssf->ssf_info, sizeof(ssf->ssf_info)); + ssf->ssf_info.sinfo_stream = sp->stream; + ssf->ssf_info.sinfo_ssn = sp->strseq; + if (sp->some_taken) { + ssf->ssf_info.sinfo_flags = SCTP_DATA_LAST_FRAG; + } else { + ssf->ssf_info.sinfo_flags = SCTP_DATA_NOT_FRAG; + } + ssf->ssf_info.sinfo_ppid = sp->ppid; + ssf->ssf_info.sinfo_context = sp->context; + ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); + ssf->ssf_assoc_id = sctp_get_associd(stcb); + SCTP_BUF_NEXT(m_notify) = sp->data; + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_send_failed); + + /* Steal off the mbuf */ + sp->data = NULL; + /* + * For this case, we check the actual socket buffer, since the assoc + * is going away we don't want to overfill the socket buffer for a + * non-reader + */ + if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { + sctp_m_freem(m_notify); + return; + } + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); +} + + + +static void +sctp_notify_adaptation_layer(struct sctp_tcb *stcb, + uint32_t error) +{ + struct mbuf *m_notify; + struct sctp_adaptation_event *sai; + struct sctp_queued_to_read *control; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + sai = mtod(m_notify, struct sctp_adaptation_event *); + sai->sai_type = SCTP_ADAPTATION_INDICATION; + sai->sai_flags = 0; + sai->sai_length = sizeof(struct sctp_adaptation_event); + sai->sai_adaptation_ind = stcb->asoc.peers_adaptation; + sai->sai_assoc_id = 
sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_adaptation_event); + SCTP_BUF_NEXT(m_notify) = NULL; + + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->length = SCTP_BUF_LEN(m_notify); + control->spec_flags = M_NOTIFICATION; + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); +} + +/* This always must be called with the read-queue LOCKED in the INP */ +static void +sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, + uint32_t val, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_pdapi_event *pdapi; + struct sctp_queued_to_read *control; + struct sockbuf *sb; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_PDAPIEVNT)) { + /* event not enabled */ + return; + } + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + pdapi = mtod(m_notify, struct sctp_pdapi_event *); + pdapi->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; + pdapi->pdapi_flags = 0; + pdapi->pdapi_length = sizeof(struct sctp_pdapi_event); + pdapi->pdapi_indication = error; + pdapi->pdapi_stream = (val >> 16); + pdapi->pdapi_seq = (val & 0x0000ffff); + pdapi->pdapi_assoc_id = sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_pdapi_event); + SCTP_BUF_NEXT(m_notify) = NULL; + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + control->held_length = 0; + control->length = 0; + sb = &stcb->sctp_socket->so_rcv; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m_notify)); + } + sctp_sballoc(stcb, sb, m_notify); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? 
NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + atomic_add_int(&control->length, SCTP_BUF_LEN(m_notify)); + control->end_added = 1; + if (stcb->asoc.control_pdapi) + TAILQ_INSERT_AFTER(&stcb->sctp_ep->read_queue, stcb->asoc.control_pdapi, control, next); + else { + /* we really should not see this case */ + TAILQ_INSERT_TAIL(&stcb->sctp_ep->read_queue, control, next); + } + if (stcb->sctp_ep && stcb->sctp_socket) { + /* This should always be the case */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } + } +#endif + sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } +} + +static void +sctp_notify_shutdown_event(struct sctp_tcb *stcb) +{ + struct mbuf *m_notify; + struct sctp_shutdown_event *sse; + struct sctp_queued_to_read *control; + + /* + * For TCP model AND UDP connected sockets we will send an error up + * when an SHUTDOWN completes + */ + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* mark socket closed for read/write and wakeup! */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } +#endif + socantsendmore(stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + sse = mtod(m_notify, struct sctp_shutdown_event *); + sse->sse_type = SCTP_SHUTDOWN_EVENT; + sse->sse_flags = 0; + sse->sse_length = sizeof(struct sctp_shutdown_event); + sse->sse_assoc_id = sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_shutdown_event); + SCTP_BUF_NEXT(m_notify) = NULL; + + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); +} + +static void +sctp_notify_sender_dry_event(struct sctp_tcb *stcb, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct mbuf *m_notify; + struct sctp_sender_dry_event *event; + struct sctp_queued_to_read *control; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_DRYEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(sizeof(struct 
sctp_sender_dry_event), 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) { + /* no space left */ + return; + } + SCTP_BUF_LEN(m_notify) = 0; + event = mtod(m_notify, struct sctp_sender_dry_event *); + event->sender_dry_type = SCTP_SENDER_DRY_EVENT; + event->sender_dry_flags = 0; + event->sender_dry_length = sizeof(struct sctp_sender_dry_event); + event->sender_dry_assoc_id = sctp_get_associd(stcb); + + SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_sender_dry_event); + SCTP_BUF_NEXT(m_notify) = NULL; + + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->length = SCTP_BUF_LEN(m_notify); + control->spec_flags = M_NOTIFICATION; + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); +} + + +static void +sctp_notify_stream_reset_add(struct sctp_tcb *stcb, int number_entries, int flag) +{ + struct mbuf *m_notify; + struct sctp_queued_to_read *control; + struct sctp_stream_reset_event *strreset; + int len; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t)); + if (len > M_TRAILINGSPACE(m_notify)) { + /* never enough room */ + sctp_m_freem(m_notify); + return; + } + strreset = mtod(m_notify, struct sctp_stream_reset_event *); + strreset->strreset_type = SCTP_STREAM_RESET_EVENT; + strreset->strreset_flags = SCTP_STRRESET_ADD_STREAM | flag; + strreset->strreset_length = len; + strreset->strreset_assoc_id = sctp_get_associd(stcb); + strreset->strreset_list[0] = number_entries; + + SCTP_BUF_LEN(m_notify) = len; + SCTP_BUF_NEXT(m_notify) = NULL; + if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { + /* no space */ + sctp_m_freem(m_notify); + return; + } + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); +} + + +static void +sctp_notify_stream_reset(struct sctp_tcb *stcb, + int number_entries, uint16_t * list, int flag) +{ + struct mbuf *m_notify; + struct sctp_queued_to_read *control; + struct sctp_stream_reset_event *strreset; + int len; + + if (sctp_is_feature_off(stcb->sctp_ep, SCTP_PCB_FLAGS_STREAM_RESETEVNT)) { + /* event not enabled */ + return; + } + m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_DONTWAIT, 1, MT_DATA); + if (m_notify == NULL) + /* no space left */ + return; + SCTP_BUF_LEN(m_notify) = 0; + len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t)); + if (len > M_TRAILINGSPACE(m_notify)) { + /* never enough room */ + sctp_m_freem(m_notify); + return; + } + strreset = mtod(m_notify, struct sctp_stream_reset_event *); + strreset->strreset_type = SCTP_STREAM_RESET_EVENT; + if (number_entries == 
0) { + strreset->strreset_flags = flag | SCTP_STRRESET_ALL_STREAMS; + } else { + strreset->strreset_flags = flag | SCTP_STRRESET_STREAM_LIST; + } + strreset->strreset_length = len; + strreset->strreset_assoc_id = sctp_get_associd(stcb); + if (number_entries) { + int i; + + for (i = 0; i < number_entries; i++) { + strreset->strreset_list[i] = ntohs(list[i]); + } + } + SCTP_BUF_LEN(m_notify) = len; + SCTP_BUF_NEXT(m_notify) = NULL; + if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { + /* no space */ + sctp_m_freem(m_notify); + return; + } + /* append to socket */ + control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, + 0, 0, 0, 0, 0, 0, + m_notify); + if (control == NULL) { + /* no memory */ + sctp_m_freem(m_notify); + return; + } + control->spec_flags = M_NOTIFICATION; + control->length = SCTP_BUF_LEN(m_notify); + /* not that we need this */ + control->tail_mbuf = m_notify; + sctp_add_to_readq(stcb->sctp_ep, stcb, + control, + &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); +} + + +void +sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, + uint32_t error, void *data, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + if ((stcb == NULL) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { + /* If the socket is gone we are out of here */ + return; + } + if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) { + return; + } + if (stcb && ((stcb->asoc.state & SCTP_STATE_COOKIE_WAIT) || + (stcb->asoc.state & SCTP_STATE_COOKIE_ECHOED))) { + if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || + (notification == SCTP_NOTIFY_INTERFACE_UP) || + (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { + /* Don't report these in front states */ + return; + } + } + switch (notification) { + case SCTP_NOTIFY_ASSOC_UP: + if (stcb->asoc.assoc_up_sent == 0) { + sctp_notify_assoc_change(SCTP_COMM_UP, stcb, error, NULL, so_locked); + stcb->asoc.assoc_up_sent = 1; + } + if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) { + sctp_notify_adaptation_layer(stcb, error); + } + if (stcb->asoc.peer_supports_auth == 0) { + sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, + NULL, so_locked); + } + break; + case SCTP_NOTIFY_ASSOC_DOWN: + sctp_notify_assoc_change(SCTP_SHUTDOWN_COMP, stcb, error, NULL, so_locked); + break; + case SCTP_NOTIFY_INTERFACE_DOWN: + { + struct sctp_nets *net; + + net = (struct sctp_nets *)data; + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_UNREACHABLE, + (struct sockaddr *)&net->ro._l_addr, error); + break; + } + case SCTP_NOTIFY_INTERFACE_UP: + { + struct sctp_nets *net; + + net = (struct sctp_nets *)data; + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_AVAILABLE, + (struct sockaddr *)&net->ro._l_addr, error); + break; + } + case SCTP_NOTIFY_INTERFACE_CONFIRMED: + { + struct sctp_nets *net; + + net = (struct sctp_nets *)data; + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_CONFIRMED, + (struct sockaddr *)&net->ro._l_addr, error); + break; + } + case SCTP_NOTIFY_SPECIAL_SP_FAIL: + sctp_notify_send_failed2(stcb, error, + (struct sctp_stream_queue_pending *)data, so_locked); + break; + case SCTP_NOTIFY_DG_FAIL: + sctp_notify_send_failed(stcb, error, + (struct sctp_tmit_chunk *)data, so_locked); + break; + case SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION: + { + uint32_t val; + + val = *((uint32_t *) data); + + 
sctp_notify_partial_delivery_indication(stcb, error, val, so_locked); + break; + } + case SCTP_NOTIFY_STRDATA_ERR: + break; + case SCTP_NOTIFY_ASSOC_ABORTED: + if ((stcb) && (((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_WAIT) || + ((stcb->asoc.state & SCTP_STATE_MASK) == SCTP_STATE_COOKIE_ECHOED))) { + sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, NULL, so_locked); + } else { + sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, NULL, so_locked); + } + break; + case SCTP_NOTIFY_PEER_OPENED_STREAM: + break; + case SCTP_NOTIFY_STREAM_OPENED_OK: + break; + case SCTP_NOTIFY_ASSOC_RESTART: + sctp_notify_assoc_change(SCTP_RESTART, stcb, error, data, so_locked); + if (stcb->asoc.peer_supports_auth == 0) { + sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, + NULL, so_locked); + } + break; + case SCTP_NOTIFY_HB_RESP: + break; + case SCTP_NOTIFY_STR_RESET_INSTREAM_ADD_OK: + sctp_notify_stream_reset_add(stcb, error, SCTP_STRRESET_INBOUND_STR); + break; + case SCTP_NOTIFY_STR_RESET_ADD_OK: + sctp_notify_stream_reset_add(stcb, error, SCTP_STRRESET_OUTBOUND_STR); + break; + case SCTP_NOTIFY_STR_RESET_ADD_FAIL: + sctp_notify_stream_reset_add(stcb, error, (SCTP_STRRESET_FAILED | SCTP_STRRESET_OUTBOUND_STR)); + break; + + case SCTP_NOTIFY_STR_RESET_SEND: + sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), SCTP_STRRESET_OUTBOUND_STR); + break; + case SCTP_NOTIFY_STR_RESET_RECV: + sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), SCTP_STRRESET_INBOUND_STR); + break; + case SCTP_NOTIFY_STR_RESET_FAILED_OUT: + sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), (SCTP_STRRESET_OUTBOUND_STR | SCTP_STRRESET_FAILED)); + break; + case SCTP_NOTIFY_STR_RESET_FAILED_IN: + sctp_notify_stream_reset(stcb, error, ((uint16_t *) data), (SCTP_STRRESET_INBOUND_STR | SCTP_STRRESET_FAILED)); + break; + case SCTP_NOTIFY_ASCONF_ADD_IP: + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data, + error); + break; + case SCTP_NOTIFY_ASCONF_DELETE_IP: + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data, + error); + break; + case SCTP_NOTIFY_ASCONF_SET_PRIMARY: + sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data, + error); + break; + case SCTP_NOTIFY_ASCONF_SUCCESS: + break; + case SCTP_NOTIFY_ASCONF_FAILED: + break; + case SCTP_NOTIFY_PEER_SHUTDOWN: + sctp_notify_shutdown_event(stcb); + break; + case SCTP_NOTIFY_AUTH_NEW_KEY: + sctp_notify_authentication(stcb, SCTP_AUTH_NEWKEY, error, + (uint16_t) (uintptr_t) data, + so_locked); + break; + case SCTP_NOTIFY_AUTH_FREE_KEY: + sctp_notify_authentication(stcb, SCTP_AUTH_FREE_KEY, error, + (uint16_t) (uintptr_t) data, + so_locked); + break; + case SCTP_NOTIFY_NO_PEER_AUTH: + sctp_notify_authentication(stcb, SCTP_AUTH_NO_AUTH, error, + (uint16_t) (uintptr_t) data, + so_locked); + break; + case SCTP_NOTIFY_SENDER_DRY: + sctp_notify_sender_dry_event(stcb, so_locked); + break; + default: + SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n", + __FUNCTION__, notification, notification); + break; + } /* end switch */ +} + +void +sctp_report_all_outbound(struct sctp_tcb *stcb, int holds_lock, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + struct sctp_association *asoc; + struct sctp_stream_out *outs; + struct sctp_tmit_chunk *chk; + struct sctp_stream_queue_pending *sp; + int i; + + asoc = &stcb->asoc; + + if (stcb == NULL) { + return; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* already being freed */ + return; + } + if 
((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { + return; + } + /* now through all the gunk freeing chunks */ + if (holds_lock == 0) { + SCTP_TCB_SEND_LOCK(stcb); + } + /* sent queue SHOULD be empty */ + if (!TAILQ_EMPTY(&asoc->sent_queue)) { + chk = TAILQ_FIRST(&asoc->sent_queue); + while (chk) { + TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); + asoc->sent_queue_cnt--; + if (chk->data != NULL) { + sctp_free_bufspace(stcb, asoc, chk, 1); + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_SENT, chk, so_locked); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + } + sctp_free_a_chunk(stcb, chk); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->sent_queue); + } + } + /* pending send queue SHOULD be empty */ + if (!TAILQ_EMPTY(&asoc->send_queue)) { + chk = TAILQ_FIRST(&asoc->send_queue); + while (chk) { + TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); + asoc->send_queue_cnt--; + if (chk->data != NULL) { + sctp_free_bufspace(stcb, asoc, chk, 1); + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_UNSENT, chk, so_locked); + if (chk->data) { + sctp_m_freem(chk->data); + chk->data = NULL; + } + } + sctp_free_a_chunk(stcb, chk); + /* sa_ignore FREED_MEMORY */ + chk = TAILQ_FIRST(&asoc->send_queue); + } + } + for (i = 0; i < stcb->asoc.streamoutcnt; i++) { + /* For each stream */ + outs = &stcb->asoc.strmout[i]; + /* clean up any sends there */ + stcb->asoc.locked_on_sending = NULL; + sp = TAILQ_FIRST(&outs->outqueue); + while (sp) { + stcb->asoc.stream_queue_cnt--; + TAILQ_REMOVE(&outs->outqueue, sp, next); + sctp_free_spbufspace(stcb, asoc, sp); + if (sp->data) { + sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, + SCTP_NOTIFY_DATAGRAM_UNSENT, (void *)sp, so_locked); + if (sp->data) { + sctp_m_freem(sp->data); + sp->data = NULL; + } + } + if (sp->net) { + sctp_free_remote_addr(sp->net); + sp->net = NULL; + } + /* Free the chunk */ + sctp_free_a_strmoq(stcb, sp); + /* sa_ignore FREED_MEMORY */ + sp = TAILQ_FIRST(&outs->outqueue); + } + } + + if (holds_lock == 0) { + SCTP_TCB_SEND_UNLOCK(stcb); + } +} + +void +sctp_abort_notification(struct sctp_tcb *stcb, int error, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + + if (stcb == NULL) { + return; + } + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || + (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { + return; + } + /* Tell them we lost the asoc */ + sctp_report_all_outbound(stcb, 1, so_locked); + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || + ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { + stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_WAS_ABORTED; + } + sctp_ulp_notify(SCTP_NOTIFY_ASSOC_ABORTED, stcb, error, NULL, so_locked); +} + +void +sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + struct mbuf *m, int iphlen, struct sctphdr *sh, struct mbuf *op_err, + uint32_t vrf_id, uint16_t port) +{ + uint32_t vtag; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + + vtag = 0; + if (stcb != NULL) { + /* We have a TCB to abort, send notification too */ + vtag = stcb->asoc.peer_vtag; + sctp_abort_notification(stcb, 0, SCTP_SO_NOT_LOCKED); + /* get the assoc vrf id and table id */ + 
vrf_id = stcb->asoc.vrf_id; + stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + } + sctp_send_abort(m, iphlen, sh, vtag, op_err, vrf_id, port); + if (stcb != NULL) { + /* Ok, now lets free it */ +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); +#endif + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_4); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } +} + +#ifdef SCTP_ASOCLOG_OF_TSNS +void +sctp_print_out_track_log(struct sctp_tcb *stcb) +{ +#ifdef NOSIY_PRINTS + int i; + + SCTP_PRINTF("Last ep reason:%x\n", stcb->sctp_ep->last_abort_code); + SCTP_PRINTF("IN bound TSN log-aaa\n"); + if ((stcb->asoc.tsn_in_at == 0) && (stcb->asoc.tsn_in_wrapped == 0)) { + SCTP_PRINTF("None rcvd\n"); + goto none_in; + } + if (stcb->asoc.tsn_in_wrapped) { + for (i = stcb->asoc.tsn_in_at; i < SCTP_TSN_LOG_SIZE; i++) { + SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", + stcb->asoc.in_tsnlog[i].tsn, + stcb->asoc.in_tsnlog[i].strm, + stcb->asoc.in_tsnlog[i].seq, + stcb->asoc.in_tsnlog[i].flgs, + stcb->asoc.in_tsnlog[i].sz); + } + } + if (stcb->asoc.tsn_in_at) { + for (i = 0; i < stcb->asoc.tsn_in_at; i++) { + SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", + stcb->asoc.in_tsnlog[i].tsn, + stcb->asoc.in_tsnlog[i].strm, + stcb->asoc.in_tsnlog[i].seq, + stcb->asoc.in_tsnlog[i].flgs, + stcb->asoc.in_tsnlog[i].sz); + } + } +none_in: + SCTP_PRINTF("OUT bound TSN log-aaa\n"); + if ((stcb->asoc.tsn_out_at == 0) && + (stcb->asoc.tsn_out_wrapped == 0)) { + SCTP_PRINTF("None sent\n"); + } + if (stcb->asoc.tsn_out_wrapped) { + for (i = stcb->asoc.tsn_out_at; i < SCTP_TSN_LOG_SIZE; i++) { + SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", + stcb->asoc.out_tsnlog[i].tsn, + stcb->asoc.out_tsnlog[i].strm, + stcb->asoc.out_tsnlog[i].seq, + stcb->asoc.out_tsnlog[i].flgs, + stcb->asoc.out_tsnlog[i].sz); + } + } + if (stcb->asoc.tsn_out_at) { + for (i = 0; i < stcb->asoc.tsn_out_at; i++) { + SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", + stcb->asoc.out_tsnlog[i].tsn, + stcb->asoc.out_tsnlog[i].strm, + stcb->asoc.out_tsnlog[i].seq, + stcb->asoc.out_tsnlog[i].flgs, + stcb->asoc.out_tsnlog[i].sz); + } + } +#endif +} + +#endif + +void +sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, + int error, struct mbuf *op_err, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + uint32_t vtag; + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + +#endif + +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + so = SCTP_INP_SO(inp); +#endif + if (stcb == NULL) { + /* Got to have a TCB */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) { + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_DIRECTLY_NOCMPSET); + } + } + return; + } else { + stcb->asoc.state |= SCTP_STATE_WAS_ABORTED; + } + vtag = stcb->asoc.peer_vtag; + /* notify the ulp */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) + sctp_abort_notification(stcb, error, so_locked); + /* notify the 
peer */ +#if defined(SCTP_PANIC_ON_ABORT) + panic("aborting an association"); +#endif + sctp_send_abort_tcb(stcb, op_err, so_locked); + SCTP_STAT_INCR_COUNTER32(sctps_aborted); + if ((SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_OPEN) || + (SCTP_GET_STATE(&stcb->asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { + SCTP_STAT_DECR_GAUGE32(sctps_currestab); + } + /* now free the asoc */ +#ifdef SCTP_ASOCLOG_OF_TSNS + sctp_print_out_track_log(stcb); +#endif +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + } +#endif + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_5); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif +} + +void +sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, struct sctphdr *sh, + struct sctp_inpcb *inp, struct mbuf *op_err, uint32_t vrf_id, uint16_t port) +{ + struct sctp_chunkhdr *ch, chunk_buf; + unsigned int chk_length; + + SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue); + /* Generate a TO address for future reference */ + if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { + if (LIST_FIRST(&inp->sctp_asoc_list) == NULL) { + sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, + SCTP_CALLED_DIRECTLY_NOCMPSET); + } + } + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, + sizeof(*ch), (uint8_t *) & chunk_buf); + while (ch != NULL) { + chk_length = ntohs(ch->chunk_length); + if (chk_length < sizeof(*ch)) { + /* break to abort land */ + break; + } + switch (ch->chunk_type) { + case SCTP_COOKIE_ECHO: + /* We hit here only if the assoc is being freed */ + return; + case SCTP_PACKET_DROPPED: + /* we don't respond to pkt-dropped */ + return; + case SCTP_ABORT_ASSOCIATION: + /* we don't respond with an ABORT to an ABORT */ + return; + case SCTP_SHUTDOWN_COMPLETE: + /* + * we ignore it since we are not waiting for it and + * peer is gone + */ + return; + case SCTP_SHUTDOWN_ACK: + sctp_send_shutdown_complete2(m, iphlen, sh, vrf_id, port); + return; + default: + break; + } + offset += SCTP_SIZE32(chk_length); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, + sizeof(*ch), (uint8_t *) & chunk_buf); + } + sctp_send_abort(m, iphlen, sh, 0, op_err, vrf_id, port); +} + +/* + * check the inbound datagram to make sure there is not an abort inside it, + * if there is return 1, else return 0. + */ +int +sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t * vtagfill) +{ + struct sctp_chunkhdr *ch; + struct sctp_init_chunk *init_chk, chunk_buf; + int offset; + unsigned int chk_length; + + offset = iphlen + sizeof(struct sctphdr); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), + (uint8_t *) & chunk_buf); + while (ch != NULL) { + chk_length = ntohs(ch->chunk_length); + if (chk_length < sizeof(*ch)) { + /* packet is probably corrupt */ + break; + } + /* we seem to be ok, is it an abort? 
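+		 * (Each chunk is a TLV; chunk_length is the unpadded
+		 * length, so the walk below advances by
+		 * SCTP_SIZE32(chk_length), the length rounded up to a
+		 * 4-byte boundary as RFC 4960 requires, and
+		 * sctp_m_getptr() supplies a contiguous view of each
+		 * header even when it straddles mbufs.)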
*/ + if (ch->chunk_type == SCTP_ABORT_ASSOCIATION) { + /* yep, tell them */ + return (1); + } + if (ch->chunk_type == SCTP_INITIATION) { + /* need to update the Vtag */ + init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m, + offset, sizeof(*init_chk), (uint8_t *) & chunk_buf); + if (init_chk != NULL) { + *vtagfill = ntohl(init_chk->init.initiate_tag); + } + } + /* Nope, move to the next chunk */ + offset += SCTP_SIZE32(chk_length); + ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, + sizeof(*ch), (uint8_t *) & chunk_buf); + } + return (0); +} + +/* + * currently (2/02), ifa_addr embeds scope_id's and don't have sin6_scope_id + * set (i.e. it's 0) so, create this function to compare link local scopes + */ +#ifdef INET6 +uint32_t +sctp_is_same_scope(struct sockaddr_in6 *addr1, struct sockaddr_in6 *addr2) +{ + struct sockaddr_in6 a, b; + + /* save copies */ + a = *addr1; + b = *addr2; + + if (a.sin6_scope_id == 0) + if (sa6_recoverscope(&a)) { + /* can't get scope, so can't match */ + return (0); + } + if (b.sin6_scope_id == 0) + if (sa6_recoverscope(&b)) { + /* can't get scope, so can't match */ + return (0); + } + if (a.sin6_scope_id != b.sin6_scope_id) + return (0); + + return (1); +} + +/* + * returns a sockaddr_in6 with embedded scope recovered and removed + */ +struct sockaddr_in6 * +sctp_recover_scope(struct sockaddr_in6 *addr, struct sockaddr_in6 *store) +{ + /* check and strip embedded scope junk */ + if (addr->sin6_family == AF_INET6) { + if (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr)) { + if (addr->sin6_scope_id == 0) { + *store = *addr; + if (!sa6_recoverscope(store)) { + /* use the recovered scope */ + addr = store; + } + } else { + /* else, return the original "to" addr */ + in6_clearscope(&addr->sin6_addr); + } + } + } + return (addr); +} + +#endif + +/* + * are the two addresses the same? currently a "scopeless" check returns: 1 + * if same, 0 if not + */ +int +sctp_cmpaddr(struct sockaddr *sa1, struct sockaddr *sa2) +{ + + /* must be valid */ + if (sa1 == NULL || sa2 == NULL) + return (0); + + /* must be the same family */ + if (sa1->sa_family != sa2->sa_family) + return (0); + + switch (sa1->sa_family) { +#ifdef INET6 + case AF_INET6: + { + /* IPv6 addresses */ + struct sockaddr_in6 *sin6_1, *sin6_2; + + sin6_1 = (struct sockaddr_in6 *)sa1; + sin6_2 = (struct sockaddr_in6 *)sa2; + return (SCTP6_ARE_ADDR_EQUAL(sin6_1, + sin6_2)); + } +#endif + case AF_INET: + { + /* IPv4 addresses */ + struct sockaddr_in *sin_1, *sin_2; + + sin_1 = (struct sockaddr_in *)sa1; + sin_2 = (struct sockaddr_in *)sa2; + return (sin_1->sin_addr.s_addr == sin_2->sin_addr.s_addr); + } + default: + /* we don't do these... 
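+		 * so any other family simply compares unequal. Note the
+		 * comparison is address-only (ports are ignored): for two
+		 * struct sockaddr_in pointers a and b,
+		 * sctp_cmpaddr((struct sockaddr *)a, (struct sockaddr *)b)
+		 * returns 1 exactly when
+		 * a->sin_addr.s_addr == b->sin_addr.s_addr.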
*/ + return (0); + } +} + +void +sctp_print_address(struct sockaddr *sa) +{ +#ifdef INET6 + char ip6buf[INET6_ADDRSTRLEN]; + + ip6buf[0] = 0; +#endif + + switch (sa->sa_family) { +#ifdef INET6 + case AF_INET6: + { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)sa; + SCTP_PRINTF("IPv6 address: %s:port:%d scope:%u\n", + ip6_sprintf(ip6buf, &sin6->sin6_addr), + ntohs(sin6->sin6_port), + sin6->sin6_scope_id); + break; + } +#endif + case AF_INET: + { + struct sockaddr_in *sin; + unsigned char *p; + + sin = (struct sockaddr_in *)sa; + p = (unsigned char *)&sin->sin_addr; + SCTP_PRINTF("IPv4 address: %u.%u.%u.%u:%d\n", + p[0], p[1], p[2], p[3], ntohs(sin->sin_port)); + break; + } + default: + SCTP_PRINTF("?\n"); + break; + } +} + +void +sctp_print_address_pkt(struct ip *iph, struct sctphdr *sh) +{ + switch (iph->ip_v) { + case IPVERSION: + { + struct sockaddr_in lsa, fsa; + + bzero(&lsa, sizeof(lsa)); + lsa.sin_len = sizeof(lsa); + lsa.sin_family = AF_INET; + lsa.sin_addr = iph->ip_src; + lsa.sin_port = sh->src_port; + bzero(&fsa, sizeof(fsa)); + fsa.sin_len = sizeof(fsa); + fsa.sin_family = AF_INET; + fsa.sin_addr = iph->ip_dst; + fsa.sin_port = sh->dest_port; + SCTP_PRINTF("src: "); + sctp_print_address((struct sockaddr *)&lsa); + SCTP_PRINTF("dest: "); + sctp_print_address((struct sockaddr *)&fsa); + break; + } +#ifdef INET6 + case IPV6_VERSION >> 4: + { + struct ip6_hdr *ip6; + struct sockaddr_in6 lsa6, fsa6; + + ip6 = (struct ip6_hdr *)iph; + bzero(&lsa6, sizeof(lsa6)); + lsa6.sin6_len = sizeof(lsa6); + lsa6.sin6_family = AF_INET6; + lsa6.sin6_addr = ip6->ip6_src; + lsa6.sin6_port = sh->src_port; + bzero(&fsa6, sizeof(fsa6)); + fsa6.sin6_len = sizeof(fsa6); + fsa6.sin6_family = AF_INET6; + fsa6.sin6_addr = ip6->ip6_dst; + fsa6.sin6_port = sh->dest_port; + SCTP_PRINTF("src: "); + sctp_print_address((struct sockaddr *)&lsa6); + SCTP_PRINTF("dest: "); + sctp_print_address((struct sockaddr *)&fsa6); + break; + } +#endif + default: + /* TSNH */ + break; + } +} + +void +sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, + struct sctp_inpcb *new_inp, + struct sctp_tcb *stcb, + int waitflags) +{ + /* + * go through our old INP and pull off any control structures that + * belong to stcb and move then to the new inp. + */ + struct socket *old_so, *new_so; + struct sctp_queued_to_read *control, *nctl; + struct sctp_readhead tmp_queue; + struct mbuf *m; + int error = 0; + + old_so = old_inp->sctp_socket; + new_so = new_inp->sctp_socket; + TAILQ_INIT(&tmp_queue); + error = sblock(&old_so->so_rcv, waitflags); + if (error) { + /* + * Gak, can't get sblock, we have a problem. data will be + * left stranded.. and we don't dare look at it since the + * other thread may be reading something. Oh well, its a + * screwed up app that does a peeloff OR a accept while + * reading from the main socket... actually its only the + * peeloff() case, since I think read will fail on a + * listening socket.. + */ + return; + } + /* lock the socket buffers */ + SCTP_INP_READ_LOCK(old_inp); + control = TAILQ_FIRST(&old_inp->read_queue); + /* Pull off all for out target stcb */ + while (control) { + nctl = TAILQ_NEXT(control, next); + if (control->stcb == stcb) { + /* remove it we want it */ + TAILQ_REMOVE(&old_inp->read_queue, control, next); + TAILQ_INSERT_TAIL(&tmp_queue, control, next); + m = control->data; + while (m) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? 
NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); + } + sctp_sbfree(control, stcb, &old_so->so_rcv, m); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + m = SCTP_BUF_NEXT(m); + } + } + control = nctl; + } + SCTP_INP_READ_UNLOCK(old_inp); + /* Remove the sb-lock on the old socket */ + + sbunlock(&old_so->so_rcv); + /* Now we move them over to the new socket buffer */ + control = TAILQ_FIRST(&tmp_queue); + SCTP_INP_READ_LOCK(new_inp); + while (control) { + nctl = TAILQ_NEXT(control, next); + TAILQ_INSERT_TAIL(&new_inp->read_queue, control, next); + m = control->data; + while (m) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m)); + } + sctp_sballoc(stcb, &new_so->so_rcv, m); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + m = SCTP_BUF_NEXT(m); + } + control = nctl; + } + SCTP_INP_READ_UNLOCK(new_inp); +} + +void +sctp_add_to_readq(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_queued_to_read *control, + struct sockbuf *sb, + int end, + int inp_read_lock_held, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + /* + * Here we must place the control on the end of the socket read + * queue AND increment sb_cc so that select will work properly on + * read. + */ + struct mbuf *m, *prev = NULL; + + if (inp == NULL) { + /* Gak, TSNH!! */ +#ifdef INVARIANTS + panic("Gak, inp NULL on add_to_readq"); +#endif + return; + } + if (inp_read_lock_held == 0) + SCTP_INP_READ_LOCK(inp); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { + sctp_free_remote_addr(control->whoFrom); + if (control->data) { + sctp_m_freem(control->data); + control->data = NULL; + } + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control); + if (inp_read_lock_held == 0) + SCTP_INP_READ_UNLOCK(inp); + return; + } + if (!(control->spec_flags & M_NOTIFICATION)) { + atomic_add_int(&inp->total_recvs, 1); + if (!control->do_not_ref_stcb) { + atomic_add_int(&stcb->total_recvs, 1); + } + } + m = control->data; + control->held_length = 0; + control->length = 0; + while (m) { + if (SCTP_BUF_LEN(m) == 0) { + /* Skip mbufs with NO length */ + if (prev == NULL) { + /* First one */ + control->data = sctp_m_free(m); + m = control->data; + } else { + SCTP_BUF_NEXT(prev) = sctp_m_free(m); + m = SCTP_BUF_NEXT(prev); + } + if (m == NULL) { + control->tail_mbuf = prev; + } + continue; + } + prev = m; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m)); + } + sctp_sballoc(stcb, sb, m); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + atomic_add_int(&control->length, SCTP_BUF_LEN(m)); + m = SCTP_BUF_NEXT(m); + } + if (prev != NULL) { + control->tail_mbuf = prev; + } else { + /* Everything got collapsed out?? 
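+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the loop above prunes zero-length mbufs out of the
+ * chain while summing the remaining lengths into control->length.  The
+ * same prune-while-summing pattern on a plain singly linked list; the
+ * node type and node_free() are hypothetical:
+ */
+#if 0
+struct node { struct node *next; int len; };
+extern void node_free(struct node *);
+
+static int
+prune_and_sum(struct node **headp)
+{
+	struct node *n, **pp = headp;
+	int total = 0;
+
+	while ((n = *pp) != NULL) {
+		if (n->len == 0) {
+			*pp = n->next;		/* unlink the empty node */
+			node_free(n);
+		} else {
+			total += n->len;	/* keep it and account it */
+			pp = &n->next;
+		}
+	}
+	return (total);
+}
+#endif
+/*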
*/ + sctp_free_remote_addr(control->whoFrom); + SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), control); + if (inp_read_lock_held == 0) + SCTP_INP_READ_UNLOCK(inp); + return; + } + if (end) { + control->end_added = 1; + } + TAILQ_INSERT_TAIL(&inp->read_queue, control, next); + if (inp_read_lock_held == 0) + SCTP_INP_READ_UNLOCK(inp); + if (inp && inp->sctp_socket) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { + SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); + } else { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(inp); + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + SCTP_SOCKET_UNLOCK(so, 1); + return; + } + } +#endif + sctp_sorwakeup(inp, inp->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } + } +} + + +int +sctp_append_to_readq(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_queued_to_read *control, + struct mbuf *m, + int end, + int ctls_cumack, + struct sockbuf *sb) +{ + /* + * A partial delivery API event is underway. OR we are appending on + * the reassembly queue. + * + * If PDAPI this means we need to add m to the end of the data. + * Increase the length in the control AND increment the sb_cc. + * Otherwise sb is NULL and all we need to do is put it at the end + * of the mbuf chain. + */ + int len = 0; + struct mbuf *mm, *tail = NULL, *prev = NULL; + + if (inp) { + SCTP_INP_READ_LOCK(inp); + } + if (control == NULL) { +get_out: + if (inp) { + SCTP_INP_READ_UNLOCK(inp); + } + return (-1); + } + if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ)) { + SCTP_INP_READ_UNLOCK(inp); + return 0; + } + if (control->end_added) { + /* huh this one is complete? */ + goto get_out; + } + mm = m; + if (mm == NULL) { + goto get_out; + } + while (mm) { + if (SCTP_BUF_LEN(mm) == 0) { + /* Skip mbufs with NO lenght */ + if (prev == NULL) { + /* First one */ + m = sctp_m_free(mm); + mm = m; + } else { + SCTP_BUF_NEXT(prev) = sctp_m_free(mm); + mm = SCTP_BUF_NEXT(prev); + } + continue; + } + prev = mm; + len += SCTP_BUF_LEN(mm); + if (sb) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(mm)); + } + sctp_sballoc(stcb, sb, mm); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + } + mm = SCTP_BUF_NEXT(mm); + } + if (prev) { + tail = prev; + } else { + /* Really there should always be a prev */ + if (m == NULL) { + /* Huh nothing left? 
*/ +#ifdef INVARIANTS + panic("Nothing left to add?"); +#else + goto get_out; +#endif + } + tail = m; + } + if (control->tail_mbuf) { + /* append */ + SCTP_BUF_NEXT(control->tail_mbuf) = m; + control->tail_mbuf = tail; + } else { + /* nothing there */ +#ifdef INVARIANTS + if (control->data != NULL) { + panic("This should NOT happen"); + } +#endif + control->data = m; + control->tail_mbuf = tail; + } + atomic_add_int(&control->length, len); + if (end) { + /* message is complete */ + if (stcb && (control == stcb->asoc.control_pdapi)) { + stcb->asoc.control_pdapi = NULL; + } + control->held_length = 0; + control->end_added = 1; + } + if (stcb == NULL) { + control->do_not_ref_stcb = 1; + } + /* + * When we are appending in partial delivery, the cum-ack is used + * for the actual pd-api highest tsn on this mbuf. The true cum-ack + * is populated in the outbound sinfo structure from the true cumack + * if the association exists... + */ + control->sinfo_tsn = control->sinfo_cumtsn = ctls_cumack; + if (inp) { + SCTP_INP_READ_UNLOCK(inp); + } + if (inp && inp->sctp_socket) { + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ZERO_COPY_ACTIVE)) { + SCTP_ZERO_COPY_EVENT(inp, inp->sctp_socket); + } else { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(inp); + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + SCTP_SOCKET_UNLOCK(so, 1); + return (0); + } +#endif + sctp_sorwakeup(inp, inp->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + SCTP_SOCKET_UNLOCK(so, 1); +#endif + } + } + return (0); +} + + + +/*************HOLD THIS COMMENT FOR PATCH FILE OF + *************ALTERNATE ROUTING CODE + */ + +/*************HOLD THIS COMMENT FOR END OF PATCH FILE OF + *************ALTERNATE ROUTING CODE + */ + +struct mbuf * +sctp_generate_invmanparam(int err) +{ + /* Return a MBUF with a invalid mandatory parameter */ + struct mbuf *m; + + m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_DONTWAIT, 1, MT_DATA); + if (m) { + struct sctp_paramhdr *ph; + + SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr); + ph = mtod(m, struct sctp_paramhdr *); + ph->param_length = htons(sizeof(struct sctp_paramhdr)); + ph->param_type = htons(err); + } + return (m); +} + +#ifdef SCTP_MBCNT_LOGGING +void +sctp_free_bufspace(struct sctp_tcb *stcb, struct sctp_association *asoc, + struct sctp_tmit_chunk *tp1, int chk_cnt) +{ + if (tp1->data == NULL) { + return; + } + asoc->chunks_on_out_queue -= chk_cnt; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBCNT_LOGGING_ENABLE) { + sctp_log_mbcnt(SCTP_LOG_MBCNT_DECREASE, + asoc->total_output_queue_size, + tp1->book_size, + 0, + tp1->mbcnt); + } + if (asoc->total_output_queue_size >= tp1->book_size) { + atomic_add_int(&asoc->total_output_queue_size, -tp1->book_size); + } else { + asoc->total_output_queue_size = 0; + } + + if (stcb->sctp_socket && (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) || + ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)))) { + if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { + stcb->sctp_socket->so_snd.sb_cc -= tp1->book_size; + } else { + stcb->sctp_socket->so_snd.sb_cc = 0; + + } + } +} + +#endif + +int +sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1, + int reason, int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +) +{ + 
struct sctp_stream_out *strq; + struct sctp_tmit_chunk *chk = NULL; + struct sctp_stream_queue_pending *sp; + uint16_t stream = 0, seq = 0; + uint8_t foundeom = 0; + int ret_sz = 0; + int notdone; + int do_wakeup_routine = 0; + + stream = tp1->rec.data.stream_number; + seq = tp1->rec.data.stream_seq; + do { + ret_sz += tp1->book_size; + if (tp1->data != NULL) { + if (tp1->sent < SCTP_DATAGRAM_RESEND) { + sctp_flight_size_decrease(tp1); + sctp_total_flight_decrease(stcb, tp1); + } + sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); + stcb->asoc.peers_rwnd += tp1->send_size; + stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh); + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, reason, tp1, so_locked); + if (tp1->data) { + sctp_m_freem(tp1->data); + tp1->data = NULL; + } + do_wakeup_routine = 1; + if (PR_SCTP_BUF_ENABLED(tp1->flags)) { + stcb->asoc.sent_queue_cnt_removeable--; + } + } + tp1->sent = SCTP_FORWARD_TSN_SKIP; + if ((tp1->rec.data.rcv_flags & SCTP_DATA_NOT_FRAG) == + SCTP_DATA_NOT_FRAG) { + /* not frag'ed we ae done */ + notdone = 0; + foundeom = 1; + } else if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { + /* end of frag, we are done */ + notdone = 0; + foundeom = 1; + } else { + /* + * Its a begin or middle piece, we must mark all of + * it + */ + notdone = 1; + tp1 = TAILQ_NEXT(tp1, sctp_next); + } + } while (tp1 && notdone); + if (foundeom == 0) { + /* + * The multi-part message was scattered across the send and + * sent queue. + */ +next_on_sent: + tp1 = TAILQ_FIRST(&stcb->asoc.send_queue); + /* + * recurse throught the send_queue too, starting at the + * beginning. + */ + if ((tp1) && + (tp1->rec.data.stream_number == stream) && + (tp1->rec.data.stream_seq == seq)) { + /* + * save to chk in case we have some on stream out + * queue. If so and we have an un-transmitted one we + * don't have to fudge the TSN. + */ + chk = tp1; + ret_sz += tp1->book_size; + sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); + sctp_ulp_notify(SCTP_NOTIFY_DG_FAIL, stcb, reason, tp1, so_locked); + if (tp1->data) { + sctp_m_freem(tp1->data); + tp1->data = NULL; + } + /* No flight involved here book the size to 0 */ + tp1->book_size = 0; + if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { + foundeom = 1; + } + do_wakeup_routine = 1; + tp1->sent = SCTP_FORWARD_TSN_SKIP; + TAILQ_REMOVE(&stcb->asoc.send_queue, tp1, sctp_next); + /* + * on to the sent queue so we can wait for it to be + * passed by. + */ + TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, tp1, + sctp_next); + stcb->asoc.send_queue_cnt--; + stcb->asoc.sent_queue_cnt++; + goto next_on_sent; + } + } + if (foundeom == 0) { + /* + * Still no eom found. That means there is stuff left on the + * stream out queue.. yuck. + */ + strq = &stcb->asoc.strmout[stream]; + SCTP_TCB_SEND_LOCK(stcb); + sp = TAILQ_FIRST(&strq->outqueue); + while (sp->strseq <= seq) { + /* Check if its our SEQ */ + if (sp->strseq == seq) { + sp->discard_rest = 1; + /* + * We may need to put a chunk on the queue + * that holds the TSN that would have been + * sent with the LAST bit. + */ + if (chk == NULL) { + /* Yep, we have to */ + sctp_alloc_a_chunk(stcb, chk); + if (chk == NULL) { + /* + * we are hosed. All we can + * do is nothing.. which + * will cause an abort if + * the peer is paying + * attention. 
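+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the surrounding code marks every fragment of a
+ * partially transmitted message SCTP_FORWARD_TSN_SKIP until it sees the
+ * LAST-fragment flag.  The scan reduced to its essence, with a
+ * hypothetical fragment type and flag:
+ */
+#if 0
+#define FRAG_LAST	0x01	/* stand-in for SCTP_DATA_LAST_FRAG */
+
+struct frag { int flags; int skipped; };
+
+static int
+mark_message_skipped(struct frag *f, int nfrags)
+{
+	int i;
+
+	for (i = 0; i < nfrags; i++) {
+		f[i].skipped = 1;
+		if (f[i].flags & FRAG_LAST)
+			return (1);	/* end of message found */
+	}
+	return (0);	/* ran out before the LAST fragment, keep searching */
+}
+#endif
+/*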
+ */ + goto oh_well; + } + memset(chk, 0, sizeof(*chk)); + chk->rec.data.rcv_flags = SCTP_DATA_LAST_FRAG; + chk->sent = SCTP_FORWARD_TSN_SKIP; + chk->asoc = &stcb->asoc; + chk->rec.data.stream_seq = sp->strseq; + chk->rec.data.stream_number = sp->stream; + chk->rec.data.payloadtype = sp->ppid; + chk->rec.data.context = sp->context; + chk->flags = sp->act_flags; + if (sp->net) + chk->whoTo = sp->net; + else + chk->whoTo = stcb->asoc.primary_destination; + atomic_add_int(&chk->whoTo->ref_count, 1); + chk->rec.data.TSN_seq = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1); + stcb->asoc.pr_sctp_cnt++; + chk->pr_sctp_on = 1; + TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next); + stcb->asoc.sent_queue_cnt++; + stcb->asoc.pr_sctp_cnt++; + } else { + chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG; + } + oh_well: + if (sp->data) { + /* + * Pull any data to free up the SB + * and allow sender to "add more" + * whilc we will throw away :-) + */ + sctp_free_spbufspace(stcb, &stcb->asoc, + sp); + ret_sz += sp->length; + do_wakeup_routine = 1; + sp->some_taken = 1; + sctp_m_freem(sp->data); + sp->length = 0; + sp->data = NULL; + sp->tail_mbuf = NULL; + } + break; + } else { + /* Next one please */ + sp = TAILQ_NEXT(sp, next); + } + } /* End while */ + SCTP_TCB_SEND_UNLOCK(stcb); + } + if (do_wakeup_routine) { +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + struct socket *so; + + so = SCTP_INP_SO(stcb->sctp_ep); + if (!so_locked) { + atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_UNLOCK(stcb); + SCTP_SOCKET_LOCK(so, 1); + SCTP_TCB_LOCK(stcb); + atomic_subtract_int(&stcb->asoc.refcnt, 1); + if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { + /* assoc was freed while we were unlocked */ + SCTP_SOCKET_UNLOCK(so, 1); + return (ret_sz); + } + } +#endif + sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket); +#if defined (__APPLE__) || defined(SCTP_SO_LOCK_TESTING) + if (!so_locked) { + SCTP_SOCKET_UNLOCK(so, 1); + } +#endif + } + return (ret_sz); +} + +/* + * checks to see if the given address, sa, is one that is currently known by + * the kernel note: can't distinguish the same address on multiple interfaces + * and doesn't handle multiple addresses with different zone/scope id's note: + * ifa_ifwithaddr() compares the entire sockaddr struct + */ +struct sctp_ifa * +sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, + int holds_lock) +{ + struct sctp_laddr *laddr; + + if (holds_lock == 0) { + SCTP_INP_RLOCK(inp); + } + LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { + if (laddr->ifa == NULL) + continue; + if (addr->sa_family != laddr->ifa->address.sa.sa_family) + continue; + if (addr->sa_family == AF_INET) { + if (((struct sockaddr_in *)addr)->sin_addr.s_addr == + laddr->ifa->address.sin.sin_addr.s_addr) { + /* found him. */ + if (holds_lock == 0) { + SCTP_INP_RUNLOCK(inp); + } + return (laddr->ifa); + break; + } + } +#ifdef INET6 + if (addr->sa_family == AF_INET6) { + if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr, + &laddr->ifa->address.sin6)) { + /* found him. 
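+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the wakeup path above pins the association with a
+ * refcount, drops the TCB lock, takes the socket lock, then re-takes
+ * the TCB lock, so the two locks are always acquired in the same
+ * order.  The pattern in the abstract, with hypothetical lock and ref
+ * primitives:
+ */
+#if 0
+extern void lock_a(void *), unlock_a(void *);	/* lower-ranked lock */
+extern void lock_b(void *);			/* higher-ranked lock */
+extern void ref_get(void *), ref_put(void *);
+
+static void
+take_b_while_holding_a(void *obj, void *sock)
+{
+	ref_get(obj);		/* keep obj alive while it is unlocked */
+	unlock_a(obj);
+	lock_b(sock);		/* higher-ranked lock first */
+	lock_a(obj);		/* then re-take the lower-ranked lock */
+	ref_put(obj);
+	/* caller must re-check obj's state: it may have changed */
+}
+#endif
+/*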
*/ + if (holds_lock == 0) { + SCTP_INP_RUNLOCK(inp); + } + return (laddr->ifa); + break; + } + } +#endif + } + if (holds_lock == 0) { + SCTP_INP_RUNLOCK(inp); + } + return (NULL); +} + +uint32_t +sctp_get_ifa_hash_val(struct sockaddr *addr) +{ + if (addr->sa_family == AF_INET) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)addr; + return (sin->sin_addr.s_addr ^ (sin->sin_addr.s_addr >> 16)); + } else if (addr->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + uint32_t hash_of_addr; + + sin6 = (struct sockaddr_in6 *)addr; + hash_of_addr = (sin6->sin6_addr.s6_addr32[0] + + sin6->sin6_addr.s6_addr32[1] + + sin6->sin6_addr.s6_addr32[2] + + sin6->sin6_addr.s6_addr32[3]); + hash_of_addr = (hash_of_addr ^ (hash_of_addr >> 16)); + return (hash_of_addr); + } + return (0); +} + +struct sctp_ifa * +sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock) +{ + struct sctp_ifa *sctp_ifap; + struct sctp_vrf *vrf; + struct sctp_ifalist *hash_head; + uint32_t hash_of_addr; + + if (holds_lock == 0) + SCTP_IPI_ADDR_RLOCK(); + + vrf = sctp_find_vrf(vrf_id); + if (vrf == NULL) { +stage_right: + if (holds_lock == 0) + SCTP_IPI_ADDR_RUNLOCK(); + return (NULL); + } + hash_of_addr = sctp_get_ifa_hash_val(addr); + + hash_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; + if (hash_head == NULL) { + SCTP_PRINTF("hash_of_addr:%x mask:%x table:%x - ", + hash_of_addr, (uint32_t) vrf->vrf_addr_hashmark, + (uint32_t) (hash_of_addr & vrf->vrf_addr_hashmark)); + sctp_print_address(addr); + SCTP_PRINTF("No such bucket for address\n"); + if (holds_lock == 0) + SCTP_IPI_ADDR_RUNLOCK(); + + return (NULL); + } + LIST_FOREACH(sctp_ifap, hash_head, next_bucket) { + if (sctp_ifap == NULL) { +#ifdef INVARIANTS + panic("Huh LIST_FOREACH corrupt"); + goto stage_right; +#else + SCTP_PRINTF("LIST corrupt of sctp_ifap's?\n"); + goto stage_right; +#endif + } + if (addr->sa_family != sctp_ifap->address.sa.sa_family) + continue; + if (addr->sa_family == AF_INET) { + if (((struct sockaddr_in *)addr)->sin_addr.s_addr == + sctp_ifap->address.sin.sin_addr.s_addr) { + /* found him. */ + if (holds_lock == 0) + SCTP_IPI_ADDR_RUNLOCK(); + return (sctp_ifap); + break; + } + } +#ifdef INET6 + if (addr->sa_family == AF_INET6) { + if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr, + &sctp_ifap->address.sin6)) { + /* found him. */ + if (holds_lock == 0) + SCTP_IPI_ADDR_RUNLOCK(); + return (sctp_ifap); + break; + } + } +#endif + } + if (holds_lock == 0) + SCTP_IPI_ADDR_RUNLOCK(); + return (NULL); +} + +static void +sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t * freed_so_far, int hold_rlock, + uint32_t rwnd_req) +{ + /* User pulled some data, do we need a rwnd update? 
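+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: sctp_find_ifa_by_addr() above folds the address into
+ * 32 bits with sctp_get_ifa_hash_val() and masks it with the table's
+ * hashmark to pick a bucket.  The IPv6 fold-and-mask by itself,
+ * assuming the BSD s6_addr32 accessor is visible:
+ */
+#if 0
+#include <stdint.h>
+#include <netinet/in.h>
+
+static uint32_t
+v6_bucket(const struct in6_addr *a6, uint32_t hashmark)
+{
+	uint32_t h;
+
+	h = a6->s6_addr32[0] + a6->s6_addr32[1] +
+	    a6->s6_addr32[2] + a6->s6_addr32[3];
+	h ^= h >> 16;		/* mix the halves together */
+	return (h & hashmark);	/* hashmark is a power of two minus one */
+}
+#endif
+/*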
*/ + int r_unlocked = 0; + uint32_t dif, rwnd; + struct socket *so = NULL; + + if (stcb == NULL) + return; + + atomic_add_int(&stcb->asoc.refcnt, 1); + + if (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED | + SCTP_STATE_SHUTDOWN_RECEIVED | + SCTP_STATE_SHUTDOWN_ACK_SENT)) { + /* Pre-check If we are freeing no update */ + goto no_lock; + } + SCTP_INP_INCR_REF(stcb->sctp_ep); + if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + goto out; + } + so = stcb->sctp_socket; + if (so == NULL) { + goto out; + } + atomic_add_int(&stcb->freed_by_sorcv_sincelast, *freed_so_far); + /* Have you have freed enough to look */ + *freed_so_far = 0; + /* Yep, its worth a look and the lock overhead */ + + /* Figure out what the rwnd would be */ + rwnd = sctp_calc_rwnd(stcb, &stcb->asoc); + if (rwnd >= stcb->asoc.my_last_reported_rwnd) { + dif = rwnd - stcb->asoc.my_last_reported_rwnd; + } else { + dif = 0; + } + if (dif >= rwnd_req) { + if (hold_rlock) { + SCTP_INP_READ_UNLOCK(stcb->sctp_ep); + r_unlocked = 1; + } + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* + * One last check before we allow the guy possibly + * to get in. There is a race, where the guy has not + * reached the gate. In that case + */ + goto out; + } + SCTP_TCB_LOCK(stcb); + if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + /* No reports here */ + SCTP_TCB_UNLOCK(stcb); + goto out; + } + SCTP_STAT_INCR(sctps_wu_sacks_sent); + sctp_send_sack(stcb); + + sctp_chunk_output(stcb->sctp_ep, stcb, + SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED); + /* make sure no timer is running */ + sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_6); + SCTP_TCB_UNLOCK(stcb); + } else { + /* Update how much we have pending */ + stcb->freed_by_sorcv_sincelast = dif; + } +out: + if (so && r_unlocked && hold_rlock) { + SCTP_INP_READ_LOCK(stcb->sctp_ep); + } + SCTP_INP_DECR_REF(stcb->sctp_ep); +no_lock: + atomic_add_int(&stcb->asoc.refcnt, -1); + return; +} + +int +sctp_sorecvmsg(struct socket *so, + struct uio *uio, + struct mbuf **mp, + struct sockaddr *from, + int fromlen, + int *msg_flags, + struct sctp_sndrcvinfo *sinfo, + int filling_sinfo) +{ + /* + * MSG flags we will look at MSG_DONTWAIT - non-blocking IO. + * MSG_PEEK - Look don't touch :-D (only valid with OUT mbuf copy + * mp=NULL thus uio is the copy method to userland) MSG_WAITALL - ?? 
+ * On the way out we may send out any combination of: + * MSG_NOTIFICATION MSG_EOR + * + */ + struct sctp_inpcb *inp = NULL; + int my_len = 0; + int cp_len = 0, error = 0; + struct sctp_queued_to_read *control = NULL, *ctl = NULL, *nxt = NULL; + struct mbuf *m = NULL; + struct sctp_tcb *stcb = NULL; + int wakeup_read_socket = 0; + int freecnt_applied = 0; + int out_flags = 0, in_flags = 0; + int block_allowed = 1; + uint32_t freed_so_far = 0; + uint32_t copied_so_far = 0; + int in_eeor_mode = 0; + int no_rcv_needed = 0; + uint32_t rwnd_req = 0; + int hold_sblock = 0; + int hold_rlock = 0; + int slen = 0; + uint32_t held_length = 0; + int sockbuf_lock = 0; + + if (uio == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + return (EINVAL); + } + if (msg_flags) { + in_flags = *msg_flags; + if (in_flags & MSG_PEEK) + SCTP_STAT_INCR(sctps_read_peeks); + } else { + in_flags = 0; + } + slen = uio->uio_resid; + + /* Pull in and set up our int flags */ + if (in_flags & MSG_OOB) { + /* Out of band's NOT supported */ + return (EOPNOTSUPP); + } + if ((in_flags & MSG_PEEK) && (mp != NULL)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + return (EINVAL); + } + if ((in_flags & (MSG_DONTWAIT + | MSG_NBIO + )) || + SCTP_SO_IS_NBIO(so)) { + block_allowed = 0; + } + /* setup the endpoint */ + inp = (struct sctp_inpcb *)so->so_pcb; + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT); + return (EFAULT); + } + rwnd_req = (SCTP_SB_LIMIT_RCV(so) >> SCTP_RWND_HIWAT_SHIFT); + /* Must be at least a MTU's worth */ + if (rwnd_req < SCTP_MIN_RWND) + rwnd_req = SCTP_MIN_RWND; + in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_SORECV_ENTER, + rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, uio->uio_resid); + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { + sctp_misc_ints(SCTP_SORECV_ENTERPL, + rwnd_req, block_allowed, so->so_rcv.sb_cc, uio->uio_resid); + } + error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0)); + sockbuf_lock = 1; + if (error) { + goto release_unlocked; + } +restart: + + +restart_nosblocks: + if (hold_sblock == 0) { + SOCKBUF_LOCK(&so->so_rcv); + hold_sblock = 1; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { + goto out; + } + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + if (so->so_error) { + error = so->so_error; + if ((in_flags & MSG_PEEK) == 0) + so->so_error = 0; + goto out; + } else { + if (so->so_rcv.sb_cc == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN); + /* indicate EOF */ + error = 0; + goto out; + } + } + } + if ((so->so_rcv.sb_cc <= held_length) && block_allowed) { + /* we need to wait for data */ + if ((so->so_rcv.sb_cc == 0) && + ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) { + /* + * For active open side clear flags for + * re-use passive open is blocked by + * connect. 
+ */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) { + /* + * You were aborted, passive side + * always hits here + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); + error = ECONNRESET; + /* + * You get this once if you are + * active open side + */ + if (!(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* + * Remove flag if on the + * active open side + */ + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_ABORTED; + } + } + so->so_state &= ~(SS_ISCONNECTING | + SS_ISDISCONNECTING | + SS_ISCONFIRMING | + SS_ISCONNECTED); + if (error == 0) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN); + error = ENOTCONN; + } else { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_CONNECTED; + } + } + goto out; + } + } + error = sbwait(&so->so_rcv); + if (error) { + goto out; + } + held_length = 0; + goto restart_nosblocks; + } else if (so->so_rcv.sb_cc == 0) { + if (so->so_error) { + error = so->so_error; + if ((in_flags & MSG_PEEK) == 0) + so->so_error = 0; + } else { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || + (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) { + /* + * For active open side clear flags + * for re-use passive open is + * blocked by connect. + */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) { + /* + * You were aborted, passive + * side always hits here + */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); + error = ECONNRESET; + /* + * You get this once if you + * are active open side + */ + if (!(inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { + /* + * Remove flag if on + * the active open + * side + */ + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_ABORTED; + } + } + so->so_state &= ~(SS_ISCONNECTING | + SS_ISDISCONNECTING | + SS_ISCONFIRMING | + SS_ISCONNECTED); + if (error == 0) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN); + error = ENOTCONN; + } else { + inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAS_CONNECTED; + } + } + goto out; + } + } + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK); + error = EWOULDBLOCK; + } + goto out; + } + if (hold_sblock == 1) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + /* we possibly have data we can read */ + /* sa_ignore FREED_MEMORY */ + control = TAILQ_FIRST(&inp->read_queue); + if (control == NULL) { + /* + * This could be happening since the appender did the + * increment but as not yet did the tailq insert onto the + * read_queue + */ + if (hold_rlock == 0) { + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; + } + control = TAILQ_FIRST(&inp->read_queue); + if ((control == NULL) && (so->so_rcv.sb_cc != 0)) { +#ifdef INVARIANTS + panic("Huh, its non zero and nothing on control?"); +#endif + so->so_rcv.sb_cc = 0; + } + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + goto restart; + } + if ((control->length == 0) && + (control->do_not_ref_stcb)) { + /* + * Clean up code for freeing assoc that left behind a + * pdapi.. maybe a peer in EEOR that just closed after + * sending and never indicated a EOR. + */ + if (hold_rlock == 0) { + hold_rlock = 1; + SCTP_INP_READ_LOCK(inp); + } + control->held_length = 0; + if (control->data) { + /* Hmm there is data here .. 
fix */
+			struct mbuf *m_tmp;
+			int cnt = 0;
+
+			m_tmp = control->data;
+			while (m_tmp) {
+				cnt += SCTP_BUF_LEN(m_tmp);
+				if (SCTP_BUF_NEXT(m_tmp) == NULL) {
+					control->tail_mbuf = m_tmp;
+					control->end_added = 1;
+				}
+				m_tmp = SCTP_BUF_NEXT(m_tmp);
+			}
+			control->length = cnt;
+		} else {
+			/* remove it */
+			TAILQ_REMOVE(&inp->read_queue, control, next);
+			/* Add back any hidden data */
+			sctp_free_remote_addr(control->whoFrom);
+			sctp_free_a_readq(stcb, control);
+		}
+		if (hold_rlock) {
+			hold_rlock = 0;
+			SCTP_INP_READ_UNLOCK(inp);
+		}
+		goto restart;
+	}
+	if ((control->length == 0) &&
+	    (control->end_added == 1)) {
+		/*
+		 * Do we also need to check for (control->pdapi_aborted ==
+		 * 1)?
+		 */
+		if (hold_rlock == 0) {
+			hold_rlock = 1;
+			SCTP_INP_READ_LOCK(inp);
+		}
+		TAILQ_REMOVE(&inp->read_queue, control, next);
+		if (control->data) {
+#ifdef INVARIANTS
+			panic("control->data not null but control->length == 0");
+#else
+			SCTP_PRINTF("Strange, data left in the control buffer. Cleaning up.\n");
+			sctp_m_freem(control->data);
+			control->data = NULL;
+#endif
+		}
+		if (control->aux_data) {
+			sctp_m_free(control->aux_data);
+			control->aux_data = NULL;
+		}
+		sctp_free_remote_addr(control->whoFrom);
+		sctp_free_a_readq(stcb, control);
+		if (hold_rlock) {
+			hold_rlock = 0;
+			SCTP_INP_READ_UNLOCK(inp);
+		}
+		goto restart;
+	}
+	if (control->length == 0) {
+		if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) &&
+		    (filling_sinfo)) {
+			/* find a more suitable one than this */
+			ctl = TAILQ_NEXT(control, next);
+			while (ctl) {
+				if ((ctl->stcb != control->stcb) && (ctl->length) &&
+				    (ctl->some_taken ||
+				    (ctl->spec_flags & M_NOTIFICATION) ||
+				    ((ctl->do_not_ref_stcb == 0) &&
+				    (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))
+				    ) {
+					/*-
+					 * If we have a different TCB next, and there is data
+					 * present. If we have already taken some (pdapi), OR we can
+					 * ref the tcb and no delivery has started on this stream, we
+					 * take it. Note we allow a notification on a different
+					 * assoc to be delivered.
+					 */
+					control = ctl;
+					goto found_one;
+				} else if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) &&
+				    (ctl->length) &&
+				    ((ctl->some_taken) ||
+				    ((ctl->do_not_ref_stcb == 0) &&
+				    ((ctl->spec_flags & M_NOTIFICATION) == 0) &&
+				    (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))) {
+					/*-
+					 * If we have the same tcb, and there is data present, and we
+					 * have the strm interleave feature present. Then if we have
+					 * taken some (pdapi) or we can refer to that tcb AND we have
+					 * not started a delivery for this stream, we can take it.
+					 * Note we do NOT allow a notification on the same assoc to
+					 * be delivered.
+					 */
+					control = ctl;
+					goto found_one;
+				}
+				ctl = TAILQ_NEXT(ctl, next);
+			}
+		}
+		/*
+		 * if we reach here, no suitable replacement is available;
+		 * fragment interleave is NOT on. So stuff the sb_cc
+		 * into our held count, and it's time to sleep again.
+		 */
+		held_length = so->so_rcv.sb_cc;
+		control->held_length = so->so_rcv.sb_cc;
+		goto restart;
+	}
+	/* Clear the held length since there is something to read */
+	control->held_length = 0;
+	if (hold_rlock) {
+		SCTP_INP_READ_UNLOCK(inp);
+		hold_rlock = 0;
+	}
+found_one:
+	/*
+	 * If we reach here, control has some data for us to read off.
+	 * Note that stcb COULD be NULL.
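+	 */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the fragment-interleave scan above takes the first
+ * queued message that has data and either already had some taken, is a
+ * notification, or sits on a stream with no delivery in progress.  The
+ * shape of that predicate scan, with a hypothetical reduced entry type:
+ */
+#if 0
+struct rq_entry {
+	struct rq_entry *next;
+	int length, some_taken, is_notification, delivery_started;
+};
+
+static struct rq_entry *
+pick_deliverable(struct rq_entry *head)
+{
+	struct rq_entry *e;
+
+	for (e = head; e != NULL; e = e->next) {
+		if (e->length == 0)
+			continue;	/* nothing readable here */
+		if (e->some_taken || e->is_notification ||
+		    !e->delivery_started)
+			return (e);
+	}
+	return (NULL);	/* nothing suitable, caller sleeps and retries */
+}
+#endif
+/*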
+ */ + control->some_taken++; + if (hold_sblock) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + stcb = control->stcb; + if (stcb) { + if ((control->do_not_ref_stcb == 0) && + (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { + if (freecnt_applied == 0) + stcb = NULL; + } else if (control->do_not_ref_stcb == 0) { + /* you can't free it on me please */ + /* + * The lock on the socket buffer protects us so the + * free code will stop. But since we used the + * socketbuf lock and the sender uses the tcb_lock + * to increment, we need to use the atomic add to + * the refcnt + */ + if (freecnt_applied) { +#ifdef INVARIANTS + panic("refcnt already incremented"); +#else + printf("refcnt already incremented?\n"); +#endif + } else { + atomic_add_int(&stcb->asoc.refcnt, 1); + freecnt_applied = 1; + } + /* + * Setup to remember how much we have not yet told + * the peer our rwnd has opened up. Note we grab the + * value from the tcb from last time. Note too that + * sack sending clears this when a sack is sent, + * which is fine. Once we hit the rwnd_req, we then + * will go to the sctp_user_rcvd() that will not + * lock until it KNOWs it MUST send a WUP-SACK. + */ + freed_so_far = stcb->freed_by_sorcv_sincelast; + stcb->freed_by_sorcv_sincelast = 0; + } + } + if (stcb && + ((control->spec_flags & M_NOTIFICATION) == 0) && + control->do_not_ref_stcb == 0) { + stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1; + } + /* First lets get off the sinfo and sockaddr info */ + if ((sinfo) && filling_sinfo) { + memcpy(sinfo, control, sizeof(struct sctp_nonpad_sndrcvinfo)); + nxt = TAILQ_NEXT(control, next); + if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO)) { + struct sctp_extrcvinfo *s_extra; + + s_extra = (struct sctp_extrcvinfo *)sinfo; + if ((nxt) && + (nxt->length)) { + s_extra->sreinfo_next_flags = SCTP_NEXT_MSG_AVAIL; + if (nxt->sinfo_flags & SCTP_UNORDERED) { + s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED; + } + if (nxt->spec_flags & M_NOTIFICATION) { + s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION; + } + s_extra->sreinfo_next_aid = nxt->sinfo_assoc_id; + s_extra->sreinfo_next_length = nxt->length; + s_extra->sreinfo_next_ppid = nxt->sinfo_ppid; + s_extra->sreinfo_next_stream = nxt->sinfo_stream; + if (nxt->tail_mbuf != NULL) { + if (nxt->end_added) { + s_extra->sreinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE; + } + } + } else { + /* + * we explicitly 0 this, since the memcpy + * got some other things beyond the older + * sinfo_ that is on the control's structure + * :-D + */ + nxt = NULL; + s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG; + s_extra->sreinfo_next_aid = 0; + s_extra->sreinfo_next_length = 0; + s_extra->sreinfo_next_ppid = 0; + s_extra->sreinfo_next_stream = 0; + } + } + /* + * update off the real current cum-ack, if we have an stcb. + */ + if ((control->do_not_ref_stcb == 0) && stcb) + sinfo->sinfo_cumtsn = stcb->asoc.cumulative_tsn; + /* + * mask off the high bits, we keep the actual chunk bits in + * there. 
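+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: sinfo_flags is split into a low byte of API flags
+ * and a high byte of raw chunk bits, which is what the masking just
+ * below implements.  The translation in isolation, with stand-in
+ * constants for the real flag values:
+ */
+#if 0
+#include <stdint.h>
+
+#define CHUNK_UNORDERED	0x04	/* stand-in for SCTP_DATA_UNORDERED */
+#define API_UNORDERED	0x01	/* stand-in for SCTP_UNORDERED */
+
+static uint16_t
+export_flags(uint16_t sinfo_flags)
+{
+	uint16_t out = sinfo_flags & 0x00ff;	/* keep the API bits only */
+
+	if ((sinfo_flags >> 8) & CHUNK_UNORDERED)
+		out |= API_UNORDERED;		/* translate the chunk bit */
+	return (out);
+}
+#endif
+/*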
+ */ + sinfo->sinfo_flags &= 0x00ff; + if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) { + sinfo->sinfo_flags |= SCTP_UNORDERED; + } + } +#ifdef SCTP_ASOCLOG_OF_TSNS + { + int index, newindex; + struct sctp_pcbtsn_rlog *entry; + + do { + index = inp->readlog_index; + newindex = index + 1; + if (newindex >= SCTP_READ_LOG_SIZE) { + newindex = 0; + } + } while (atomic_cmpset_int(&inp->readlog_index, index, newindex) == 0); + entry = &inp->readlog[index]; + entry->vtag = control->sinfo_assoc_id; + entry->strm = control->sinfo_stream; + entry->seq = control->sinfo_ssn; + entry->sz = control->length; + entry->flgs = control->sinfo_flags; + } +#endif + if (fromlen && from) { + struct sockaddr *to; + +#ifdef INET + cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sin.sin_len); + memcpy(from, &control->whoFrom->ro._l_addr, cp_len); + ((struct sockaddr_in *)from)->sin_port = control->port_from; +#else + /* No AF_INET use AF_INET6 */ + cp_len = min((size_t)fromlen, (size_t)control->whoFrom->ro._l_addr.sin6.sin6_len); + memcpy(from, &control->whoFrom->ro._l_addr, cp_len); + ((struct sockaddr_in6 *)from)->sin6_port = control->port_from; +#endif + + to = from; +#if defined(INET) && defined(INET6) + if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) && + (to->sa_family == AF_INET) && + ((size_t)fromlen >= sizeof(struct sockaddr_in6))) { + struct sockaddr_in *sin; + struct sockaddr_in6 sin6; + + sin = (struct sockaddr_in *)to; + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_addr.s6_addr32[2] = htonl(0xffff); + bcopy(&sin->sin_addr, + &sin6.sin6_addr.s6_addr32[3], + sizeof(sin6.sin6_addr.s6_addr32[3])); + sin6.sin6_port = sin->sin_port; + memcpy(from, (caddr_t)&sin6, sizeof(sin6)); + } +#endif +#if defined(INET6) + { + struct sockaddr_in6 lsa6, *to6; + + to6 = (struct sockaddr_in6 *)to; + sctp_recover_scope_mac(to6, (&lsa6)); + } +#endif + } + /* now copy out what data we can */ + if (mp == NULL) { + /* copy out each mbuf in the chain up to length */ +get_more_data: + m = control->data; + while (m) { + /* Move out all we can */ + cp_len = (int)uio->uio_resid; + my_len = (int)SCTP_BUF_LEN(m); + if (cp_len > my_len) { + /* not enough in this buf */ + cp_len = my_len; + } + if (hold_rlock) { + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + } + if (cp_len > 0) + error = uiomove(mtod(m, char *), cp_len, uio); + /* re-read */ + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { + goto release; + } + if ((control->do_not_ref_stcb == 0) && stcb && + stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + no_rcv_needed = 1; + } + if (error) { + /* error we are out of here */ + goto release; + } + if ((SCTP_BUF_NEXT(m) == NULL) && + (cp_len >= SCTP_BUF_LEN(m)) && + ((control->end_added == 0) || + (control->end_added && + (TAILQ_NEXT(control, next) == NULL))) + ) { + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; + } + if (cp_len == SCTP_BUF_LEN(m)) { + if ((SCTP_BUF_NEXT(m) == NULL) && + (control->end_added)) { + out_flags |= MSG_EOR; + if ((control->do_not_ref_stcb == 0) && + (control->stcb != NULL) && + ((control->spec_flags & M_NOTIFICATION) == 0)) + control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; + } + if (control->spec_flags & M_NOTIFICATION) { + out_flags |= MSG_NOTIFICATION; + } + /* we ate up the mbuf */ + if (in_flags & MSG_PEEK) { + /* just looking */ + m = SCTP_BUF_NEXT(m); + copied_so_far += cp_len; + } else { + /* dispose of the mbuf */ + if (SCTP_BASE_SYSCTL(sctp_logging_level) 
& SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, + control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); + } + sctp_sbfree(control, stcb, &so->so_rcv, m); + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, + control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + copied_so_far += cp_len; + freed_so_far += cp_len; + freed_so_far += MSIZE; + atomic_subtract_int(&control->length, cp_len); + control->data = sctp_m_free(m); + m = control->data; + /* + * been through it all, must hold sb + * lock ok to null tail + */ + if (control->data == NULL) { +#ifdef INVARIANTS + if ((control->end_added == 0) || + (TAILQ_NEXT(control, next) == NULL)) { + /* + * If the end is not + * added, OR the + * next is NOT null + * we MUST have the + * lock. + */ + if (mtx_owned(&inp->inp_rdata_mtx) == 0) { + panic("Hmm we don't own the lock?"); + } + } +#endif + control->tail_mbuf = NULL; +#ifdef INVARIANTS + if ((control->end_added) && ((out_flags & MSG_EOR) == 0)) { + panic("end_added, nothing left and no MSG_EOR"); + } +#endif + } + } + } else { + /* Do we need to trim the mbuf? */ + if (control->spec_flags & M_NOTIFICATION) { + out_flags |= MSG_NOTIFICATION; + } + if ((in_flags & MSG_PEEK) == 0) { + SCTP_BUF_RESV_UF(m, cp_len); + SCTP_BUF_LEN(m) -= cp_len; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, cp_len); + } + atomic_subtract_int(&so->so_rcv.sb_cc, cp_len); + if ((control->do_not_ref_stcb == 0) && + stcb) { + atomic_subtract_int(&stcb->asoc.sb_cc, cp_len); + } + copied_so_far += cp_len; + freed_so_far += cp_len; + freed_so_far += MSIZE; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, + SCTP_LOG_SBRESULT, 0); + } + atomic_subtract_int(&control->length, cp_len); + } else { + copied_so_far += cp_len; + } + } + if ((out_flags & MSG_EOR) || (uio->uio_resid == 0)) { + break; + } + if (((stcb) && (in_flags & MSG_PEEK) == 0) && + (control->do_not_ref_stcb == 0) && + (freed_so_far >= rwnd_req)) { + sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); + } + } /* end while(m) */ + /* + * At this point we have looked at it all and we either have + * a MSG_EOR/or read all the user wants... + * control->length == 0. + */ + if ((out_flags & MSG_EOR) && ((in_flags & MSG_PEEK) == 0)) { + /* we are done with this control */ + if (control->length == 0) { + if (control->data) { +#ifdef INVARIANTS + panic("control->data not null at read eor?"); +#else + SCTP_PRINTF("Strange, data left in the control buffer .. invarients would panic?\n"); + sctp_m_freem(control->data); + control->data = NULL; +#endif + } + done_with_control: + if (TAILQ_NEXT(control, next) == NULL) { + /* + * If we don't have a next we need a + * lock, if there is a next + * interrupt is filling ahead of us + * and we don't need a lock to + * remove this guy (which is the + * head of the queue). 
+ */ + if (hold_rlock == 0) { + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; + } + } + TAILQ_REMOVE(&inp->read_queue, control, next); + /* Add back any hiddend data */ + if (control->held_length) { + held_length = 0; + control->held_length = 0; + wakeup_read_socket = 1; + } + if (control->aux_data) { + sctp_m_free(control->aux_data); + control->aux_data = NULL; + } + no_rcv_needed = control->do_not_ref_stcb; + sctp_free_remote_addr(control->whoFrom); + control->data = NULL; + sctp_free_a_readq(stcb, control); + control = NULL; + if ((freed_so_far >= rwnd_req) && + (no_rcv_needed == 0)) + sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); + + } else { + /* + * The user did not read all of this + * message, turn off the returned MSG_EOR + * since we are leaving more behind on the + * control to read. + */ +#ifdef INVARIANTS + if (control->end_added && + (control->data == NULL) && + (control->tail_mbuf == NULL)) { + panic("Gak, control->length is corrupt?"); + } +#endif + no_rcv_needed = control->do_not_ref_stcb; + out_flags &= ~MSG_EOR; + } + } + if (out_flags & MSG_EOR) { + goto release; + } + if ((uio->uio_resid == 0) || + ((in_eeor_mode) && (copied_so_far >= max(so->so_rcv.sb_lowat, 1))) + ) { + goto release; + } + /* + * If I hit here the receiver wants more and this message is + * NOT done (pd-api). So two questions. Can we block? if not + * we are done. Did the user NOT set MSG_WAITALL? + */ + if (block_allowed == 0) { + goto release; + } + /* + * We need to wait for more data a few things: - We don't + * sbunlock() so we don't get someone else reading. - We + * must be sure to account for the case where what is added + * is NOT to our control when we wakeup. + */ + + /* + * Do we need to tell the transport a rwnd update might be + * needed before we go to sleep? 
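+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the reader batches window updates; freed_so_far
+ * accumulates until it crosses rwnd_req before sctp_user_rcvd() is
+ * asked to send a window-update SACK.  The thresholding by itself,
+ * with a hypothetical notifier and a stand-in threshold:
+ */
+#if 0
+#include <stdint.h>
+
+#define RWND_UPDATE_THRESHOLD	1500	/* at least one MTU in the real code */
+
+extern void send_window_update(void);	/* hypothetical */
+
+static void
+note_bytes_freed(uint32_t *accum, uint32_t just_freed)
+{
+	*accum += just_freed;
+	if (*accum >= RWND_UPDATE_THRESHOLD) {
+		send_window_update();
+		*accum = 0;	/* start a new batch */
+	}
+}
+#endif
+/*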
+ */ + if (((stcb) && (in_flags & MSG_PEEK) == 0) && + ((freed_so_far >= rwnd_req) && + (control->do_not_ref_stcb == 0) && + (no_rcv_needed == 0))) { + sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); + } +wait_some_more: + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + goto release; + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) + goto release; + + if (hold_rlock == 1) { + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + } + if (hold_sblock == 0) { + SOCKBUF_LOCK(&so->so_rcv); + hold_sblock = 1; + } + if ((copied_so_far) && (control->length == 0) && + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE))) { + goto release; + } + if (so->so_rcv.sb_cc <= control->held_length) { + error = sbwait(&so->so_rcv); + if (error) { + goto release; + } + control->held_length = 0; + } + if (hold_sblock) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + if (control->length == 0) { + /* still nothing here */ + if (control->end_added == 1) { + /* he aborted, or is done i.e.did a shutdown */ + out_flags |= MSG_EOR; + if (control->pdapi_aborted) { + if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0)) + control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; + + out_flags |= MSG_TRUNC; + } else { + if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0)) + control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; + } + goto done_with_control; + } + if (so->so_rcv.sb_cc > held_length) { + control->held_length = so->so_rcv.sb_cc; + held_length = 0; + } + goto wait_some_more; + } else if (control->data == NULL) { + /* + * we must re-sync since data is probably being + * added + */ + SCTP_INP_READ_LOCK(inp); + if ((control->length > 0) && (control->data == NULL)) { + /* + * big trouble.. we have the lock and its + * corrupt? + */ +#ifdef INVARIANTS + panic("Impossible data==NULL length !=0"); +#endif + out_flags |= MSG_EOR; + out_flags |= MSG_TRUNC; + control->length = 0; + SCTP_INP_READ_UNLOCK(inp); + goto done_with_control; + } + SCTP_INP_READ_UNLOCK(inp); + /* We will fall around to get more data */ + } + goto get_more_data; + } else { + /*- + * Give caller back the mbuf chain, + * store in uio_resid the length + */ + wakeup_read_socket = 0; + if ((control->end_added == 0) || + (TAILQ_NEXT(control, next) == NULL)) { + /* Need to get rlock */ + if (hold_rlock == 0) { + SCTP_INP_READ_LOCK(inp); + hold_rlock = 1; + } + } + if (control->end_added) { + out_flags |= MSG_EOR; + if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0)) + control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; + } + if (control->spec_flags & M_NOTIFICATION) { + out_flags |= MSG_NOTIFICATION; + } + uio->uio_resid = control->length; + *mp = control->data; + m = control->data; + while (m) { + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, + control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); + } + sctp_sbfree(control, stcb, &so->so_rcv, m); + freed_so_far += SCTP_BUF_LEN(m); + freed_so_far += MSIZE; + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { + sctp_sblog(&so->so_rcv, + control->do_not_ref_stcb ? 
NULL : stcb, SCTP_LOG_SBRESULT, 0); + } + m = SCTP_BUF_NEXT(m); + } + control->data = control->tail_mbuf = NULL; + control->length = 0; + if (out_flags & MSG_EOR) { + /* Done with this control */ + goto done_with_control; + } + } +release: + if (hold_rlock == 1) { + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + } + if (hold_sblock == 1) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + sbunlock(&so->so_rcv); + sockbuf_lock = 0; + +release_unlocked: + if (hold_sblock) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + if ((stcb) && (in_flags & MSG_PEEK) == 0) { + if ((freed_so_far >= rwnd_req) && + (control && (control->do_not_ref_stcb == 0)) && + (no_rcv_needed == 0)) + sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); + } +out: + if (msg_flags) { + *msg_flags = out_flags; + } + if (((out_flags & MSG_EOR) == 0) && + ((in_flags & MSG_PEEK) == 0) && + (sinfo) && + (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO))) { + struct sctp_extrcvinfo *s_extra; + + s_extra = (struct sctp_extrcvinfo *)sinfo; + s_extra->sreinfo_next_flags = SCTP_NO_NEXT_MSG; + } + if (hold_rlock == 1) { + SCTP_INP_READ_UNLOCK(inp); + hold_rlock = 0; + } + if (hold_sblock) { + SOCKBUF_UNLOCK(&so->so_rcv); + hold_sblock = 0; + } + if (sockbuf_lock) { + sbunlock(&so->so_rcv); + } + if (freecnt_applied) { + /* + * The lock on the socket buffer protects us so the free + * code will stop. But since we used the socketbuf lock and + * the sender uses the tcb_lock to increment, we need to use + * the atomic add to the refcnt. + */ + if (stcb == NULL) { +#ifdef INVARIANTS + panic("stcb for refcnt has gone NULL?"); + goto stage_left; +#else + goto stage_left; +#endif + } + atomic_add_int(&stcb->asoc.refcnt, -1); + freecnt_applied = 0; + /* Save the value back for next time */ + stcb->freed_by_sorcv_sincelast = freed_so_far; + } + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { + if (stcb) { + sctp_misc_ints(SCTP_SORECV_DONE, + freed_so_far, + ((uio) ? (slen - uio->uio_resid) : slen), + stcb->asoc.my_rwnd, + so->so_rcv.sb_cc); + } else { + sctp_misc_ints(SCTP_SORECV_DONE, + freed_so_far, + ((uio) ? (slen - uio->uio_resid) : slen), + 0, + so->so_rcv.sb_cc); + } + } +stage_left: + if (wakeup_read_socket) { + sctp_sorwakeup(inp, so); + } + return (error); +} + + +#ifdef SCTP_MBUF_LOGGING +struct mbuf * +sctp_m_free(struct mbuf *m) +{ + if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { + if (SCTP_BUF_IS_EXTENDED(m)) { + sctp_log_mb(m, SCTP_MBUF_IFREE); + } + } + return (m_free(m)); +} + +void +sctp_m_freem(struct mbuf *mb) +{ + while (mb != NULL) + mb = sctp_m_free(mb); +} + +#endif + +int +sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id) +{ + /* + * Given a local address. For all associations that holds the + * address, request a peer-set-primary. + */ + struct sctp_ifa *ifa; + struct sctp_laddr *wi; + + ifa = sctp_find_ifa_by_addr(sa, vrf_id, 0); + if (ifa == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EADDRNOTAVAIL); + return (EADDRNOTAVAIL); + } + /* + * Now that we have the ifa we must awaken the iterator with this + * message. 
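+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the code below allocates a work item, queues it
+ * under the work-queue lock and arms the ADDR_WQ timer so the iterator
+ * drains it.  The producer side of that pattern, with hypothetical
+ * queue and timer primitives:
+ */
+#if 0
+struct work { struct work *next; int action; void *arg; };
+extern struct work *work_alloc(void);
+extern void wq_lock(void), wq_unlock(void);
+extern void wq_push_head(struct work *);
+extern void worker_timer_start(void);
+
+static int
+post_work(int action, void *arg)
+{
+	struct work *w = work_alloc();
+
+	if (w == NULL)
+		return (-1);	/* ENOMEM in the real code */
+	w->action = action;
+	w->arg = arg;
+	wq_lock();
+	wq_push_head(w);	/* newest first, like LIST_INSERT_HEAD */
+	wq_unlock();
+	worker_timer_start();	/* make sure somebody drains the queue */
+	return (0);
+}
+#endif
+/*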
+ */ + wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); + if (wi == NULL) { + SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); + return (ENOMEM); + } + /* Now incr the count and int wi structure */ + SCTP_INCR_LADDR_COUNT(); + bzero(wi, sizeof(*wi)); + (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); + wi->ifa = ifa; + wi->action = SCTP_SET_PRIM_ADDR; + atomic_add_int(&ifa->refcount, 1); + + /* Now add it to the work queue */ + SCTP_WQ_ADDR_LOCK(); + /* + * Should this really be a tailq? As it is we will process the + * newest first :-0 + */ + LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); + SCTP_WQ_ADDR_UNLOCK(); + sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, + (struct sctp_inpcb *)NULL, + (struct sctp_tcb *)NULL, + (struct sctp_nets *)NULL); + return (0); +} + + +int +sctp_soreceive(struct socket *so, + struct sockaddr **psa, + struct uio *uio, + struct mbuf **mp0, + struct mbuf **controlp, + int *flagsp) +{ + int error, fromlen; + uint8_t sockbuf[256]; + struct sockaddr *from; + struct sctp_extrcvinfo sinfo; + int filling_sinfo = 1; + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + /* pickup the assoc we are reading from */ + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + return (EINVAL); + } + if ((sctp_is_feature_off(inp, + SCTP_PCB_FLAGS_RECVDATAIOEVNT)) || + (controlp == NULL)) { + /* user does not want the sndrcv ctl */ + filling_sinfo = 0; + } + if (psa) { + from = (struct sockaddr *)sockbuf; + fromlen = sizeof(sockbuf); + from->sa_len = 0; + } else { + from = NULL; + fromlen = 0; + } + + error = sctp_sorecvmsg(so, uio, mp0, from, fromlen, flagsp, + (struct sctp_sndrcvinfo *)&sinfo, filling_sinfo); + if ((controlp) && (filling_sinfo)) { + /* copy back the sinfo in a CMSG format */ + if (filling_sinfo) + *controlp = sctp_build_ctl_nchunk(inp, + (struct sctp_sndrcvinfo *)&sinfo); + else + *controlp = NULL; + } + if (psa) { + /* copy back the address info */ + if (from && from->sa_len) { + *psa = sodupsockaddr(from, M_NOWAIT); + } else { + *psa = NULL; + } + } + return (error); +} + + +int +sctp_l_soreceive(struct socket *so, + struct sockaddr **name, + struct uio *uio, + char **controlp, + int *controllen, + int *flag) +{ + int error, fromlen; + uint8_t sockbuf[256]; + struct sockaddr *from; + struct sctp_extrcvinfo sinfo; + int filling_sinfo = 1; + struct sctp_inpcb *inp; + + inp = (struct sctp_inpcb *)so->so_pcb; + /* pickup the assoc we are reading from */ + if (inp == NULL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + return (EINVAL); + } + if ((sctp_is_feature_off(inp, + SCTP_PCB_FLAGS_RECVDATAIOEVNT)) || + (controlp == NULL)) { + /* user does not want the sndrcv ctl */ + filling_sinfo = 0; + } + if (name) { + from = (struct sockaddr *)sockbuf; + fromlen = sizeof(sockbuf); + from->sa_len = 0; + } else { + from = NULL; + fromlen = 0; + } + + error = sctp_sorecvmsg(so, uio, + (struct mbuf **)NULL, + from, fromlen, flag, + (struct sctp_sndrcvinfo *)&sinfo, + filling_sinfo); + if ((controlp) && (filling_sinfo)) { + /* + * copy back the sinfo in a CMSG format note that the caller + * has reponsibility for freeing the memory. 
+ */ + if (filling_sinfo) + *controlp = sctp_build_ctl_cchunk(inp, + controllen, + (struct sctp_sndrcvinfo *)&sinfo); + } + if (name) { + /* copy back the address info */ + if (from && from->sa_len) { + *name = sodupsockaddr(from, M_WAIT); + } else { + *name = NULL; + } + } + return (error); +} + + + + + + + +int +sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, + int totaddr, int *error) +{ + int added = 0; + int i; + struct sctp_inpcb *inp; + struct sockaddr *sa; + size_t incr = 0; + + sa = addr; + inp = stcb->sctp_ep; + *error = 0; + for (i = 0; i < totaddr; i++) { + if (sa->sa_family == AF_INET) { + incr = sizeof(struct sockaddr_in); + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + /* assoc gone no un-lock */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_7); + *error = ENOBUFS; + goto out_now; + } + added++; + } else if (sa->sa_family == AF_INET6) { + incr = sizeof(struct sockaddr_in6); + if (sctp_add_remote_addr(stcb, sa, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { + /* assoc gone no un-lock */ + SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); + (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8); + *error = ENOBUFS; + goto out_now; + } + added++; + } + sa = (struct sockaddr *)((caddr_t)sa + incr); + } +out_now: + return (added); +} + +struct sctp_tcb * +sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, + int *totaddr, int *num_v4, int *num_v6, int *error, + int limit, int *bad_addr) +{ + struct sockaddr *sa; + struct sctp_tcb *stcb = NULL; + size_t incr, at, i; + + at = incr = 0; + sa = addr; + *error = *num_v6 = *num_v4 = 0; + /* account and validate addresses */ + for (i = 0; i < (size_t)*totaddr; i++) { + if (sa->sa_family == AF_INET) { + (*num_v4) += 1; + incr = sizeof(struct sockaddr_in); + if (sa->sa_len != incr) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + *bad_addr = 1; + return (NULL); + } + } else if (sa->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)sa; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* Must be non-mapped for connectx */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + *bad_addr = 1; + return (NULL); + } + (*num_v6) += 1; + incr = sizeof(struct sockaddr_in6); + if (sa->sa_len != incr) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + *bad_addr = 1; + return (NULL); + } + } else { + *totaddr = i; + /* we are done */ + break; + } + SCTP_INP_INCR_REF(inp); + stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); + if (stcb != NULL) { + /* Already have or am bring up an association */ + return (stcb); + } else { + SCTP_INP_DECR_REF(inp); + } + if ((at + incr) > (size_t)limit) { + *totaddr = i; + break; + } + sa = (struct sockaddr *)((caddr_t)sa + incr); + } + return ((struct sctp_tcb *)NULL); +} + +/* + * sctp_bindx(ADD) for one address. + * assumes all arguments are valid/checked by caller. + */ +void +sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, + struct sockaddr *sa, sctp_assoc_t assoc_id, + uint32_t vrf_id, int *error, void *p) +{ + struct sockaddr *addr_touse; + +#ifdef INET6 + struct sockaddr_in sin; + +#endif + + /* see if we're bound all already! 
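+ */
+/*
+ * Editor's sketch, illustrative only and compiled out, not part of the
+ * original change: the connectx helpers above walk a packed buffer of
+ * sockaddrs whose stride depends on each entry's family.  A minimal
+ * walker over the same layout (validation left to the caller):
+ */
+#if 0
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+static int
+count_packed_addrs(const struct sockaddr *sa, int total, int *v4, int *v6)
+{
+	size_t incr;
+	int i;
+
+	*v4 = *v6 = 0;
+	for (i = 0; i < total; i++) {
+		if (sa->sa_family == AF_INET) {
+			(*v4)++;
+			incr = sizeof(struct sockaddr_in);
+		} else if (sa->sa_family == AF_INET6) {
+			(*v6)++;
+			incr = sizeof(struct sockaddr_in6);
+		} else {
+			break;	/* stop at the first unknown family */
+		}
+		sa = (const struct sockaddr *)((const char *)sa + incr);
+	}
+	return (i);	/* number of entries actually consumed */
+}
+#endif
+/*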
*/ + if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + addr_touse = sa; +#if defined(INET6) && !defined(__Userspace__) /* TODO port in6_sin6_2_sin */ + if (sa->sa_family == AF_INET6) { + struct sockaddr_in6 *sin6; + + if (sa->sa_len != sizeof(struct sockaddr_in6)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { + /* can only bind v6 on PF_INET6 sockets */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + sin6 = (struct sockaddr_in6 *)addr_touse; + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp)) { + /* can't bind v4-mapped on PF_INET sockets */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + in6_sin6_2_sin(&sin, sin6); + addr_touse = (struct sockaddr *)&sin; + } + } +#endif + if (sa->sa_family == AF_INET) { + if (sa->sa_len != sizeof(struct sockaddr_in)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && + SCTP_IPV6_V6ONLY(inp)) { + /* can't bind v4 on PF_INET sockets */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + } + if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { + if (p == NULL) { + /* Can't get proc for Net/Open BSD */ + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } + *error = sctp_inpcb_bind(so, addr_touse, NULL, p); + return; + } + /* + * No locks required here since bind and mgmt_ep_sa all do their own + * locking. If we do something for the FIX: below we may need to + * lock in that case. + */ + if (assoc_id == 0) { + /* add the address */ + struct sctp_inpcb *lep; + struct sockaddr_in *lsin = (struct sockaddr_in *)addr_touse; + + /* validate the incoming port */ + if ((lsin->sin_port != 0) && + (lsin->sin_port != inp->sctp_lport)) { + SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); + *error = EINVAL; + return; + } else { + /* user specified 0 port, set it to existing port */ + lsin->sin_port = inp->sctp_lport; + } + + lep = sctp_pcb_findep(addr_touse, 1, 0, vrf_id); + if (lep != NULL) { + /* + * We must decrement the refcount since we have the + * ep already and are binding. No remove going on + * here. + */ + SCTP_INP_DECR_REF(lep); + } + if (lep == inp) { + /* already bound to it.. ok */ + return; + } else if (lep == NULL) { + ((struct sockaddr_in *)addr_touse)->sin_port = 0; + *error = sctp_addr_mgmt_ep_sa(inp, addr_touse, + SCTP_ADD_IP_ADDRESS, + vrf_id, NULL); + } else { + *error = EADDRINUSE; + } + if (*error) + return; + } else { + /* + * FIX: decide whether we allow assoc based bindx + */ + } +} + +/* + * sctp_bindx(DELETE) for one address. + * assumes all arguments are valid/checked by caller. + */ +void +sctp_bindx_delete_address(struct socket *so, struct sctp_inpcb *inp, + struct sockaddr *sa, sctp_assoc_t assoc_id, + uint32_t vrf_id, int *error) +{ + struct sockaddr *addr_touse; + +#ifdef INET6 + struct sockaddr_in sin; + +#endif + + /* see if we're bound all already! 
+
+/*
+ * sctp_bindx(DELETE) for one address.
+ * Assumes all arguments are valid/checked by the caller.
+ */
+void
+sctp_bindx_delete_address(struct socket *so, struct sctp_inpcb *inp,
+    struct sockaddr *sa, sctp_assoc_t assoc_id,
+    uint32_t vrf_id, int *error)
+{
+	struct sockaddr *addr_touse;
+
+#ifdef INET6
+	struct sockaddr_in sin;
+
+#endif
+
+	/* see if we're already bound to all addresses */
+	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+		*error = EINVAL;
+		return;
+	}
+	addr_touse = sa;
+#if defined(INET6) && !defined(__Userspace__) /* TODO port in6_sin6_2_sin */
+	if (sa->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin6;
+
+		if (sa->sa_len != sizeof(struct sockaddr_in6)) {
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+			*error = EINVAL;
+			return;
+		}
+		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
+			/* can only bind v6 on PF_INET6 sockets */
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+			*error = EINVAL;
+			return;
+		}
+		sin6 = (struct sockaddr_in6 *)addr_touse;
+		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+			if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+			    SCTP_IPV6_V6ONLY(inp)) {
+				/* can't bind v4-mapped addrs on a v6-only socket */
+				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+				*error = EINVAL;
+				return;
+			}
+			in6_sin6_2_sin(&sin, sin6);
+			addr_touse = (struct sockaddr *)&sin;
+		}
+	}
+#endif
+	if (sa->sa_family == AF_INET) {
+		if (sa->sa_len != sizeof(struct sockaddr_in)) {
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+			*error = EINVAL;
+			return;
+		}
+		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
+		    SCTP_IPV6_V6ONLY(inp)) {
+			/* can't bind v4 addrs on a v6-only socket */
+			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL);
+			*error = EINVAL;
+			return;
+		}
+	}
+	/*
+	 * No lock required; mgmt_ep_sa does its own locking.  If the FIX:
+	 * below is ever changed, we may need to lock before calling
+	 * association-level binding.
+	 */
+	if (assoc_id == 0) {
+		/* delete the address */
+		*error = sctp_addr_mgmt_ep_sa(inp, addr_touse,
+		    SCTP_DEL_IP_ADDRESS,
+		    vrf_id, NULL);
+	} else {
+		/*
+		 * FIX: decide whether we allow assoc-based bindx
+		 */
+	}
+}
+
+/*
+ * Returns the valid local address count for an assoc, taking into account
+ * all scoping rules.
+ */
+int
+sctp_local_addr_count(struct sctp_tcb *stcb)
+{
+	int loopback_scope, ipv4_local_scope, local_scope, site_scope;
+	int ipv4_addr_legal, ipv6_addr_legal;
+	struct sctp_vrf *vrf;
+	struct sctp_ifn *sctp_ifn;
+	struct sctp_ifa *sctp_ifa;
+	int count = 0;
+
+	/* Turn on all the appropriate scopes */
+	loopback_scope = stcb->asoc.loopback_scope;
+	ipv4_local_scope = stcb->asoc.ipv4_local_scope;
+	local_scope = stcb->asoc.local_scope;
+	site_scope = stcb->asoc.site_scope;
+	ipv4_addr_legal = ipv6_addr_legal = 0;
+	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
+		ipv6_addr_legal = 1;
+		if (SCTP_IPV6_V6ONLY(stcb->sctp_ep) == 0) {
+			ipv4_addr_legal = 1;
+		}
+	} else {
+		ipv4_addr_legal = 1;
+	}
+
+	SCTP_IPI_ADDR_RLOCK();
+	vrf = sctp_find_vrf(stcb->asoc.vrf_id);
+	if (vrf == NULL) {
+		/* no vrf, no addresses */
+		SCTP_IPI_ADDR_RUNLOCK();
+		return (0);
+	}
+	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
+		/*
+		 * bound-all case: go through all ifns on the vrf
+		 */
+		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
+			if ((loopback_scope == 0) &&
+			    SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
+				continue;
+			}
+			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
+				if (sctp_is_addr_restricted(stcb, sctp_ifa))
+					continue;
+				switch (sctp_ifa->address.sa.sa_family) {
+				case AF_INET:
+					if (ipv4_addr_legal) {
+						struct sockaddr_in *sin;
+
+						sin = (struct sockaddr_in *)&sctp_ifa->address.sa;
+						if (sin->sin_addr.s_addr == 0) {
+							/* skip unspecified addrs */
+							continue;
+						}
+						if ((ipv4_local_scope == 0) &&
+						    (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
+							continue;
+						}
+						/* count this one */
+						count++;
+					} else {
+						continue;
+					}
+					break;
+#ifdef INET6
+				case AF_INET6:
+					if (ipv6_addr_legal) {
+						struct sockaddr_in6 *sin6;
+
+						sin6 = (struct sockaddr_in6 *)&sctp_ifa->address.sa;
+						if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+							continue;
+						}
+						if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
+							if (local_scope == 0)
+								continue;
+							if (sin6->sin6_scope_id == 0) {
+								if (sa6_recoverscope(sin6) != 0)
+									/* bad link-local address */
+									continue;
+							}
+						}
+						if ((site_scope == 0) &&
+						    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
+							continue;
+						}
+						/* count this one */
+						count++;
+					}
+					break;
+#endif
+				default:
+					/* TSNH */
+					break;
+				}
+			}
+		}
+	} else {
+		/*
+		 * subset-bound case
+		 */
+		struct sctp_laddr *laddr;
+
+		LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list,
+		    sctp_nxt_addr) {
+			if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
+				continue;
+			}
+			/* count this one */
+			count++;
+		}
+	}
+	SCTP_IPI_ADDR_RUNLOCK();
+	return (count);
+}
+
+#if defined(SCTP_LOCAL_TRACE_BUF)
+
+void
+sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f)
+{
+	uint32_t saveindex, newindex;
+
+	do {
+		saveindex = SCTP_BASE_SYSCTL(sctp_log).index;
+		if (saveindex >= SCTP_MAX_LOGGING_SIZE) {
+			newindex = 1;
+		} else {
+			newindex = saveindex + 1;
+		}
+	} while (atomic_cmpset_int(&SCTP_BASE_SYSCTL(sctp_log).index, saveindex, newindex) == 0);
+	if (saveindex >= SCTP_MAX_LOGGING_SIZE) {
+		saveindex = 0;
+	}
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].timestamp = SCTP_GET_CYCLECOUNT;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].subsys = subsys;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[0] = a;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[1] = b;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[2] = c;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[3] = d;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[4] = e;
+	SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[5] = f;
+}
+
+#endif
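+
+/*
+ * The index claim in sctp_log_trace() above is lock-free: each logger
+ * computes the next index (wrapping back to 1) and retries until its
+ * compare-and-set wins, so concurrent CPUs never claim the same slot.
+ * A minimal userspace sketch of the same pattern, with C11 atomics standing
+ * in for the kernel's atomic_cmpset_int() (LOG_SIZE and claim_slot() are
+ * illustrative only):
+ *
+ *	#include <stdatomic.h>
+ *
+ *	#define LOG_SIZE 1024
+ *	static _Atomic unsigned int log_index;
+ *
+ *	static unsigned int
+ *	claim_slot(void)
+ *	{
+ *		unsigned int oldi, newi;
+ *
+ *		do {
+ *			oldi = atomic_load(&log_index);
+ *			newi = (oldi >= LOG_SIZE) ? 1 : oldi + 1;
+ *		} while (!atomic_compare_exchange_weak(&log_index,
+ *		    &oldi, newi));
+ *		return ((oldi >= LOG_SIZE) ? 0 : oldi);
+ *	}
+ */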
+/* We will need to add support
+ * to bind the ports and such here
+ * so we can do UDP tunneling. In
+ * the meantime, we return an error.
+ */
+#include
+#include
+#include
+#ifdef INET6
+#include
+#endif
+
+static void
+sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *ignored)
+{
+	struct ip *iph;
+	struct mbuf *sp, *last;
+	struct udphdr *uhdr;
+	uint16_t port = 0, len;
+	int header_size = sizeof(struct udphdr) + sizeof(struct sctphdr);
+
+	/*
+	 * Split out the mbuf chain.  Leave the IP header in m, place the
+	 * rest in sp.
+	 */
+	if ((m->m_flags & M_PKTHDR) == 0) {
+		/* Can't handle one that is not a pkt hdr */
+		goto out;
+	}
+	/* pull the src port */
+	iph = mtod(m, struct ip *);
+	uhdr = (struct udphdr *)((caddr_t)iph + off);
+
+	port = uhdr->uh_sport;
+	sp = m_split(m, off, M_DONTWAIT);
+	if (sp == NULL) {
+		/* Gak, drop packet, we can't do a split */
+		goto out;
+	}
+	if (sp->m_pkthdr.len < header_size) {
+		/* Gak, packet can't have an SCTP header in it - too small */
+		m_freem(sp);
+		goto out;
+	}
+	/* OK, now pull up the UDP header and SCTP header together */
+	sp = m_pullup(sp, header_size);
+	if (sp == NULL) {
+		/* Gak, pullup failed */
+		goto out;
+	}
+	/* trim out the UDP header */
+	m_adj(sp, sizeof(struct udphdr));
+
+	/* Now reconstruct the mbuf chain */
+	/* 1) find the last one */
+	last = m;
+	while (last->m_next != NULL) {
+		last = last->m_next;
+	}
+	last->m_next = sp;
+	m->m_pkthdr.len += sp->m_pkthdr.len;
+	last = m;
+	while (last != NULL) {
+		last = last->m_next;
+	}
+	/* Now it's ready for sctp_input or sctp6_input */
+	iph = mtod(m, struct ip *);
+	switch (iph->ip_v) {
+	case IPVERSION:
+		{
+			/* it's IPv4 */
+			len = SCTP_GET_IPV4_LENGTH(iph);
+			len -= sizeof(struct udphdr);
+			SCTP_GET_IPV4_LENGTH(iph) = len;
+			sctp_input_with_port(m, off, port);
+			break;
+		}
+#ifdef INET6
+	case IPV6_VERSION >> 4:
+		{
+			/* it's IPv6 - NOT supported */
+			goto out;
+			break;
+
+		}
+#endif
+	default:
+		{
+			m_freem(m);
+			break;
+		}
+	}
+	return;
+out:
+	m_freem(m);
+}
+
+void
+sctp_over_udp_stop(void)
+{
+	struct socket *sop;
+
+	/*
+	 * This function assumes the sysctl caller holds
+	 * sctp_sysctl_info_lock() for writing!
+	 */
+	if (SCTP_BASE_INFO(udp_tun_socket) == NULL) {
+		/* Nothing to do */
+		return;
+	}
+	sop = SCTP_BASE_INFO(udp_tun_socket);
+	soclose(sop);
+	SCTP_BASE_INFO(udp_tun_socket) = NULL;
+}
+
+int
+sctp_over_udp_start(void)
+{
+	uint16_t port;
+	int ret;
+	struct sockaddr_in sin;
+	struct socket *sop = NULL;
+	struct thread *th;
+	struct ucred *cred;
+
+	/*
+	 * This function assumes the sysctl caller holds
+	 * sctp_sysctl_info_lock() for writing!
+	 */
+	port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port);
+	if (port == 0) {
+		/* Must have a port set */
+		return (EINVAL);
+	}
+	if (SCTP_BASE_INFO(udp_tun_socket) != NULL) {
+		/* Already running -- must stop first */
+		return (EALREADY);
+	}
+	th = curthread;
+	cred = th->td_ucred;
+	if ((ret = socreate(PF_INET, &sop,
+	    SOCK_DGRAM, IPPROTO_UDP, cred, th))) {
+		return (ret);
+	}
+	SCTP_BASE_INFO(udp_tun_socket) = sop;
+	/* call the special UDP hook */
+	ret = udp_set_kernel_tunneling(sop, sctp_recv_udp_tunneled_packet);
+	if (ret) {
+		goto exit_stage_left;
+	}
+	/* OK, we have a socket; bind it to the port */
+	memset(&sin, 0, sizeof(sin));
+	sin.sin_len = sizeof(sin);
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(port);
+	ret = sobind(sop, (struct sockaddr *)&sin, th);
+	if (ret) {
+		/* Close up; we can't get the port */
+exit_stage_left:
+		sctp_over_udp_stop();
+		return (ret);
+	}
+	/*
+	 * OK, we should now get UDP packets directly to our input routine
+	 * sctp_recv_udp_tunneled_packet().
+	 */
+	return (0);
+}
diff --git a/freebsd/sys/netinet/sctputil.h b/freebsd/sys/netinet/sctputil.h
new file mode 100644
index 00000000..b1bee3a4
--- /dev/null
+++ b/freebsd/sys/netinet/sctputil.h
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * a) Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * b) Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the distribution. + * + * c) Neither the name of Cisco Systems, Inc. nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + + +/* $KAME: sctputil.h,v 1.15 2005/03/06 16:04:19 itojun Exp $ */ + +#include +__FBSDID("$FreeBSD$"); +#ifndef __sctputil_h__ +#define __sctputil_h__ + + +#if defined(_KERNEL) || defined(__Userspace__) + +#define SCTP_READ_LOCK_HELD 1 +#define SCTP_READ_LOCK_NOT_HELD 0 + +#ifdef SCTP_ASOCLOG_OF_TSNS +void sctp_print_out_track_log(struct sctp_tcb *stcb); + +#endif + +#ifdef SCTP_MBUF_LOGGING +struct mbuf *sctp_m_free(struct mbuf *m); +void sctp_m_freem(struct mbuf *m); + +#else +#define sctp_m_free m_free +#define sctp_m_freem m_freem +#endif + +#if defined(SCTP_LOCAL_TRACE_BUF) || defined(__APPLE__) +void + sctp_log_trace(uint32_t fr, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f); + +#endif + +#define sctp_get_associd(stcb) ((sctp_assoc_t)stcb->asoc.assoc_id) + + +/* + * Function prototypes + */ +uint32_t +sctp_get_ifa_hash_val(struct sockaddr *addr); + +struct sctp_ifa * + sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock); + +struct sctp_ifa * + sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock); + +uint32_t sctp_select_initial_TSN(struct sctp_pcb *); + +uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int); + +int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t); + +void sctp_fill_random_store(struct sctp_pcb *); + +void +sctp_timer_start(int, struct sctp_inpcb *, struct sctp_tcb *, + struct sctp_nets *); + +void +sctp_timer_stop(int, struct sctp_inpcb *, struct sctp_tcb *, + struct sctp_nets *, uint32_t); + +int + sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id); + +void + sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t); + +void +sctp_add_to_readq(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_queued_to_read *control, + struct sockbuf *sb, + int end, + int inpread_locked, + int so_locked +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + 
SCTP_UNUSED +#endif +); + +int +sctp_append_to_readq(struct sctp_inpcb *inp, + struct sctp_tcb *stcb, + struct sctp_queued_to_read *control, + struct mbuf *m, + int end, + int new_cumack, + struct sockbuf *sb); + + +void sctp_iterator_worker(void); + +uint32_t sctp_get_prev_mtu(uint32_t); +uint32_t sctp_get_next_mtu(struct sctp_inpcb *, uint32_t); + +void + sctp_timeout_handler(void *); + +uint32_t +sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *, + struct sctp_nets *, struct timeval *, int); + +uint32_t sctp_calculate_len(struct mbuf *); + +caddr_t sctp_m_getptr(struct mbuf *, int, int, uint8_t *); + +struct sctp_paramhdr * +sctp_get_next_param(struct mbuf *, int, + struct sctp_paramhdr *, int); + +int sctp_add_pad_tombuf(struct mbuf *, int); + +int sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); + +void +sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +void +sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, + struct sctp_inpcb *new_inp, + struct sctp_tcb *stcb, int waitflags); + + +void sctp_stop_timers_for_shutdown(struct sctp_tcb *); + +void +sctp_report_all_outbound(struct sctp_tcb *, int, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +int sctp_expand_mapping_array(struct sctp_association *, uint32_t); + +void +sctp_abort_notification(struct sctp_tcb *, int, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +/* We abort responding to an IP packet for some reason */ +void +sctp_abort_association(struct sctp_inpcb *, struct sctp_tcb *, + struct mbuf *, int, struct sctphdr *, struct mbuf *, uint32_t, uint16_t); + + +/* We choose to abort via user input */ +void +sctp_abort_an_association(struct sctp_inpcb *, struct sctp_tcb *, int, + struct mbuf *, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +void +sctp_handle_ootb(struct mbuf *, int, int, struct sctphdr *, + struct sctp_inpcb *, struct mbuf *, uint32_t, uint16_t); + +int +sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, + int totaddr, int *error); + +struct sctp_tcb * +sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, + int *totaddr, int *num_v4, int *num_v6, int *error, int limit, int *bad_addr); + +int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *); + +#ifdef INET6 +uint32_t sctp_is_same_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); + +struct sockaddr_in6 * + sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); + +#define sctp_recover_scope_mac(addr, store) do { \ + if ((addr->sin6_family == AF_INET6) && \ + (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr))) { \ + *store = *addr; \ + if (addr->sin6_scope_id == 0) { \ + if (!sa6_recoverscope(store)) { \ + addr = store; \ + } \ + } else { \ + in6_clearscope(&addr->sin6_addr); \ + addr = store; \ + } \ + } \ +} while (0) +#endif + +int sctp_cmpaddr(struct sockaddr *, struct sockaddr *); + +void sctp_print_address(struct sockaddr *); +void sctp_print_address_pkt(struct ip *, struct sctphdr *); + +int +sctp_release_pr_sctp_chunk(struct sctp_tcb *, struct sctp_tmit_chunk *, + int, int +#if !defined(__APPLE__) && !defined(SCTP_SO_LOCK_TESTING) + SCTP_UNUSED +#endif +); + +struct mbuf *sctp_generate_invmanparam(int); + +void +sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, + struct sockaddr *sa, sctp_assoc_t assoc_id, + uint32_t vrf_id, 
int *error, void *p); +void +sctp_bindx_delete_address(struct socket *so, struct sctp_inpcb *inp, + struct sockaddr *sa, sctp_assoc_t assoc_id, + uint32_t vrf_id, int *error); + +int sctp_local_addr_count(struct sctp_tcb *stcb); + +#ifdef SCTP_MBCNT_LOGGING +void +sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *, + struct sctp_tmit_chunk *, int); + +#else +#define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt) \ +do { \ + if (tp1->data != NULL) { \ + atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \ + if ((asoc)->total_output_queue_size >= tp1->book_size) { \ + atomic_subtract_int(&((asoc)->total_output_queue_size), tp1->book_size); \ + } else { \ + (asoc)->total_output_queue_size = 0; \ + } \ + if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ + if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \ + atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \ + } else { \ + stcb->sctp_socket->so_snd.sb_cc = 0; \ + } \ + } \ + } \ +} while (0) + +#endif + +#define sctp_free_spbufspace(stcb, asoc, sp) \ +do { \ + if (sp->data != NULL) { \ + if ((asoc)->total_output_queue_size >= sp->length) { \ + atomic_subtract_int(&(asoc)->total_output_queue_size, sp->length); \ + } else { \ + (asoc)->total_output_queue_size = 0; \ + } \ + if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ + if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \ + atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \ + } else { \ + stcb->sctp_socket->so_snd.sb_cc = 0; \ + } \ + } \ + } \ +} while (0) + +#define sctp_snd_sb_alloc(stcb, sz) \ +do { \ + atomic_add_int(&stcb->asoc.total_output_queue_size,sz); \ + if ((stcb->sctp_socket != NULL) && \ + ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ + atomic_add_int(&stcb->sctp_socket->so_snd.sb_cc,sz); \ + } \ +} while (0) + +/* new functions to start/stop udp tunneling */ +void sctp_over_udp_stop(void); +int sctp_over_udp_start(void); + +int +sctp_soreceive(struct socket *so, struct sockaddr **psa, + struct uio *uio, + struct mbuf **mp0, + struct mbuf **controlp, + int *flagsp); + + +/* For those not passing mbufs, this does the + * translations for you. Caller owns memory + * of size controllen returned in controlp. 
+ */ +int +sctp_l_soreceive(struct socket *so, + struct sockaddr **name, + struct uio *uio, + char **controlp, + int *controllen, + int *flag); + + +void + sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d); + +void +sctp_wakeup_log(struct sctp_tcb *stcb, + uint32_t cumtsn, + uint32_t wake_cnt, int from); + +void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t, int); + +void sctp_log_nagle_event(struct sctp_tcb *stcb, int action); + + +void + sctp_log_mb(struct mbuf *m, int from); + +void +sctp_sblog(struct sockbuf *sb, + struct sctp_tcb *stcb, int from, int incr); + +void +sctp_log_strm_del(struct sctp_queued_to_read *control, + struct sctp_queued_to_read *poschk, + int from); +void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *, int, uint8_t); +void rto_logging(struct sctp_nets *net, int from); + +void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc); + +void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from); +void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t); +void sctp_log_block(uint8_t, struct socket *, struct sctp_association *, int); +void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t); +void sctp_log_mbcnt(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); +void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); +int sctp_fill_stat_log(void *, size_t *); +void sctp_log_fr(uint32_t, uint32_t, uint32_t, int); +void sctp_log_sack(uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, int); +void sctp_log_map(uint32_t, uint32_t, uint32_t, int); +void sctp_print_mapping_array(struct sctp_association *asoc); +void sctp_clr_stat_log(void); + + +#ifdef SCTP_AUDITING_ENABLED +void +sctp_auditing(int, struct sctp_inpcb *, struct sctp_tcb *, + struct sctp_nets *); +void sctp_audit_log(uint8_t, uint8_t); + +#endif + + +#endif /* _KERNEL */ +#endif diff --git a/freebsd/sys/netinet/tcp.h b/freebsd/sys/netinet/tcp.h new file mode 100644 index 00000000..19b1c57f --- /dev/null +++ b/freebsd/sys/netinet/tcp.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/tcp_debug.c b/freebsd/sys/netinet/tcp_debug.c new file mode 100644 index 00000000..52a82193 --- /dev/null +++ b/freebsd/sys/netinet/tcp_debug.c @@ -0,0 +1,226 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_debug.c 8.1 (Berkeley) 6/10/93 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#ifdef TCPDEBUG +/* load symbolic names */ +#define PRUREQUESTS +#define TCPSTATES +#define TCPTIMERS +#define TANAMES +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#ifdef TCPDEBUG +static int tcpconsdebug = 0; +#endif + +/* + * Global ring buffer of TCP debugging state. Each entry captures a snapshot + * of TCP connection state at any given moment. tcp_debx addresses at the + * next available slot. There is no explicit export of this data structure; + * it will be read via /dev/kmem by debugging tools. + */ +static struct tcp_debug tcp_debug[TCP_NDEBUG]; +static int tcp_debx; + +/* + * All global state is protected by tcp_debug_mtx; tcp_trace() is split into + * two parts, one of which saves connection and other state into the global + * array (locked by tcp_debug_mtx). + */ +struct mtx tcp_debug_mtx; +MTX_SYSINIT(tcp_debug_mtx, &tcp_debug_mtx, "tcp_debug_mtx", MTX_DEF); + +/* + * Save TCP state at a given moment; optionally, both tcpcb and TCP packet + * header state will be saved. + */ +void +tcp_trace(short act, short ostate, struct tcpcb *tp, void *ipgen, + struct tcphdr *th, int req) +{ +#ifdef INET6 + int isipv6; +#endif /* INET6 */ + tcp_seq seq, ack; + int len, flags; + struct tcp_debug *td; + + mtx_lock(&tcp_debug_mtx); + td = &tcp_debug[tcp_debx++]; + if (tcp_debx == TCP_NDEBUG) + tcp_debx = 0; + bzero(td, sizeof(*td)); +#ifdef INET6 + isipv6 = (ipgen != NULL && ((struct ip *)ipgen)->ip_v == 6) ? 1 : 0; +#endif /* INET6 */ + td->td_family = +#ifdef INET6 + (isipv6 != 0) ? AF_INET6 : +#endif + AF_INET; +#ifdef INET + td->td_time = iptime(); +#endif + td->td_act = act; + td->td_ostate = ostate; + td->td_tcb = (caddr_t)tp; + if (tp != NULL) + td->td_cb = *tp; + if (ipgen != NULL) { + switch (td->td_family) { +#ifdef INET + case AF_INET: + bcopy(ipgen, &td->td_ti.ti_i, sizeof(td->td_ti.ti_i)); + break; +#endif +#ifdef INET6 + case AF_INET6: + bcopy(ipgen, td->td_ip6buf, sizeof(td->td_ip6buf)); + break; +#endif + } + } + if (th != NULL) { + switch (td->td_family) { +#ifdef INET + case AF_INET: + td->td_ti.ti_t = *th; + break; +#endif +#ifdef INET6 + case AF_INET6: + td->td_ti6.th = *th; + break; +#endif + } + } + td->td_req = req; + mtx_unlock(&tcp_debug_mtx); +#ifdef TCPDEBUG + if (tcpconsdebug == 0) + return; + if (tp != NULL) + printf("%p %s:", tp, tcpstates[ostate]); + else + printf("???????? "); + printf("%s ", tanames[act]); + switch (act) { + case TA_INPUT: + case TA_OUTPUT: + case TA_DROP: + if (ipgen == NULL || th == NULL) + break; + seq = th->th_seq; + ack = th->th_ack; + len = +#ifdef INET6 + isipv6 ? 
ntohs(((struct ip6_hdr *)ipgen)->ip6_plen) :
+#endif
+		    ((struct ip *)ipgen)->ip_len;
+		if (act == TA_OUTPUT) {
+			seq = ntohl(seq);
+			ack = ntohl(ack);
+			len = ntohs((u_short)len);
+		}
+		if (act == TA_OUTPUT)
+			len -= sizeof (struct tcphdr);
+		if (len)
+			printf("[%x..%x)", seq, seq+len);
+		else
+			printf("%x", seq);
+		printf("@%x, urp=%x", ack, th->th_urp);
+		flags = th->th_flags;
+		if (flags) {
+			char *cp = "<";
+#define pf(f) {					\
+	if (th->th_flags & TH_##f) {		\
+		printf("%s%s", cp, #f);		\
+		cp = ",";			\
+	}					\
+}
+			pf(SYN); pf(ACK); pf(FIN); pf(RST); pf(PUSH); pf(URG);
+			printf(">");
+		}
+		break;
+
+	case TA_USER:
+		printf("%s", prurequests[req&0xff]);
+		if ((req & 0xff) == PRU_SLOWTIMO)
+			printf("<%s>", tcptimers[req>>8]);
+		break;
+	}
+	if (tp != NULL)
+		printf(" -> %s", tcpstates[tp->t_state]);
+	/* print out internal state of tp !?! */
+	printf("\n");
+	if (tp == NULL)
+		return;
+	printf(
+	"\trcv_(nxt,wnd,up) (%lx,%lx,%lx) snd_(una,nxt,max) (%lx,%lx,%lx)\n",
+	    (u_long)tp->rcv_nxt, tp->rcv_wnd, (u_long)tp->rcv_up,
+	    (u_long)tp->snd_una, (u_long)tp->snd_nxt, (u_long)tp->snd_max);
+	printf("\tsnd_(wl1,wl2,wnd) (%lx,%lx,%lx)\n",
+	    (u_long)tp->snd_wl1, (u_long)tp->snd_wl2, tp->snd_wnd);
+#endif /* TCPDEBUG */
+}
diff --git a/freebsd/sys/netinet/tcp_debug.h b/freebsd/sys/netinet/tcp_debug.h
new file mode 100644
index 00000000..0c103958
--- /dev/null
+++ b/freebsd/sys/netinet/tcp_debug.h
@@ -0,0 +1,80 @@
+/*-
+ * Copyright (c) 1982, 1986, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_debug.h	8.1 (Berkeley) 6/10/93
+ * $FreeBSD$
+ */
+
+#ifndef _NETINET_TCP_DEBUG_HH_
+#define _NETINET_TCP_DEBUG_HH_
+
+struct tcp_debug {
+	uint32_t	td_time;	/* network format */
+	short	td_act;
+	short	td_ostate;
+	caddr_t	td_tcb;
+	int	td_family;
+	/*
+	 * Co-existence of td_ti and td_ti6 below is ugly, but it is
+	 * necessary to achieve backward compatibility to some extent.
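+	 * Only one of the two is meaningful for any given record; td_family
+	 * says which, and tcp_trace() fills the matching member.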
+ */ + struct tcpiphdr td_ti; + struct { +#define IP6_HDR_LEN 40 /* sizeof(struct ip6_hdr) */ +#if !defined(_KERNEL) && defined(INET6) + struct ip6_hdr ip6; +#else + u_char ip6buf[IP6_HDR_LEN]; +#endif + struct tcphdr th; + } td_ti6; +#define td_ip6buf td_ti6.ip6buf + short td_req; + struct tcpcb td_cb; +}; + +#define TA_INPUT 0 +#define TA_OUTPUT 1 +#define TA_USER 2 +#define TA_RESPOND 3 +#define TA_DROP 4 + +#ifdef TANAMES +static const char *tanames[] = + { "input", "output", "user", "respond", "drop" }; +#endif + +#define TCP_NDEBUG 100 + +#ifndef _KERNEL +/* XXX common variables for broken applications. */ +struct tcp_debug tcp_debug[TCP_NDEBUG]; +int tcp_debx; +#endif + +#endif /* !_NETINET_TCP_DEBUG_HH_ */ diff --git a/freebsd/sys/netinet/tcp_fsm.h b/freebsd/sys/netinet/tcp_fsm.h new file mode 100644 index 00000000..253e53d4 --- /dev/null +++ b/freebsd/sys/netinet/tcp_fsm.h @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_fsm.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_FSM_HH_ +#define _NETINET_TCP_FSM_HH_ + +/* + * TCP FSM state definitions. + * + * Per RFC793, September, 1981. 
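+ *
+ * The numeric ordering below is load-bearing: the TCPS_HAVE*() macros that
+ * follow answer "has this connection reached state X yet?" with a single
+ * integer comparison.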
+ */ + +#define TCP_NSTATES 11 + +#define TCPS_CLOSED 0 /* closed */ +#define TCPS_LISTEN 1 /* listening for connection */ +#define TCPS_SYN_SENT 2 /* active, have sent syn */ +#define TCPS_SYN_RECEIVED 3 /* have sent and received syn */ +/* states < TCPS_ESTABLISHED are those where connections not established */ +#define TCPS_ESTABLISHED 4 /* established */ +#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */ +/* states > TCPS_CLOSE_WAIT are those where user has closed */ +#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */ +#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */ +#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */ +/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */ +#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */ +#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */ + +/* for KAME src sync over BSD*'s */ +#define TCP6_NSTATES TCP_NSTATES +#define TCP6S_CLOSED TCPS_CLOSED +#define TCP6S_LISTEN TCPS_LISTEN +#define TCP6S_SYN_SENT TCPS_SYN_SENT +#define TCP6S_SYN_RECEIVED TCPS_SYN_RECEIVED +#define TCP6S_ESTABLISHED TCPS_ESTABLISHED +#define TCP6S_CLOSE_WAIT TCPS_CLOSE_WAIT +#define TCP6S_FIN_WAIT_1 TCPS_FIN_WAIT_1 +#define TCP6S_CLOSING TCPS_CLOSING +#define TCP6S_LAST_ACK TCPS_LAST_ACK +#define TCP6S_FIN_WAIT_2 TCPS_FIN_WAIT_2 +#define TCP6S_TIME_WAIT TCPS_TIME_WAIT + +#define TCPS_HAVERCVDSYN(s) ((s) >= TCPS_SYN_RECEIVED) +#define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) +#define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) + +#ifdef TCPOUTFLAGS +/* + * Flags used when sending segments in tcp_output. Basic flags (TH_RST, + * TH_ACK,TH_SYN,TH_FIN) are totally determined by state, with the proviso + * that TH_FIN is sent only if all data queued for output is included in the + * segment. + */ +static u_char tcp_outflags[TCP_NSTATES] = { + TH_RST|TH_ACK, /* 0, CLOSED */ + 0, /* 1, LISTEN */ + TH_SYN, /* 2, SYN_SENT */ + TH_SYN|TH_ACK, /* 3, SYN_RECEIVED */ + TH_ACK, /* 4, ESTABLISHED */ + TH_ACK, /* 5, CLOSE_WAIT */ + TH_FIN|TH_ACK, /* 6, FIN_WAIT_1 */ + TH_FIN|TH_ACK, /* 7, CLOSING */ + TH_FIN|TH_ACK, /* 8, LAST_ACK */ + TH_ACK, /* 9, FIN_WAIT_2 */ + TH_ACK, /* 10, TIME_WAIT */ +}; +#endif + +#ifdef KPROF +int tcp_acounts[TCP_NSTATES][PRU_NREQ]; +#endif + +#ifdef TCPSTATES +static char const * const tcpstates[] = { + "CLOSED", "LISTEN", "SYN_SENT", "SYN_RCVD", + "ESTABLISHED", "CLOSE_WAIT", "FIN_WAIT_1", "CLOSING", + "LAST_ACK", "FIN_WAIT_2", "TIME_WAIT", +}; +#endif + +#endif diff --git a/freebsd/sys/netinet/tcp_hostcache.c b/freebsd/sys/netinet/tcp_hostcache.c new file mode 100644 index 00000000..07b78cfe --- /dev/null +++ b/freebsd/sys/netinet/tcp_hostcache.c @@ -0,0 +1,693 @@ +#include + +/*- + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * The tcp_hostcache moves the tcp-specific cached metrics from the routing
+ * table to a dedicated structure indexed by the remote IP address.  It keeps
+ * information on the measured TCP parameters of past TCP sessions to allow
+ * better initial start values to be used with later connections to/from the
+ * same source.  Depending on the network parameters (delay, bandwidth, max
+ * MTU, congestion window) between local and remote sites, this can lead to
+ * significant speed-ups for new TCP connections after the first one.
+ *
+ * Due to the tcp_hostcache, all TCP-specific metrics information in the
+ * routing table has been removed.  The inpcb no longer keeps a pointer to
+ * the routing entry, and protocol-initiated route cloning has been removed
+ * as well.  With these changes, the routing table has gone back to being
+ * more lightweight and only carries information related to packet
+ * forwarding.
+ *
+ * tcp_hostcache is designed for multiple concurrent access in SMP
+ * environments and high contention.  All bucket rows have their own lock
+ * and thus multiple lookups and modifications can be done at the same time
+ * as long as they are in different bucket rows.  If a request for insertion
+ * of a new record can't be satisfied, it simply returns an empty structure.
+ * Nobody and nothing outside of tcp_hostcache.c will ever point directly to
+ * any entry in the tcp_hostcache.  All communication is done in an
+ * object-oriented way and only functions of tcp_hostcache will manipulate
+ * hostcache entries.  Otherwise, we are unable to achieve good behaviour in
+ * concurrent access situations.  Since tcp_hostcache is only caching
+ * information, there are no fatal consequences if we either can't satisfy
+ * any particular request or have to drop/overwrite an existing entry
+ * because of bucket limit memory constraints.
+ */
+
+/*
+ * Many thanks to jlemon for the basic structure of tcp_syncache, which is
+ * being followed here.
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#endif +#include +#include +#include +#ifdef INET6 +#include +#endif + +#include + +/* Arbitrary values */ +#define TCP_HOSTCACHE_HASHSIZE 512 +#define TCP_HOSTCACHE_BUCKETLIMIT 30 +#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ +#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ + +static VNET_DEFINE(struct tcp_hostcache, tcp_hostcache); +#define V_tcp_hostcache VNET(tcp_hostcache) + +static VNET_DEFINE(struct callout, tcp_hc_callout); +#define V_tcp_hc_callout VNET(tcp_hc_callout) + +static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); +static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); +static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); +static void tcp_hc_purge_internal(int); +static void tcp_hc_purge(void *); + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, + "TCP Host cache"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, + &VNET_NAME(tcp_hostcache.cache_limit), 0, + "Overall entry limit for hostcache"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, + &VNET_NAME(tcp_hostcache.hashsize), 0, + "Size of TCP hostcache hashtable"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, + CTLFLAG_RDTUN, &VNET_NAME(tcp_hostcache.bucket_limit), 0, + "Per-bucket hash limit for hostcache"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, + &VNET_NAME(tcp_hostcache.cache_count), 0, + "Current number of entries in hostcache"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, + &VNET_NAME(tcp_hostcache.expire), 0, + "Expire time of TCP hostcache entries"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, prune, CTLFLAG_RW, + &VNET_NAME(tcp_hostcache.prune), 0, + "Time between purge runs"); + +SYSCTL_VNET_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW, + &VNET_NAME(tcp_hostcache.purgeall), 0, + "Expire all entires on next purge run"); + +SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, + sysctl_tcp_hc_list, "A", "List of all hostcache entries"); + + +static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); + +#define HOSTCACHE_HASH(ip) \ + (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ + V_tcp_hostcache.hashmask) + +/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ +#define HOSTCACHE_HASH6(ip6) \ + (((ip6)->s6_addr32[0] ^ \ + (ip6)->s6_addr32[1] ^ \ + (ip6)->s6_addr32[2] ^ \ + (ip6)->s6_addr32[3]) & \ + V_tcp_hostcache.hashmask) + +#define THC_LOCK(lp) mtx_lock(lp) +#define THC_UNLOCK(lp) mtx_unlock(lp) + +void +tcp_hc_init(void) +{ + int i; + + /* + * Initialize hostcache structures. 
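+	 * The hash size must remain a power of two so that HOSTCACHE_HASH()
+	 * and HOSTCACHE_HASH6() can mask with hashmask instead of dividing;
+	 * a bad tunable is reset to the default below.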
+ */ + V_tcp_hostcache.cache_count = 0; + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; + V_tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; + V_tcp_hostcache.cache_limit = + V_tcp_hostcache.hashsize * V_tcp_hostcache.bucket_limit; + V_tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; + V_tcp_hostcache.prune = TCP_HOSTCACHE_PRUNE; + + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", + &V_tcp_hostcache.hashsize); + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", + &V_tcp_hostcache.cache_limit); + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", + &V_tcp_hostcache.bucket_limit); + if (!powerof2(V_tcp_hostcache.hashsize)) { + printf("WARNING: hostcache hash size is not a power of 2.\n"); + V_tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; /* default */ + } + V_tcp_hostcache.hashmask = V_tcp_hostcache.hashsize - 1; + + /* + * Allocate the hash table. + */ + V_tcp_hostcache.hashbase = (struct hc_head *) + malloc(V_tcp_hostcache.hashsize * sizeof(struct hc_head), + M_HOSTCACHE, M_WAITOK | M_ZERO); + + /* + * Initialize the hash buckets. + */ + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + TAILQ_INIT(&V_tcp_hostcache.hashbase[i].hch_bucket); + V_tcp_hostcache.hashbase[i].hch_length = 0; + mtx_init(&V_tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", + NULL, MTX_DEF); + } + + /* + * Allocate the hostcache entries. + */ + V_tcp_hostcache.zone = + uma_zcreate("hostcache", sizeof(struct hc_metrics), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_max(V_tcp_hostcache.zone, V_tcp_hostcache.cache_limit); + + /* + * Set up periodic cache cleanup. + */ + callout_init(&V_tcp_hc_callout, CALLOUT_MPSAFE); + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, curvnet); +} + +#ifdef VIMAGE +void +tcp_hc_destroy(void) +{ + int i; + + callout_drain(&V_tcp_hc_callout); + + /* Purge all hc entries. */ + tcp_hc_purge_internal(1); + + /* Free the uma zone and the allocated hash table. */ + uma_zdestroy(V_tcp_hostcache.zone); + + for (i = 0; i < V_tcp_hostcache.hashsize; i++) + mtx_destroy(&V_tcp_hostcache.hashbase[i].hch_mtx); + free(V_tcp_hostcache.hashbase, M_HOSTCACHE); +} +#endif + +/* + * Internal function: look up an entry in the hostcache or return NULL. + * + * If an entry has been returned, the caller becomes responsible for + * unlocking the bucket row after he is done reading/modifying the entry. + */ +static struct hc_metrics * +tcp_hc_lookup(struct in_conninfo *inc) +{ + int hash; + struct hc_head *hc_head; + struct hc_metrics *hc_entry; + + KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer")); + + /* + * Hash the foreign ip address. + */ + if (inc->inc_flags & INC_ISIPV6) + hash = HOSTCACHE_HASH6(&inc->inc6_faddr); + else + hash = HOSTCACHE_HASH(&inc->inc_faddr); + + hc_head = &V_tcp_hostcache.hashbase[hash]; + + /* + * Acquire lock for this bucket row; we release the lock if we don't + * find an entry, otherwise the caller has to unlock after he is + * done. + */ + THC_LOCK(&hc_head->hch_mtx); + + /* + * Iterate through entries in bucket row looking for a match. + */ + TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { + if (inc->inc_flags & INC_ISIPV6) { + if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, + sizeof(inc->inc6_faddr)) == 0) + return hc_entry; + } else { + if (memcmp(&inc->inc_faddr, &hc_entry->ip4, + sizeof(inc->inc_faddr)) == 0) + return hc_entry; + } + } + + /* + * We were unsuccessful and didn't find anything. 
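+	 * Drop the bucket row lock before returning NULL; on a hit above,
+	 * the lock stays held for the caller to release.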
+ */ + THC_UNLOCK(&hc_head->hch_mtx); + return NULL; +} + +/* + * Internal function: insert an entry into the hostcache or return NULL if + * unable to allocate a new one. + * + * If an entry has been returned, the caller becomes responsible for + * unlocking the bucket row after he is done reading/modifying the entry. + */ +static struct hc_metrics * +tcp_hc_insert(struct in_conninfo *inc) +{ + int hash; + struct hc_head *hc_head; + struct hc_metrics *hc_entry; + + KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer")); + + /* + * Hash the foreign ip address. + */ + if (inc->inc_flags & INC_ISIPV6) + hash = HOSTCACHE_HASH6(&inc->inc6_faddr); + else + hash = HOSTCACHE_HASH(&inc->inc_faddr); + + hc_head = &V_tcp_hostcache.hashbase[hash]; + + /* + * Acquire lock for this bucket row; we release the lock if we don't + * find an entry, otherwise the caller has to unlock after he is + * done. + */ + THC_LOCK(&hc_head->hch_mtx); + + /* + * If the bucket limit is reached, reuse the least-used element. + */ + if (hc_head->hch_length >= V_tcp_hostcache.bucket_limit || + V_tcp_hostcache.cache_count >= V_tcp_hostcache.cache_limit) { + hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); + /* + * At first we were dropping the last element, just to + * reacquire it in the next two lines again, which isn't very + * efficient. Instead just reuse the least used element. + * We may drop something that is still "in-use" but we can be + * "lossy". + * Just give up if this bucket row is empty and we don't have + * anything to replace. + */ + if (hc_entry == NULL) { + THC_UNLOCK(&hc_head->hch_mtx); + return NULL; + } + TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); + V_tcp_hostcache.hashbase[hash].hch_length--; + V_tcp_hostcache.cache_count--; + TCPSTAT_INC(tcps_hc_bucketoverflow); +#if 0 + uma_zfree(V_tcp_hostcache.zone, hc_entry); +#endif + } else { + /* + * Allocate a new entry, or balk if not possible. + */ + hc_entry = uma_zalloc(V_tcp_hostcache.zone, M_NOWAIT); + if (hc_entry == NULL) { + THC_UNLOCK(&hc_head->hch_mtx); + return NULL; + } + } + + /* + * Initialize basic information of hostcache entry. + */ + bzero(hc_entry, sizeof(*hc_entry)); + if (inc->inc_flags & INC_ISIPV6) + bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6)); + else + hc_entry->ip4 = inc->inc_faddr; + hc_entry->rmx_head = hc_head; + hc_entry->rmx_expire = V_tcp_hostcache.expire; + + /* + * Put it upfront. + */ + TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); + V_tcp_hostcache.hashbase[hash].hch_length++; + V_tcp_hostcache.cache_count++; + TCPSTAT_INC(tcps_hc_added); + + return hc_entry; +} + +/* + * External function: look up an entry in the hostcache and fill out the + * supplied TCP metrics structure. Fills in NULL when no entry was found or + * a value is not set. + */ +void +tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) +{ + struct hc_metrics *hc_entry; + + /* + * Find the right bucket. + */ + hc_entry = tcp_hc_lookup(inc); + + /* + * If we don't have an existing object. 
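+	 * Hand back an all-zero hc_metrics_lite so every metric reads as
+	 * "not known".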
+ */ + if (hc_entry == NULL) { + bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); + return; + } + hc_entry->rmx_hits++; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ + + hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; + hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; + hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; + hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; + hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth; + hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; + hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; + hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; + + /* + * Unlock bucket row. + */ + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: look up an entry in the hostcache and return the + * discovered path MTU. Returns NULL if no entry is found or value is not + * set. + */ +u_long +tcp_hc_getmtu(struct in_conninfo *inc) +{ + struct hc_metrics *hc_entry; + u_long mtu; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + return 0; + } + hc_entry->rmx_hits++; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ + + mtu = hc_entry->rmx_mtu; + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); + return mtu; +} + +/* + * External function: update the MTU value of an entry in the hostcache. + * Creates a new entry if none was found. + */ +void +tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) +{ + struct hc_metrics *hc_entry; + + /* + * Find the right bucket. + */ + hc_entry = tcp_hc_lookup(inc); + + /* + * If we don't have an existing object, try to insert a new one. + */ + if (hc_entry == NULL) { + hc_entry = tcp_hc_insert(inc); + if (hc_entry == NULL) + return; + } + hc_entry->rmx_updates++; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ + + hc_entry->rmx_mtu = mtu; + + /* + * Put it upfront so we find it faster next time. + */ + TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + + /* + * Unlock bucket row. + */ + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: update the TCP metrics of an entry in the hostcache. + * Creates a new entry if none was found. 
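+ * Each nonzero metric in *hcml is folded into the cache as the arithmetic
+ * mean of the cached and the new value, so a single outlier sample cannot
+ * wipe out the accumulated history.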
+ */ +void +tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) +{ + struct hc_metrics *hc_entry; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + hc_entry = tcp_hc_insert(inc); + if (hc_entry == NULL) + return; + } + hc_entry->rmx_updates++; + hc_entry->rmx_expire = V_tcp_hostcache.expire; /* start over again */ + + if (hcml->rmx_rtt != 0) { + if (hc_entry->rmx_rtt == 0) + hc_entry->rmx_rtt = hcml->rmx_rtt; + else + hc_entry->rmx_rtt = + (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; + TCPSTAT_INC(tcps_cachedrtt); + } + if (hcml->rmx_rttvar != 0) { + if (hc_entry->rmx_rttvar == 0) + hc_entry->rmx_rttvar = hcml->rmx_rttvar; + else + hc_entry->rmx_rttvar = + (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; + TCPSTAT_INC(tcps_cachedrttvar); + } + if (hcml->rmx_ssthresh != 0) { + if (hc_entry->rmx_ssthresh == 0) + hc_entry->rmx_ssthresh = hcml->rmx_ssthresh; + else + hc_entry->rmx_ssthresh = + (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; + TCPSTAT_INC(tcps_cachedssthresh); + } + if (hcml->rmx_bandwidth != 0) { + if (hc_entry->rmx_bandwidth == 0) + hc_entry->rmx_bandwidth = hcml->rmx_bandwidth; + else + hc_entry->rmx_bandwidth = + (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; + /* TCPSTAT_INC(tcps_cachedbandwidth); */ + } + if (hcml->rmx_cwnd != 0) { + if (hc_entry->rmx_cwnd == 0) + hc_entry->rmx_cwnd = hcml->rmx_cwnd; + else + hc_entry->rmx_cwnd = + (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; + /* TCPSTAT_INC(tcps_cachedcwnd); */ + } + if (hcml->rmx_sendpipe != 0) { + if (hc_entry->rmx_sendpipe == 0) + hc_entry->rmx_sendpipe = hcml->rmx_sendpipe; + else + hc_entry->rmx_sendpipe = + (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2; + /* TCPSTAT_INC(tcps_cachedsendpipe); */ + } + if (hcml->rmx_recvpipe != 0) { + if (hc_entry->rmx_recvpipe == 0) + hc_entry->rmx_recvpipe = hcml->rmx_recvpipe; + else + hc_entry->rmx_recvpipe = + (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2; + /* TCPSTAT_INC(tcps_cachedrecvpipe); */ + } + + TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * Sysctl function: prints the list and values of all hostcache entries in + * unsorted order. + */ +static int +sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) +{ + int bufsize; + int linesize = 128; + char *p, *buf; + int len, i, error; + struct hc_metrics *hc_entry; +#ifdef INET6 + char ip6buf[INET6_ADDRSTRLEN]; +#endif + + bufsize = linesize * (V_tcp_hostcache.cache_count + 1); + + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + + len = snprintf(p, linesize, + "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH " + " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); + p += len; + +#define msec(u) (((u) + 500) / 1000) + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH(hc_entry, &V_tcp_hostcache.hashbase[i].hch_bucket, + rmx_q) { + len = snprintf(p, linesize, + "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " + "%4lu %4lu %4i\n", + hc_entry->ip4.s_addr ? 
inet_ntoa(hc_entry->ip4) : +#ifdef INET6 + ip6_sprintf(ip6buf, &hc_entry->ip6), +#else + "IPv6?", +#endif + hc_entry->rmx_mtu, + hc_entry->rmx_ssthresh, + msec(hc_entry->rmx_rtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), + msec(hc_entry->rmx_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), + hc_entry->rmx_bandwidth * 8, + hc_entry->rmx_cwnd, + hc_entry->rmx_sendpipe, + hc_entry->rmx_recvpipe, + hc_entry->rmx_hits, + hc_entry->rmx_updates, + hc_entry->rmx_expire); + p += len; + } + THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + } +#undef msec + error = SYSCTL_OUT(req, buf, p - buf); + free(buf, M_TEMP); + return(error); +} + +/* + * Caller has to make sure the curvnet is set properly. + */ +static void +tcp_hc_purge_internal(int all) +{ + struct hc_metrics *hc_entry, *hc_next; + int i; + + for (i = 0; i < V_tcp_hostcache.hashsize; i++) { + THC_LOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH_SAFE(hc_entry, + &V_tcp_hostcache.hashbase[i].hch_bucket, rmx_q, hc_next) { + if (all || hc_entry->rmx_expire <= 0) { + TAILQ_REMOVE(&V_tcp_hostcache.hashbase[i].hch_bucket, + hc_entry, rmx_q); + uma_zfree(V_tcp_hostcache.zone, hc_entry); + V_tcp_hostcache.hashbase[i].hch_length--; + V_tcp_hostcache.cache_count--; + } else + hc_entry->rmx_expire -= V_tcp_hostcache.prune; + } + THC_UNLOCK(&V_tcp_hostcache.hashbase[i].hch_mtx); + } +} + +/* + * Expire and purge (old|all) entries in the tcp_hostcache. Runs + * periodically from the callout. + */ +static void +tcp_hc_purge(void *arg) +{ + CURVNET_SET((struct vnet *) arg); + int all = 0; + + if (V_tcp_hostcache.purgeall) { + all = 1; + V_tcp_hostcache.purgeall = 0; + } + + tcp_hc_purge_internal(all); + + callout_reset(&V_tcp_hc_callout, V_tcp_hostcache.prune * hz, + tcp_hc_purge, arg); + CURVNET_RESTORE(); +} diff --git a/freebsd/sys/netinet/tcp_hostcache.h b/freebsd/sys/netinet/tcp_hostcache.h new file mode 100644 index 00000000..a494ed03 --- /dev/null +++ b/freebsd/sys/netinet/tcp_hostcache.h @@ -0,0 +1,82 @@ +/*- + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * Many thanks to jlemon for basic structure of tcp_syncache which is being + * followed here. + */ + +#ifndef _NETINET_TCP_HOSTCACHE_HH_ +#define _NETINET_TCP_HOSTCACHE_HH_ + +TAILQ_HEAD(hc_qhead, hc_metrics); + +struct hc_head { + struct hc_qhead hch_bucket; + u_int hch_length; + struct mtx hch_mtx; +}; + +struct hc_metrics { + /* housekeeping */ + TAILQ_ENTRY(hc_metrics) rmx_q; + struct hc_head *rmx_head; /* head of bucket tail queue */ + struct in_addr ip4; /* IP address */ + struct in6_addr ip6; /* IP6 address */ + /* endpoint specific values for tcp */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + /* TCP hostcache internal data */ + int rmx_expire; /* lifetime for object */ + u_long rmx_hits; /* number of hits */ + u_long rmx_updates; /* number of updates */ +}; + +struct tcp_hostcache { + struct hc_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + int expire; + int prune; + int purgeall; +}; + +#endif /* !_NETINET_TCP_HOSTCACHE_HH_*/ diff --git a/freebsd/sys/netinet/tcp_input.c b/freebsd/sys/netinet/tcp_input.c new file mode 100644 index 00000000..85daf203 --- /dev/null +++ b/freebsd/sys/netinet/tcp_input.c @@ -0,0 +1,3453 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h"		/* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>		/* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
+static const int tcprexmtthresh = 3;
+
+VNET_DEFINE(struct tcpstat, tcpstat);
+SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
+    &VNET_NAME(tcpstat), tcpstat,
+    "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
+
+int tcp_log_in_vain = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
+    &tcp_log_in_vain, 0,
+    "Log all incoming TCP segments to closed ports");
+
+VNET_DEFINE(int, blackhole) = 0;
+#define	V_blackhole		VNET(blackhole)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
+    &VNET_NAME(blackhole), 0,
+    "Do not send RST on segments to closed ports");
+
+VNET_DEFINE(int, tcp_delack_enabled) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
+    &VNET_NAME(tcp_delack_enabled), 0,
+    "Delay ACK to try and piggyback it onto a data packet");
+
+VNET_DEFINE(int, drop_synfin) = 0;
+#define	V_drop_synfin		VNET(drop_synfin)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
+    &VNET_NAME(drop_synfin), 0,
+    "Drop TCP packets with SYN+FIN set");
+
+VNET_DEFINE(int, tcp_do_rfc3042) = 1;
+#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc3042), 0,
+    "Enable RFC 3042 (Limited Transmit)");
+
+VNET_DEFINE(int, tcp_do_rfc3390) = 1;
+#define	V_tcp_do_rfc3390	VNET(tcp_do_rfc3390)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc3390), 0,
+    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
+
+VNET_DEFINE(int, tcp_do_rfc3465) = 1;
+#define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc3465), 0,
+    "Enable RFC 3465 (Appropriate Byte Counting)");
+
+VNET_DEFINE(int, tcp_abc_l_var) = 2;
+#define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
+    &VNET_NAME(tcp_abc_l_var), 2,
+    "Cap the max cwnd increment during slow-start to this number of segments");
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
+
+VNET_DEFINE(int, tcp_do_ecn) = 0;
+SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_ecn), 0,
+    "TCP ECN support");
+
+VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+    &VNET_NAME(tcp_ecn_maxretries), 0,
+    "Max retries before giving up on ECN");
+
+VNET_DEFINE(int, tcp_insecure_rst) = 0;
+#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
+    &VNET_NAME(tcp_insecure_rst), 0,
+    "Follow the old (insecure) criteria for accepting RST packets");
+
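All of the knobs above land under the net.inet.tcp sysctl tree. As a usage illustration only (not part of this change), a FreeBSD user-space program can inspect one of them with sysctlbyname(3); the OID name follows directly from the declaration above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int val;
        size_t len = sizeof(val);

        /* Read net.inet.tcp.blackhole, declared by the code above. */
        if (sysctlbyname("net.inet.tcp.blackhole", &val, &len, NULL, 0) == -1) {
                perror("sysctlbyname");
                return (1);
        }
        printf("blackhole = %d\n", val);
        return (0);
}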
+VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; +#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW, + &VNET_NAME(tcp_do_autorcvbuf), 0, + "Enable automatic receive buffer sizing"); + +VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024; +#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW, + &VNET_NAME(tcp_autorcvbuf_inc), 0, + "Incrementor step size of automatic receive buffer"); + +VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024; +#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW, + &VNET_NAME(tcp_autorcvbuf_max), 0, + "Max size of automatic receive buffer"); + +int tcp_read_locking = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW, + &tcp_read_locking, 0, "Enable read locking strategy"); + +VNET_DEFINE(struct inpcbhead, tcb); +#define tcb6 tcb /* for KAME src sync over BSD*'s */ +VNET_DEFINE(struct inpcbinfo, tcbinfo); + +static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_do_segment(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, int, int, uint8_t, + int); +static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, + struct tcpcb *, int, int); +static void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +static void tcp_xmit_timer(struct tcpcb *, int); +static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); +static void inline + tcp_congestion_exp(struct tcpcb *); + +/* + * Kernel module interface for updating tcpstat. The argument is an index + * into tcpstat treated as an array of u_long. While this encodes the + * general layout of tcpstat into the caller, it doesn't encode its location, + * so that future changes to add, for example, per-CPU stats support won't + * cause binary compatibility problems for kernel modules. + */ +void +kmod_tcpstat_inc(int statnum) +{ + + (*((u_long *)&V_tcpstat + statnum))++; +} + +static void inline +tcp_congestion_exp(struct tcpcb *tp) +{ + u_int win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / + 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + ENTER_FASTRECOVERY(tp); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; +} + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ + nd6_nud_hint(NULL, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Indicate whether this ack should be delayed. We can delay the ack if + * - there is no delayed ack timer in progress and + * - our last ack wasn't a 0-sized window. We never want to delay + * the ack that opens up a 0-sized window and + * - delayed acks are enabled or + * - this is a half-synchronized T/TCP connection. 
+ */ +#define DELAY_ACK(tp) \ + ((!tcp_timer_active(tp, TT_DELACK) && \ + (tp->t_flags & TF_RXWIN0SENT) == 0) && \ + (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) + +/* + * TCP input handling is split into multiple parts: + * tcp6_input is a thin wrapper around tcp_input for the extended + * ip6_protox[] call format in ip6_input + * tcp_input handles primary segment validation, inpcb lookup and + * SYN processing on listen sockets + * tcp_do_segment processes the ACK and text of the segment for + * establishing, established and closing connections + */ +#ifdef INET6 +int +tcp6_input(struct mbuf **mp, int *offp, int proto) +{ + struct mbuf *m = *mp; + struct in6_ifaddr *ia6; + + IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE); + + /* + * draft-itojun-ipv6-tcp-to-anycast + * better place to put this in? + */ + ia6 = ip6_getdstifaddr(m); + if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { + struct ip6_hdr *ip6; + + ifa_free(&ia6->ia_ifa); + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); + return IPPROTO_DONE; + } + + tcp_input(m, *offp); + return IPPROTO_DONE; +} +#endif + +void +tcp_input(struct mbuf *m, int off0) +{ + struct tcphdr *th; + struct ip *ip = NULL; + struct ipovly *ipov; + struct inpcb *inp = NULL; + struct tcpcb *tp = NULL; + struct socket *so = NULL; + u_char *optp = NULL; + int optlen = 0; + int len, tlen, off; + int drop_hdrlen; + int thflags; + int rstreason = 0; /* For badport_bandlim accounting purposes */ + uint8_t iptos; +#ifdef IPFIREWALL_FORWARD + struct m_tag *fwd_tag; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; +#else + const void *ip6 = NULL; + const int isipv6 = 0; +#endif + struct tcpopt to; /* options in this segment */ + char *s = NULL; /* address and port logging */ + int ti_locked; +#define TI_UNLOCKED 1 +#define TI_RLOCKED 2 +#define TI_WLOCKED 3 + +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. + */ + u_char tcp_saveipgen[IP6_HDR_LEN]; + struct tcphdr tcp_savetcp; + short ostate = 0; +#endif + +#ifdef INET6 + isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; +#endif + + to.to_flags = 0; + TCPSTAT_INC(tcps_rcvtotal); + + if (isipv6) { +#ifdef INET6 + /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */ + ip6 = mtod(m, struct ip6_hdr *); + tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + TCPSTAT_INC(tcps_rcvbadsum); + goto drop; + } + th = (struct tcphdr *)((caddr_t)ip6 + off0); + + /* + * Be proactive about unspecified IPv6 address in source. + * As we use all-zero to indicate unbounded/unconnected pcb, + * unspecified IPv6 address can be used to confuse us. + * + * Note that packets with unspecified IPv6 destination is + * already dropped in ip6_input. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { + /* XXX stat */ + goto drop; + } +#else + th = NULL; /* XXX: Avoid compiler warning. */ +#endif + } else { + /* + * Get IP and TCP header together in first mbuf. + * Note: IP leaves IP header in first mbuf. 
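The DELAY_ACK() macro at the top of this hunk packs three conditions into one expression. Unrolled into a plain function, with illustrative stand-in fields rather than the real tcpcb members, the decision reads as follows (sketch only):

#include <stdbool.h>

struct conn {                           /* illustrative stand-ins */
        bool delack_timer_armed;        /* tcp_timer_active(tp, TT_DELACK) */
        bool last_win_was_zero;         /* tp->t_flags & TF_RXWIN0SENT */
        bool half_synchronized;         /* tp->t_flags & TF_NEEDSYN */
};

/* Mirror of the DELAY_ACK() predicate; delack_enabled is the sysctl. */
static bool
should_delay_ack(const struct conn *c, bool delack_enabled)
{
        if (c->delack_timer_armed)      /* a delayed ACK is already pending */
                return false;
        if (c->last_win_was_zero)       /* never delay the window-opening ACK */
                return false;
        return delack_enabled || c->half_synchronized;
}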
+ */ + if (off0 > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + off0 = sizeof(struct ip); + } + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) + == NULL) { + TCPSTAT_INC(tcps_rcvshort); + return; + } + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + tlen = ip->ip_len; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + + ip->ip_len + + IPPROTO_TCP)); + th->th_sum ^= 0xffff; +#ifdef TCPDEBUG + ipov->ih_len = (u_short)tlen; + ipov->ih_len = htons(ipov->ih_len); +#endif + } else { + /* + * Checksum extended TCP header and data. + */ + len = sizeof (struct ip) + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = (u_short)tlen; + ipov->ih_len = htons(ipov->ih_len); + th->th_sum = in_cksum(m, len); + } + if (th->th_sum) { + TCPSTAT_INC(tcps_rcvbadsum); + goto drop; + } + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; + } + +#ifdef INET6 + if (isipv6) + iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + else +#endif + iptos = ip->ip_tos; + + /* + * Check that TCP offset makes sense, + * pull out TCP options and adjust length. XXX + */ + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + TCPSTAT_INC(tcps_rcvbadoff); + goto drop; + } + tlen -= off; /* tlen is used instead of ti->ti_len */ + if (off > sizeof (struct tcphdr)) { + if (isipv6) { +#ifdef INET6 + IP6_EXTHDR_CHECK(m, off0, off, ); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)((caddr_t)ip6 + off0); +#endif + } else { + if (m->m_len < sizeof(struct ip) + off) { + if ((m = m_pullup(m, sizeof (struct ip) + off)) + == NULL) { + TCPSTAT_INC(tcps_rcvshort); + return; + } + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)((caddr_t)ip + off0); + } + } + optlen = off - sizeof (struct tcphdr); + optp = (u_char *)(th + 1); + } + thflags = th->th_flags; + + /* + * Convert TCP protocol specific fields to host format. + */ + th->th_seq = ntohl(th->th_seq); + th->th_ack = ntohl(th->th_ack); + th->th_win = ntohs(th->th_win); + th->th_urp = ntohs(th->th_urp); + + /* + * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. + */ + drop_hdrlen = off0 + off; + + /* + * Locate pcb for segment, which requires a lock on tcbinfo. + * Optimisticaly acquire a global read lock rather than a write lock + * unless header flags necessarily imply a state change. There are + * two cases where we might discover later we need a write lock + * despite the flags: ACKs moving a connection out of the syncache, + * and ACKs for a connection in TIMEWAIT. + */ + if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tcp_read_locking == 0) { + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + } else { + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; + } + +findpcb: +#ifdef INVARIANTS + if (ti_locked == TI_RLOCKED) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + else + panic("%s: findpcb ti_locked %d\n", __func__, ti_locked); +#endif + +#ifdef IPFIREWALL_FORWARD + /* + * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
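Both checksum branches above reduce to the RFC 1071 one's-complement sum; in_pseudo() merely seeds it with the pseudo-header. For reference, a self-contained version of the arithmetic over a flat buffer (the kernel's in_cksum() walks mbuf chains instead):

#include <stddef.h>
#include <stdint.h>

/* RFC 1071 Internet checksum over a flat buffer. */
static uint16_t
cksum(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint32_t sum = 0;

        while (len > 1) {               /* sum 16-bit big-endian words */
                sum += (uint32_t)(p[0] << 8 | p[1]);
                p += 2;
                len -= 2;
        }
        if (len == 1)                   /* pad a trailing odd byte */
                sum += (uint32_t)(p[0] << 8);
        while (sum >> 16)               /* fold carries back into 16 bits */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;          /* one's complement of the sum */
}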
+ */ + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + + if (fwd_tag != NULL && isipv6 == 0) { /* IPv6 support is not yet */ + struct sockaddr_in *next_hop; + + next_hop = (struct sockaddr_in *)(fwd_tag+1); + /* + * Transparently forwarded. Pretend to be the destination. + * already got one like this? + */ + inp = in_pcblookup_hash(&V_tcbinfo, + ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, + 0, m->m_pkthdr.rcvif); + if (!inp) { + /* It's new. Try to find the ambushing socket. */ + inp = in_pcblookup_hash(&V_tcbinfo, + ip->ip_src, th->th_sport, + next_hop->sin_addr, + next_hop->sin_port ? + ntohs(next_hop->sin_port) : + th->th_dport, + INPLOOKUP_WILDCARD, + m->m_pkthdr.rcvif); + } + /* Remove the tag from the packet. We don't need it anymore. */ + m_tag_delete(m, fwd_tag); + } else +#endif /* IPFIREWALL_FORWARD */ + { + if (isipv6) { +#ifdef INET6 + inp = in6_pcblookup_hash(&V_tcbinfo, + &ip6->ip6_src, th->th_sport, + &ip6->ip6_dst, th->th_dport, + INPLOOKUP_WILDCARD, + m->m_pkthdr.rcvif); +#endif + } else + inp = in_pcblookup_hash(&V_tcbinfo, + ip->ip_src, th->th_sport, + ip->ip_dst, th->th_dport, + INPLOOKUP_WILDCARD, + m->m_pkthdr.rcvif); + } + + /* + * If the INPCB does not exist then all data in the incoming + * segment is discarded and an appropriate RST is sent back. + * XXX MRT Send RST using which routing table? + */ + if (inp == NULL) { + /* + * Log communication attempts to ports that are not + * in use. + */ + if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) || + tcp_log_in_vain == 2) { + if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) + log(LOG_INFO, "%s; %s: Connection attempt " + "to closed port\n", s, __func__); + } + /* + * When blackholing do not respond with a RST but + * completely ignore the segment and drop it. + */ + if ((V_blackhole == 1 && (thflags & TH_SYN)) || + V_blackhole == 2) + goto dropunlock; + + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + INP_WLOCK(inp); + if (!(inp->inp_flags & INP_HW_FLOWID) + && (m->m_flags & M_FLOWID) + && ((inp->inp_socket == NULL) + || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) { + inp->inp_flags |= INP_HW_FLOWID; + inp->inp_flags &= ~INP_SW_FLOWID; + inp->inp_flowid = m->m_pkthdr.flowid; + } +#ifdef IPSEC +#ifdef INET6 + if (isipv6 && ipsec6_in_reject(m, inp)) { + V_ipsec6stat.in_polvio++; + goto dropunlock; + } else +#endif /* INET6 */ + if (ipsec4_in_reject(m, inp) != 0) { + V_ipsec4stat.in_polvio++; + goto dropunlock; + } +#endif /* IPSEC */ + + /* + * Check the minimum TTL for socket. + */ + if (inp->inp_ip_minttl != 0) { +#ifdef INET6 + if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim) + goto dropunlock; + else +#endif + if (inp->inp_ip_minttl > ip->ip_ttl) + goto dropunlock; + } + + /* + * A previous connection in TIMEWAIT state is supposed to catch stray + * or duplicate segments arriving late. If this segment was a + * legitimate new connection attempt the old INPCB gets removed and + * we can try again to find a listening socket. + * + * At this point, due to earlier optimism, we may hold a read lock on + * the inpcbinfo, rather than a write lock. If so, we need to + * upgrade, or if that fails, acquire a reference on the inpcb, drop + * all locks, acquire a global write lock, and then re-acquire the + * inpcb lock. We may at that point discover that another thread has + * tried to free the inpcb, in which case we need to loop back and + * try to find a new inpcb to deliver to. 
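Each in_pcblookup_hash() call above applies the same policy: prefer an exact four-tuple match, and fall back to a wildcard match on the local side (a listening socket). A sketch over a flat table with hypothetical struct fields; the kernel hashes into buckets rather than scanning:

#include <stddef.h>
#include <stdint.h>

struct pcb {
        uint32_t laddr, faddr;  /* faddr == 0 means wildcard (listener) */
        uint16_t lport, fport;
};

static struct pcb *
pcb_lookup(struct pcb *tbl, size_t n, uint32_t src, uint16_t sport,
    uint32_t dst, uint16_t dport)
{
        struct pcb *wild = NULL;

        for (size_t i = 0; i < n; i++) {
                struct pcb *p = &tbl[i];

                if (p->lport != dport)
                        continue;
                if (p->faddr == src && p->fport == sport && p->laddr == dst)
                        return p;       /* exact connection match wins */
                if (p->faddr == 0 && (p->laddr == dst || p->laddr == 0))
                        wild = p;       /* remember a matching listener */
        }
        return wild;
}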
+ */ +relocked: + if (inp->inp_flags & INP_TIMEWAIT) { + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked)); + + if (ti_locked == TI_RLOCKED) { + if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + INP_WLOCK(inp); + if (in_pcbrele(inp)) { + inp = NULL; + goto findpcb; + } + } else + ti_locked = TI_WLOCKED; + } + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + if (thflags & TH_SYN) + tcp_dooptions(&to, optp, optlen, TO_SYN); + /* + * NB: tcp_twcheck unlocks the INP and frees the mbuf. + */ + if (tcp_twcheck(inp, &to, th, m, tlen)) + goto findpcb; + INP_INFO_WUNLOCK(&V_tcbinfo); + return; + } + /* + * The TCPCB may no longer exist if the connection is winding + * down or it is in the CLOSED state. Either way we drop the + * segment and send an appropriate response. + */ + tp = intotcpcb(inp); + if (tp == NULL || tp->t_state == TCPS_CLOSED) { + rstreason = BANDLIM_RST_CLOSEDPORT; + goto dropwithreset; + } + + /* + * We've identified a valid inpcb, but it could be that we need an + * inpcbinfo write lock and have only a read lock. In this case, + * attempt to upgrade/relock using the same strategy as the TIMEWAIT + * case above. If we relock, we have to jump back to 'relocked' as + * the connection might now be in TIMEWAIT. + */ + if (tp->t_state != TCPS_ESTABLISHED || + (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || + tcp_read_locking == 0) { + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("%s: upgrade check ti_locked %d", __func__, ti_locked)); + + if (ti_locked == TI_RLOCKED) { + if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) { + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); + ti_locked = TI_WLOCKED; + INP_WLOCK(inp); + if (in_pcbrele(inp)) { + inp = NULL; + goto findpcb; + } + goto relocked; + } else + ti_locked = TI_WLOCKED; + } + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + } + +#ifdef MAC + INP_WLOCK_ASSERT(inp); + if (mac_inpcb_check_deliver(inp, m)) + goto dropunlock; +#endif + so = inp->inp_socket; + KASSERT(so != NULL, ("%s: so == NULL", __func__)); +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) { + ostate = tp->t_state; + if (isipv6) { +#ifdef INET6 + bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); +#endif + } else + bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); + tcp_savetcp = *th; + } +#endif + /* + * When the socket is accepting connections (the INPCB is in LISTEN + * state) we look into the SYN cache if this is a new connection + * attempt or the completion of a previous one. + */ + if (so->so_options & SO_ACCEPTCONN) { + struct in_conninfo inc; + + KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but " + "tp not listening", __func__)); + + bzero(&inc, sizeof(inc)); +#ifdef INET6 + if (isipv6) { + inc.inc_flags |= INC_ISIPV6; + inc.inc6_faddr = ip6->ip6_src; + inc.inc6_laddr = ip6->ip6_dst; + } else +#endif + { + inc.inc_faddr = ip->ip_src; + inc.inc_laddr = ip->ip_dst; + } + inc.inc_fport = th->th_sport; + inc.inc_lport = th->th_dport; + inc.inc_fibnum = so->so_fibnum; + + /* + * Check for an existing connection attempt in syncache if + * the flag is only ACK. A successful lookup creates a new + * socket appended to the listen queue in SYN_RECEIVED state. + */ + if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { + /* + * Parse the TCP options here because + * syncookies need access to the reflected + * timestamp. 
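The relock sequence above (pin the inpcb with a reference, drop all locks, take the global write lock, relock the inpcb, then check whether it died in the interim) is the standard recipe when a read lock proves insufficient and an in-place upgrade can fail. The same shape in portable pthread terms, offered as a loose analogy only; pthreads has no try-upgrade primitive, which is exactly why the revalidation step exists:

#include <pthread.h>
#include <stdbool.h>

struct table {
        pthread_rwlock_t lock;
        int generation;         /* bumped whenever entries are freed */
};

/* Returns false if the world changed while we were unlocked. */
static bool
upgrade_and_revalidate(struct table *t, int seen_generation)
{
        pthread_rwlock_unlock(&t->lock);        /* drop the read lock */
        pthread_rwlock_wrlock(&t->lock);        /* reacquire exclusively */
        /* Anything observed under the read lock may be stale now. */
        return t->generation == seen_generation;
}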
+ */ + tcp_dooptions(&to, optp, optlen, 0); + /* + * NB: syncache_expand() doesn't unlock + * inp and tcpinfo locks. + */ + if (!syncache_expand(&inc, &to, th, &so, m)) { + /* + * No syncache entry or ACK was not + * for our SYN/ACK. Send a RST. + * NB: syncache did its own logging + * of the failure cause. + */ + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + if (so == NULL) { + /* + * We completed the 3-way handshake + * but could not allocate a socket + * either due to memory shortage, + * listen queue length limits or + * global socket limits. Send RST + * or wait and have the remote end + * retransmit the ACK for another + * try. + */ + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Socket allocation failed due to " + "limits or memory shortage, %s\n", + s, __func__, + V_tcp_sc_rst_sock_fail ? + "sending RST" : "try again"); + if (V_tcp_sc_rst_sock_fail) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } else + goto dropunlock; + } + /* + * Socket is created in state SYN_RECEIVED. + * Unlock the listen socket, lock the newly + * created socket and update the tp variable. + */ + INP_WUNLOCK(inp); /* listen socket */ + inp = sotoinpcb(so); + INP_WLOCK(inp); /* new connection */ + tp = intotcpcb(inp); + KASSERT(tp->t_state == TCPS_SYN_RECEIVED, + ("%s: ", __func__)); + /* + * Process the segment and the data it + * contains. tcp_do_segment() consumes + * the mbuf chain and unlocks the inpcb. + */ + tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + iptos, ti_locked); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + return; + } + /* + * Segment flag validation for new connection attempts: + * + * Our (SYN|ACK) response was rejected. + * Check with syncache and remove entry to prevent + * retransmits. + * + * NB: syncache_chkrst does its own logging of failure + * causes. + */ + if (thflags & TH_RST) { + syncache_chkrst(&inc, th); + goto dropunlock; + } + /* + * We can't do anything without SYN. + */ + if ((thflags & TH_SYN) == 0) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "SYN is missing, segment ignored\n", + s, __func__); + TCPSTAT_INC(tcps_badsyn); + goto dropunlock; + } + /* + * (SYN|ACK) is bogus on a listen socket. + */ + if (thflags & TH_ACK) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "SYN|ACK invalid, segment rejected\n", + s, __func__); + syncache_badack(&inc); /* XXX: Not needed! */ + TCPSTAT_INC(tcps_badsyn); + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + /* + * If the drop_synfin option is enabled, drop all + * segments with both the SYN and FIN bits set. + * This prevents e.g. nmap from identifying the + * TCP/IP stack. + * XXX: Poor reasoning. nmap has other methods + * and is constantly refining its stack detection + * strategies. + * XXX: This is a violation of the TCP specification + * and was used by RFC1644. + */ + if ((thflags & TH_FIN) && V_drop_synfin) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "SYN|FIN segment ignored (based on " + "sysctl setting)\n", s, __func__); + TCPSTAT_INC(tcps_badsyn); + goto dropunlock; + } + /* + * Segment's flags are (SYN) or (SYN|FIN). + * + * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored + * as they do not affect the state of the TCP FSM. + * The data pointed to by TH_URG and th_urp is ignored. 
+ */ + KASSERT((thflags & (TH_RST|TH_ACK)) == 0, + ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); + KASSERT(thflags & (TH_SYN), + ("%s: Listen socket: TH_SYN not set", __func__)); +#ifdef INET6 + /* + * If deprecated address is forbidden, + * we do not accept SYN to deprecated interface + * address to prevent any new inbound connection from + * getting established. + * When we do not accept SYN, we send a TCP RST, + * with deprecated source address (instead of dropping + * it). We compromise it as it is much better for peer + * to send a RST, and RST will be the final packet + * for the exchange. + * + * If we do not forbid deprecated addresses, we accept + * the SYN packet. RFC2462 does not suggest dropping + * SYN in this case. + * If we decipher RFC2462 5.5.4, it says like this: + * 1. use of deprecated addr with existing + * communication is okay - "SHOULD continue to be + * used" + * 2. use of it with new communication: + * (2a) "SHOULD NOT be used if alternate address + * with sufficient scope is available" + * (2b) nothing mentioned otherwise. + * Here we fall into (2b) case as we have no choice in + * our source address selection - we must obey the peer. + * + * The wording in RFC2462 is confusing, and there are + * multiple description text for deprecated address + * handling - worse, they are not exactly the same. + * I believe 5.5.4 is the best one, so we follow 5.5.4. + */ + if (isipv6 && !V_ip6_use_deprecated) { + struct in6_ifaddr *ia6; + + ia6 = ip6_getdstifaddr(m); + if (ia6 != NULL && + (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { + ifa_free(&ia6->ia_ifa); + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt to deprecated " + "IPv6 address rejected\n", + s, __func__); + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + ifa_free(&ia6->ia_ifa); + } +#endif + /* + * Basic sanity checks on incoming SYN requests: + * Don't respond if the destination is a link layer + * broadcast according to RFC1122 4.2.3.10, p. 104. + * If it is from this socket it must be forged. + * Don't respond if the source or destination is a + * global or subnet broad- or multicast address. + * Note that it is quite possible to receive unicast + * link-layer packets with a broadcast IP address. Use + * in_broadcast() to find them. 
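The sanity checks that follow reject SYNs whose addresses can never belong to a legitimate unicast peer. The IPv4 half reduces to a few address tests; a sketch using the standard macros (the kernel's additional in_broadcast() test needs the receiving interface and is omitted here):

#include <stdbool.h>
#include <stdint.h>
#include <netinet/in.h>         /* IN_MULTICAST, INADDR_BROADCAST */
#include <arpa/inet.h>          /* ntohl, htonl */

/* Addresses in network byte order, as found in the IP header. */
static bool
syn_addrs_plausible(uint32_t src, uint32_t dst,
    uint16_t sport, uint16_t dport)
{
        if (sport == dport && src == dst)       /* LAND-style self talk */
                return false;
        if (IN_MULTICAST(ntohl(src)) || IN_MULTICAST(ntohl(dst)))
                return false;
        if (src == htonl(INADDR_BROADCAST))     /* forged broadcast source */
                return false;
        return true;
}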
+ */ + if (m->m_flags & (M_BCAST|M_MCAST)) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt from broad- or multicast " + "link layer address ignored\n", s, __func__); + goto dropunlock; + } + if (isipv6) { +#ifdef INET6 + if (th->th_dport == th->th_sport && + IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt to/from self " + "ignored\n", s, __func__); + goto dropunlock; + } + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt from/to multicast " + "address ignored\n", s, __func__); + goto dropunlock; + } +#endif + } else { + if (th->th_dport == th->th_sport && + ip->ip_dst.s_addr == ip->ip_src.s_addr) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt from/to self " + "ignored\n", s, __func__); + goto dropunlock; + } + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Listen socket: " + "Connection attempt from/to broad- " + "or multicast address ignored\n", + s, __func__); + goto dropunlock; + } + } + /* + * SYN appears to be valid. Create compressed TCP state + * for syncache. + */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, &tcp_savetcp, 0); +#endif + tcp_dooptions(&to, optp, optlen, TO_SYN); + syncache_add(&inc, &to, th, inp, &so, m); + /* + * Entry added to syncache and mbuf consumed. + * Everything already unlocked by syncache_add(). + */ + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + return; + } + + /* + * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later + * state. tcp_do_segment() always consumes the mbuf chain, unlocks + * the inpcb, and unlocks pcbinfo. + */ + tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + return; + +dropwithreset: + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); + ti_locked = TI_UNLOCKED; + + if (inp != NULL) { + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(inp); + } else + tcp_dropwithreset(m, th, NULL, tlen, rstreason); + m = NULL; /* mbuf chain got consumed. */ + goto drop; + +dropunlock: + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: dropunlock ti_locked %d", __func__, ti_locked); + ti_locked = TI_UNLOCKED; + + if (inp != NULL) + INP_WUNLOCK(inp); + +drop: + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + if (s != NULL) + free(s, M_TCPLOG); + if (m != NULL) + m_freem(m); +} + +static void +tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, + int ti_locked) +{ + int thflags, acked, ourfinisacked, needoutput = 0; + int rstreason, todrop, win; + u_long tiwin; + struct tcpopt to; + +#ifdef TCPDEBUG + /* + * The size of tcp_saveipgen must be the size of the max ip header, + * now IPv6. 
+ */
+	u_char tcp_saveipgen[IP6_HDR_LEN];
+	struct tcphdr tcp_savetcp;
+	short ostate = 0;
+#endif
+	thflags = th->th_flags;
+
+	/*
+	 * If this is either a state-changing packet or current state isn't
+	 * established, we require a write lock on tcbinfo.  Otherwise, we
+	 * allow either a read lock or a write lock, as we may have acquired
+	 * a write lock due to a race.
+	 *
+	 * Require a global write lock for SYN/FIN/RST segments or
+	 * non-established connections; otherwise accept either a read or
+	 * write lock, as we may have conservatively acquired a write lock in
+	 * certain cases in tcp_input() (is this still true?).  Currently we
+	 * will never enter with no lock, so we try to drop it quickly in the
+	 * common pure ack/pure data cases.
+	 */
+	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tp->t_state != TCPS_ESTABLISHED) {
+		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+		    "SYN/FIN/RST/!EST", __func__, ti_locked));
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	} else {
+#ifdef INVARIANTS
+		if (ti_locked == TI_RLOCKED)
+			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		else if (ti_locked == TI_WLOCKED)
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+		else
+			panic("%s: ti_locked %d for EST", __func__,
+			    ti_locked);
+#endif
+	}
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
+	    __func__));
+	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
+	    __func__));
+
+	/*
+	 * Segment received on connection.
+	 * Reset idle time and keep-alive timer.
+	 * XXX: This should be done after segment
+	 * validation to ignore broken/spoofed segs.
+	 */
+	tp->t_rcvtime = ticks;
+	if (TCPS_HAVEESTABLISHED(tp->t_state))
+		tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
+
+	/*
+	 * Unscale the window into a 32-bit value.
+	 * For the SYN_SENT state the scale is zero.
+	 */
+	tiwin = th->th_win << tp->snd_scale;
+
+	/*
+	 * TCP ECN processing.
+	 */
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		if (thflags & TH_CWR)
+			tp->t_flags &= ~TF_ECN_SND_ECE;
+		switch (iptos & IPTOS_ECN_MASK) {
+		case IPTOS_ECN_CE:
+			tp->t_flags |= TF_ECN_SND_ECE;
+			TCPSTAT_INC(tcps_ecn_ce);
+			break;
+		case IPTOS_ECN_ECT0:
+			TCPSTAT_INC(tcps_ecn_ect0);
+			break;
+		case IPTOS_ECN_ECT1:
+			TCPSTAT_INC(tcps_ecn_ect1);
+			break;
+		}
+		/*
+		 * Congestion experienced.
+		 * Ignore if we are already trying to recover.
+		 */
+		if ((thflags & TH_ECE) &&
+		    SEQ_LEQ(th->th_ack, tp->snd_recover)) {
+			TCPSTAT_INC(tcps_ecn_rcwnd);
+			tcp_congestion_exp(tp);
+		}
+	}
+
+	/*
+	 * Parse options on any incoming segment.
+	 */
+	tcp_dooptions(&to, (u_char *)(th + 1),
+	    (th->th_off << 2) - sizeof(struct tcphdr),
+	    (thflags & TH_SYN) ? TO_SYN : 0);
+
+	/*
+	 * If echoed timestamp is later than the current time,
+	 * fall back to non RFC1323 RTT calculation.  Normalize
+	 * timestamp if syncookies were used when this connection
+	 * was established.
+	 */
+	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
+		to.to_tsecr -= tp->ts_offset;
+		if (TSTMP_GT(to.to_tsecr, ticks))
+			to.to_tsecr = 0;
+	}
+
+	/*
+	 * Process options only when we get SYN/ACK back. The SYN case
+	 * for incoming connections is handled in tcp_syncache.
+	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
+	 * or <SYN,ACK>) segment itself is never scaled.
+	 * XXX this is traditional behavior, may need to be cleaned up.
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
+		if ((to.to_flags & TOF_SCALE) &&
+		    (tp->t_flags & TF_REQ_SCALE)) {
+			tp->t_flags |= TF_RCVD_SCALE;
+			tp->snd_scale = to.to_wscale;
+		}
+		/*
+		 * Initial send window.
It will be updated with + * the next incoming segment to the scaled value. + */ + tp->snd_wnd = th->th_win; + if (to.to_flags & TOF_TS) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = ticks; + } + if (to.to_flags & TOF_MSS) + tcp_mss(tp, to.to_mss); + if ((tp->t_flags & TF_SACK_PERMIT) && + (to.to_flags & TOF_SACKPERM) == 0) + tp->t_flags &= ~TF_SACK_PERMIT; + } + + /* + * Header prediction: check for the two common cases + * of a uni-directional data xfer. If the packet has + * no control flags, is in-sequence, the window didn't + * change and we're not retransmitting, it's a + * candidate. If the length is zero and the ack moved + * forward, we're the sender side of the xfer. Just + * free the data acked & wake any higher level process + * that was blocked waiting for space. If the length + * is non-zero and the ack didn't move, we're the + * receiver side. If we're getting packets in-order + * (the reassembly queue is empty), add the data to + * the socket buffer and note that we need a delayed ack. + * Make sure that the hidden state-flags are also off. + * Since we check for TCPS_ESTABLISHED first, it can only + * be TH_NEEDSYN. + */ + if (tp->t_state == TCPS_ESTABLISHED && + th->th_seq == tp->rcv_nxt && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + tp->snd_nxt == tp->snd_max && + tiwin && tiwin == tp->snd_wnd && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + LIST_EMPTY(&tp->t_segq) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { + + /* + * If last ACK falls within this segment's sequence numbers, + * record the timestamp. + * NOTE that the test is modified according to the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + if (tlen == 0) { + if (SEQ_GT(th->th_ack, tp->snd_una) && + SEQ_LEQ(th->th_ack, tp->snd_max) && + tp->snd_cwnd >= tp->snd_wnd && + ((!V_tcp_do_newreno && + !(tp->t_flags & TF_SACK_PERMIT) && + tp->t_dupacks < tcprexmtthresh) || + ((V_tcp_do_newreno || + (tp->t_flags & TF_SACK_PERMIT)) && + !IN_FASTRECOVERY(tp) && + (to.to_flags & TOF_SACK) == 0 && + TAILQ_EMPTY(&tp->snd_holes)))) { + /* + * This is a pure ack for outstanding data. + */ + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: ti_locked %d on pure ACK", + __func__, ti_locked); + ti_locked = TI_UNLOCKED; + + TCPSTAT_INC(tcps_predack); + + /* + * "bad retransmit" recovery. + */ + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) { + TCPSTAT_INC(tcps_sndrexmitbad); + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = + tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp); + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; + } + + /* + * Recalculate the transmit timer / rtt. + * + * Some boxes send broken timestamp replies + * during the SYN+ACK phase, ignore + * timestamps of 0 or we could calculate a + * huge RTT and blow up the retransmit timer. 
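The header-prediction gate above is one long conditional. Restated as a checklist over descriptively named stand-ins for the tcpcb fields (the timestamp-freshness test and the hidden NEEDSYN/NEEDFIN flags are elided for brevity; sketch only):

#include <stdbool.h>
#include <stdint.h>

struct fastpath_in {
        bool     established;           /* state == ESTABLISHED */
        bool     only_ack_set;          /* no SYN/FIN/RST/URG flags */
        uint32_t seq, rcv_nxt;          /* segment exactly in sequence */
        uint32_t snd_nxt, snd_max;      /* nothing being retransmitted */
        uint32_t wnd, snd_wnd;          /* advertised window unchanged */
        bool     reass_empty;           /* no out-of-order data queued */
};

static bool
header_prediction_ok(const struct fastpath_in *s)
{
        return s->established && s->only_ack_set &&
            s->seq == s->rcv_nxt &&
            s->snd_nxt == s->snd_max &&
            s->wnd != 0 && s->wnd == s->snd_wnd &&
            s->reass_empty;
}

When the predicate holds, a zero-length segment is the sender-side pure ACK and a segment with data is the receiver-side in-order case, exactly as the two branches that follow in the code above.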
+ */ + if ((to.to_flags & TOF_TS) != 0 && + to.to_tsecr) { + if (!tp->t_rttlow || + tp->t_rttlow > ticks - to.to_tsecr) + tp->t_rttlow = ticks - to.to_tsecr; + tcp_xmit_timer(tp, + ticks - to.to_tsecr + 1); + } else if (tp->t_rtttime && + SEQ_GT(th->th_ack, tp->t_rtseq)) { + if (!tp->t_rttlow || + tp->t_rttlow > ticks - tp->t_rtttime) + tp->t_rttlow = ticks - tp->t_rtttime; + tcp_xmit_timer(tp, + ticks - tp->t_rtttime); + } + tcp_xmit_bandwidth_limit(tp, th->th_ack); + acked = th->th_ack - tp->snd_una; + TCPSTAT_INC(tcps_rcvackpack); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + sbdrop(&so->so_snd, acked); + if (SEQ_GT(tp->snd_una, tp->snd_recover) && + SEQ_LEQ(th->th_ack, tp->snd_recover)) + tp->snd_recover = th->th_ack - 1; + tp->snd_una = th->th_ack; + /* + * Pull snd_wl2 up to prevent seq wrap relative + * to th_ack. + */ + tp->snd_wl2 = th->th_ack; + tp->t_dupacks = 0; + m_freem(m); + ND6_HINT(tp); /* Some progress has been made. */ + + /* + * If all outstanding data are acked, stop + * retransmit timer, otherwise restart timer + * using current (possibly backed-off) value. + * If process is waiting for space, + * wakeup/selwakeup/signal. If data + * are ready to send, let tcp_output + * decide between more output or persist. + */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp->snd_una == tp->snd_max) + tcp_timer_activate(tp, TT_REXMT, 0); + else if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, + tp->t_rxtcur); + sowwakeup(so); + if (so->so_snd.sb_cc) + (void) tcp_output(tp); + goto check_delack; + } + } else if (th->th_ack == tp->snd_una && + tlen <= sbspace(&so->so_rcv)) { + int newsize = 0; /* automatic sockbuf scaling */ + + /* + * This is a pure, in-sequence data packet with + * nothing on the reassembly queue and we have enough + * buffer space to take it. + */ + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: ti_locked %d on pure data " + "segment", __func__, ti_locked); + ti_locked = TI_UNLOCKED; + + /* Clean receiver SACK report if present */ + if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) + tcp_clean_sackreport(tp); + TCPSTAT_INC(tcps_preddat); + tp->rcv_nxt += tlen; + /* + * Pull snd_wl1 up to prevent seq wrap relative to + * th_seq. + */ + tp->snd_wl1 = th->th_seq; + /* + * Pull rcv_up up to prevent seq wrap relative to + * rcv_nxt. + */ + tp->rcv_up = tp->rcv_nxt; + TCPSTAT_INC(tcps_rcvpack); + TCPSTAT_ADD(tcps_rcvbyte, tlen); + ND6_HINT(tp); /* Some progress has been made */ +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, + (void *)tcp_saveipgen, &tcp_savetcp, 0); +#endif + /* + * Automatic sizing of receive socket buffer. Often the send + * buffer size is not optimally adjusted to the actual network + * conditions at hand (delay bandwidth product). Setting the + * buffer size too small limits throughput on links with high + * bandwidth and high delay (eg. trans-continental/oceanic links). + * + * On the receive side the socket buffer memory is only rarely + * used to any significant extent. This allows us to be much + * more aggressive in scaling the receive socket buffer. For + * the case that the buffer space is actually used to a large + * extent and we run out of kernel memory we can simply drop + * the new segments; TCP on the sender will just retransmit it + * later. 
Setting the buffer size too big may only consume too + * much kernel memory if the application doesn't read() from + * the socket or packet loss or reordering makes use of the + * reassembly queue. + * + * The criteria to step up the receive buffer one notch are: + * 1. the number of bytes received during the time it takes + * one timestamp to be reflected back to us (the RTT); + * 2. received bytes per RTT is within seven eighth of the + * current socket buffer size; + * 3. receive buffer size has not hit maximal automatic size; + * + * This algorithm does one step per RTT at most and only if + * we receive a bulk stream w/o packet losses or reorderings. + * Shrinking the buffer during idle times is not necessary as + * it doesn't consume any memory when idle. + * + * TODO: Only step up if the application is actually serving + * the buffer to better manage the socket buffer resources. + */ + if (V_tcp_do_autorcvbuf && + to.to_tsecr && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) { + if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) && + to.to_tsecr - tp->rfbuf_ts < hz) { + if (tp->rfbuf_cnt > + (so->so_rcv.sb_hiwat / 8 * 7) && + so->so_rcv.sb_hiwat < + V_tcp_autorcvbuf_max) { + newsize = + min(so->so_rcv.sb_hiwat + + V_tcp_autorcvbuf_inc, + V_tcp_autorcvbuf_max); + } + /* Start over with next RTT. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + } else + tp->rfbuf_cnt += tlen; /* add up */ + } + + /* Add data to socket buffer. */ + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + m_freem(m); + } else { + /* + * Set new socket buffer size. + * Give up when limit is reached. + */ + if (newsize) + if (!sbreserve_locked(&so->so_rcv, + newsize, so, NULL)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + m_adj(m, drop_hdrlen); /* delayed header drop */ + sbappendstream_locked(&so->so_rcv, m); + } + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); + if (DELAY_ACK(tp)) { + tp->t_flags |= TF_DELACK; + } else { + tp->t_flags |= TF_ACKNOW; + tcp_output(tp); + } + goto check_delack; + } + } + + /* + * Calculate amount of space in receive window, + * and then do TCP input processing. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); + + /* Reset receive buffer auto scaling when not in bulk receive mode. */ + tp->rfbuf_ts = 0; + tp->rfbuf_cnt = 0; + + switch (tp->t_state) { + + /* + * If the state is SYN_RECEIVED: + * if seg contains an ACK, but not for our SYN/ACK, send a RST. + */ + case TCPS_SYN_RECEIVED: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->snd_una) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + break; + + /* + * If the state is SYN_SENT: + * if seg contains an ACK, but not for our SYN, drop the input. + * if seg contains a RST, then drop the connection. + * if seg does not contain SYN, then drop it. + * Otherwise this is an acceptable SYN segment + * initialize tp->rcv_nxt and tp->irs + * if seg contains ack then advance tp->snd_una + * if seg contains an ECE and ECN support is enabled, the stream + * is ECN capable. 
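The three numbered criteria above amount to a small per-RTT computation. Isolating just that arithmetic, with descriptive stand-ins for the sockbuf and tcpcb fields (sketch):

#include <stdint.h>

/*
 * One auto-scaling step: criterion 1 frames bytes_this_rtt (bytes
 * received within one reflected-timestamp RTT); grow by `inc` only if
 * at least 7/8 of the current size arrived in that RTT (criterion 2)
 * and the maximum has not been reached (criterion 3).
 * Returns the new size, or 0 when no resize is called for.
 */
static uint32_t
autorcvbuf_step(uint32_t bytes_this_rtt, uint32_t cur_size,
    uint32_t inc, uint32_t max)
{
        if (cur_size >= max)
                return 0;                       /* criterion 3 */
        if (bytes_this_rtt <= cur_size / 8 * 7)
                return 0;                       /* criterion 2 */
        return cur_size + inc > max ? max : cur_size + inc;
}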
+ * if SYN has been acked change to ESTABLISHED else SYN_RCVD state + * arrange for segment to be acked (eventually) + * continue processing rest of data/controls, beginning with URG + */ + case TCPS_SYN_SENT: + if ((thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || + SEQ_GT(th->th_ack, tp->snd_max))) { + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) + tp = tcp_drop(tp, ECONNREFUSED); + if (thflags & TH_RST) + goto drop; + if (!(thflags & TH_SYN)) + goto drop; + + tp->irs = th->th_seq; + tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { + TCPSTAT_INC(tcps_connects); + soisconnected(so); +#ifdef MAC + mac_socketpeer_set_from_mbuf(m, so); +#endif + /* Do window scaling on this connection? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + } + tp->rcv_adv += tp->rcv_wnd; + tp->snd_una++; /* SYN is acked */ + /* + * If there's data, delay ACK; if there's also a FIN + * ACKNOW will be turned on later. + */ + if (DELAY_ACK(tp) && tlen != 0) + tcp_timer_activate(tp, TT_DELACK, + tcp_delacktime); + else + tp->t_flags |= TF_ACKNOW; + + if ((thflags & TH_ECE) && V_tcp_do_ecn) { + tp->t_flags |= TF_ECN_PERMIT; + TCPSTAT_INC(tcps_ecn_shs); + } + + /* + * Received in SYN_SENT[*] state. + * Transitions: + * SYN_SENT --> ESTABLISHED + * SYN_SENT* --> FIN_WAIT_1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + thflags &= ~TH_SYN; + } else { + tp->t_state = TCPS_ESTABLISHED; + tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); + } + } else { + /* + * Received initial SYN in SYN-SENT[*] state => + * simultaneous open. If segment contains CC option + * and there is a cached CC, apply TAO test. + * If it succeeds, connection is * half-synchronized. + * Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + * If there was no CC option, clear cached CC value. + */ + tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_state = TCPS_SYN_RECEIVED; + } + + KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " + "ti_locked %d", __func__, ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Advance th->th_seq to correspond to first data byte. + * If data, trim to stay within window, + * dropping FIN if necessary. + */ + th->th_seq++; + if (tlen > tp->rcv_wnd) { + todrop = tlen - tp->rcv_wnd; + m_adj(m, -todrop); + tlen = tp->rcv_wnd; + thflags &= ~TH_FIN; + TCPSTAT_INC(tcps_rcvpackafterwin); + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + } + tp->snd_wl1 = th->th_seq - 1; + tp->rcv_up = th->th_seq; + /* + * Client side of transaction: already sent SYN and data. + * If the remote host used T/TCP to validate the SYN, + * our data will be ACK'd; if so, enter normal data segment + * processing in the middle of step 5, ack processing. + * Otherwise, goto step 6. + */ + if (thflags & TH_ACK) + goto process_ACK; + + goto step6; + + /* + * If the state is LAST_ACK or CLOSING or TIME_WAIT: + * do normal processing. + * + * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. + */ + case TCPS_LAST_ACK: + case TCPS_CLOSING: + break; /* continue normal processing */ + } + + /* + * States other than LISTEN or SYN_SENT. + * First check the RST flag and sequence number since reset segments + * are exempt from the timestamp and connection count tests. 
This + * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix + * below which allowed reset segments in half the sequence space + * to fall though and be processed (which gives forged reset + * segments with a random sequence number a 50 percent chance of + * killing a connection). + * Then check timestamp, if present. + * Then check the connection count, if present. + * Then check that at least some bytes of segment are within + * receive window. If segment begins before rcv_nxt, + * drop leading data (and SYN); if nothing left, just ack. + * + * + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * Note: this does not take into account delayed ACKs, so + * we should test against last_ack_sent instead of rcv_nxt. + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowlegement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case. + * Note 2: Paul Watson's paper "Slipping in the Window" has shown + * that brute force RST attacks are possible. To combat this, + * we use a much stricter check while in the ESTABLISHED state, + * only accepting RSTs where the sequence number is equal to + * last_ack_sent. In all other states (the states in which a + * RST is more likely), the more permissive check is used. + * If we have multiple segments in flight, the initial reset + * segment sequence numbers will be to the left of last_ack_sent, + * but they will eventually catch up. + * In any case, it never made sense to trim reset segments to + * fit the receive window since RFC 1122 says: + * 4.2.2.12 RST Segment: RFC-793 Section 3.4 + * + * A TCP SHOULD allow a received RST segment to include data. + * + * DISCUSSION + * It has been suggested that a RST segment could contain + * ASCII text that encoded and explained the cause of the + * RST. No standard has yet been established for such + * data. + * + * If the reset segment passes the sequence number test examine + * the state: + * SYN_RECEIVED STATE: + * If passive open, return to LISTEN state. + * If active open, inform user that connection was refused. + * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: + * Inform user that connection was reset, and close tcb. + * CLOSING, LAST_ACK STATES: + * Close the tcb. + * TIME_WAIT STATE: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. 
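The long comment above ends in a concrete rule: a RST must fall in the window anchored at last_ack_sent, and for ESTABLISHED connections with the strict check enabled it must be (nearly) equal to last_ack_sent. As a predicate, using the usual modular sequence comparisons (simplified sketch; the kernel additionally tolerates rcv_nxt plus or minus one):

#include <stdbool.h>
#include <stdint.h>

/* Modular sequence-space comparisons, as in tcp_seq.h. */
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)   ((int32_t)((a) - (b)) <= 0)

static bool
rst_acceptable(bool established, bool strict, uint32_t seq,
    uint32_t last_ack_sent, uint32_t rcv_wnd)
{
        /* Window test from RFC 793, anchored at last_ack_sent. */
        if (!(SEQ_GEQ(seq, last_ack_sent - 1) &&
            SEQ_LEQ(seq, last_ack_sent + rcv_wnd)))
                return false;
        /* Watson hardening: ESTABLISHED demands a near-exact match. */
        if (established && strict &&
            !(SEQ_GEQ(seq, last_ack_sent - 1) &&
            SEQ_LEQ(seq, last_ack_sent + 1)))
                return false;
        return true;
}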
+ */ + if (thflags & TH_RST) { + if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && + SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + switch (tp->t_state) { + + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + + case TCPS_ESTABLISHED: + if (V_tcp_insecure_rst == 0 && + !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) && + SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) && + !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) && + SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) { + TCPSTAT_INC(tcps_badrst); + goto drop; + } + /* FALLTHROUGH */ + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + so->so_error = ECONNRESET; + close: + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_RST 1 ti_locked %d", + ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + tp->t_state = TCPS_CLOSED; + TCPSTAT_INC(tcps_drops); + tp = tcp_close(tp); + break; + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_RST 2 ti_locked %d", + ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + tp = tcp_close(tp); + break; + } + } + goto drop; + } + + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + + /* Check to see if ts_recent is over 24 days old. */ + if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates + * ts_recent, the age will be reset later and ts_recent + * will get a valid value. If it does not, setting + * ts_recent to zero will at least satisfy the + * requirement that zero be placed in the timestamp + * echo reply when ts_recent isn't valid. The + * age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be + * dropped when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + if (tlen) + goto dropafterack; + goto drop; + } + } + + /* + * In the SYN-RECEIVED state, validate that the packet belongs to + * this connection before trimming the data to fit the receive + * window. Check the sequence number versus IRS since we know + * the sequence numbers haven't wrapped. This is a partial fix + * for the "LAND" DoS attack. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + /* + * If this is a duplicate SYN for our current connection, + * advance over it and pretend and it's not a SYN. + */ + if (thflags & TH_SYN && th->th_seq == tp->irs) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. 
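PAWS as applied above has one subtlety: an old timestamp may mean an old duplicate segment (drop it) or merely ancient ts_recent state (invalidate it and continue). A compact restatement, assuming tick-based timestamps and a paws_idle constant of roughly 24 days, as in TCP_PAWS_IDLE (sketch):

#include <stdbool.h>
#include <stdint.h>

#define TSTMP_LT(a, b)  ((int32_t)((a) - (b)) < 0)

/*
 * Returns true when the segment must be dropped as a PAWS failure;
 * may clear *ts_recent when it is too old to trust.
 */
static bool
paws_reject(uint32_t tsval, uint32_t *ts_recent, uint32_t recent_age,
    uint32_t now, uint32_t paws_idle)
{
        if (*ts_recent == 0 || !TSTMP_LT(tsval, *ts_recent))
                return false;           /* timestamp is current */
        if (now - recent_age > paws_idle) {
                *ts_recent = 0;         /* stale state, not a duplicate */
                return false;
        }
        return true;                    /* genuine old duplicate */
}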
+ */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + + /* + * If new data are received on a connection after the + * user processes are gone, then RST the other end. + */ + if ((so->so_state & SS_NOFDREF) && + tp->t_state > TCPS_CLOSE_WAIT && tlen) { + char *s; + + KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " + "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket " + "was closed, sending RST and removing tcpcb\n", + s, __func__, tcpstates[tp->t_state], tlen); + free(s, M_TCPLOG); + } + tp = tcp_close(tp); + TCPSTAT_INC(tcps_rcvafterclose); + rstreason = BANDLIM_UNLIMITED; + goto dropwithreset; + } + + /* + * If segment ends after window, drop trailing data + * (and PUSH and FIN); if nothing left, just ACK. + */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment + * and ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else + goto dropafterack; + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH|TH_FIN); + } + + /* + * If last ACK falls within this segment's sequence numbers, + * record its timestamp. + * NOTE: + * 1) That the test incorporates suggestions from the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + * 2) That updating only on newer timestamps interferes with + * our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. + * 3) That we modify the segment boundary check to be + * Last.ACK.Sent <= SEG.SEQ + SEG.Len + * instead of RFC1323's + * Last.ACK.Sent < SEG.SEQ + SEG.Len, + * This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated + * Vol. 2 p.869. In such cases, we can still calculate the + * RTT correctly when RCV.NXT == Last.ACK.Sent. + */ + if ((to.to_flags & TOF_TS) != 0 && + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN|TH_FIN)) != 0))) { + tp->ts_recent_age = ticks; + tp->ts_recent = to.to_tsval; + } + + /* + * If a SYN is in the window, then this is an + * error and we send an RST and drop the connection. + */ + if (thflags & TH_SYN) { + KASSERT(ti_locked == TI_WLOCKED, + ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + tp = tcp_drop(tp, ECONNRESET); + rstreason = BANDLIM_UNLIMITED; + goto drop; + } + + /* + * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN + * flag is on (half-synchronized state), then queue data for + * later processing; else drop segment and return. 
+ */ + if ((thflags & TH_ACK) == 0) { + if (tp->t_state == TCPS_SYN_RECEIVED || + (tp->t_flags & TF_NEEDSYN)) + goto step6; + else if (tp->t_flags & TF_ACKNOW) + goto dropafterack; + else + goto drop; + } + + /* + * Ack processing. + */ + switch (tp->t_state) { + + /* + * In SYN_RECEIVED state, the ack ACKs our SYN, so enter + * ESTABLISHED state and continue processing. + * The ACK was checked above. + */ + case TCPS_SYN_RECEIVED: + + TCPSTAT_INC(tcps_connects); + soisconnected(so); + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = tiwin; + } + /* + * Make transitions: + * SYN-RECEIVED -> ESTABLISHED + * SYN-RECEIVED* -> FIN-WAIT-1 + */ + tp->t_starttime = ticks; + if (tp->t_flags & TF_NEEDFIN) { + tp->t_state = TCPS_FIN_WAIT_1; + tp->t_flags &= ~TF_NEEDFIN; + } else { + tp->t_state = TCPS_ESTABLISHED; + tcp_timer_activate(tp, TT_KEEP, tcp_keepidle); + } + /* + * If segment contains data or ACK, will call tcp_reass() + * later; if not, do so now to pass queued data to user. + */ + if (tlen == 0 && (thflags & TH_FIN) == 0) + (void) tcp_reass(tp, (struct tcphdr *)0, 0, + (struct mbuf *)0); + tp->snd_wl1 = th->th_seq - 1; + /* FALLTHROUGH */ + + /* + * In ESTABLISHED state: drop duplicate ACKs; ACK out of range + * ACKs. If the ack is in the range + * tp->snd_una < th->th_ack <= tp->snd_max + * then advance tp->snd_una to th->th_ack and drop + * data from the retransmission queue. If this ACK reflects + * more up to date window information we update our window information. + */ + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + if (SEQ_GT(th->th_ack, tp->snd_max)) { + TCPSTAT_INC(tcps_rcvacktoomuch); + goto dropafterack; + } + if ((tp->t_flags & TF_SACK_PERMIT) && + ((to.to_flags & TOF_SACK) || + !TAILQ_EMPTY(&tp->snd_holes))) + tcp_sack_doack(tp, &to, th->th_ack); + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { + if (tlen == 0 && tiwin == tp->snd_wnd) { + TCPSTAT_INC(tcps_rcvdupack); + /* + * If we have outstanding data (other than + * a window probe), this is a completely + * duplicate ack (ie, window info didn't + * change), the ack is the biggest we've + * seen and we've seen exactly our rexmt + * threshhold of them, assume a packet + * has been dropped and retransmit it. + * Kludge snd_nxt & the congestion + * window so we send only this one + * packet. + * + * We know we're losing at the current + * window size so do congestion avoidance + * (set ssthresh to half the current window + * and pull our congestion window back to + * the new ssthresh). + * + * Dup acks mean that packets have left the + * network (they're now cached at the receiver) + * so bump cwnd by the amount in the receiver + * to keep a constant cwnd packets in the + * network. + * + * When using TCP ECN, notify the peer that + * we reduced the cwnd. + */ + if (!tcp_timer_active(tp, TT_REXMT) || + th->th_ack != tp->snd_una) + tp->t_dupacks = 0; + else if (++tp->t_dupacks > tcprexmtthresh || + ((V_tcp_do_newreno || + (tp->t_flags & TF_SACK_PERMIT)) && + IN_FASTRECOVERY(tp))) { + if ((tp->t_flags & TF_SACK_PERMIT) && + IN_FASTRECOVERY(tp)) { + int awnd; + + /* + * Compute the amount of data in flight first. + * We can inject new data into the pipe iff + * we have less than 1/2 the original window's + * worth of data in flight. 
+ */ + awnd = (tp->snd_nxt - tp->snd_fack) + + tp->sackhint.sack_bytes_rexmit; + if (awnd < tp->snd_ssthresh) { + tp->snd_cwnd += tp->t_maxseg; + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + } else + tp->snd_cwnd += tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } else if (tp->t_dupacks == tcprexmtthresh) { + tcp_seq onxt = tp->snd_nxt; + + /* + * If we're doing sack, check to + * see if we're already in sack + * recovery. If we're not doing sack, + * check to see if we're in newreno + * recovery. + */ + if (tp->t_flags & TF_SACK_PERMIT) { + if (IN_FASTRECOVERY(tp)) { + tp->t_dupacks = 0; + break; + } + } else if (V_tcp_do_newreno || + V_tcp_do_ecn) { + if (SEQ_LEQ(th->th_ack, + tp->snd_recover)) { + tp->t_dupacks = 0; + break; + } + } + tcp_congestion_exp(tp); + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rtttime = 0; + if (tp->t_flags & TF_SACK_PERMIT) { + TCPSTAT_INC( + tcps_sack_recovery_episode); + tp->sack_newdata = tp->snd_nxt; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } + tp->snd_nxt = th->th_ack; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + KASSERT(tp->snd_limited <= 2, + ("%s: tp->snd_limited too big", + __func__)); + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * + (tp->t_dupacks - tp->snd_limited); + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + goto drop; + } else if (V_tcp_do_rfc3042) { + u_long oldcwnd = tp->snd_cwnd; + tcp_seq oldsndmax = tp->snd_max; + u_int sent; + + KASSERT(tp->t_dupacks == 1 || + tp->t_dupacks == 2, + ("%s: dupacks not 1 or 2", + __func__)); + if (tp->t_dupacks == 1) + tp->snd_limited = 0; + tp->snd_cwnd = + (tp->snd_nxt - tp->snd_una) + + (tp->t_dupacks - tp->snd_limited) * + tp->t_maxseg; + (void) tcp_output(tp); + sent = tp->snd_max - oldsndmax; + if (sent > tp->t_maxseg) { + KASSERT((tp->t_dupacks == 2 && + tp->snd_limited == 0) || + (sent == tp->t_maxseg + 1 && + tp->t_flags & TF_SENTFIN), + ("%s: sent too much", + __func__)); + tp->snd_limited = 2; + } else if (sent > 0) + ++tp->snd_limited; + tp->snd_cwnd = oldcwnd; + goto drop; + } + } else + tp->t_dupacks = 0; + break; + } + + KASSERT(SEQ_GT(th->th_ack, tp->snd_una), + ("%s: th_ack <= snd_una", __func__)); + + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. + */ + if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) { + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->t_flags & TF_SACK_PERMIT) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + } else { + /* + * Out of fast recovery. + * Window inflation should have left us + * with approximately snd_ssthresh + * outstanding data. + * But in case we would be inclined to + * send a burst, better to do it via + * the slow start mechanism. + */ + if (SEQ_GT(th->th_ack + + tp->snd_ssthresh, + tp->snd_max)) + tp->snd_cwnd = tp->snd_max - + th->th_ack + + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + } + } + } else { + if (tp->t_dupacks >= tcprexmtthresh && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + tp->t_dupacks = 0; + /* + * If we reach this point, ACK is not a duplicate, + * i.e., it ACKs something we sent. + */ + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our + * SYN has been ACK'd (so connection is now fully + * synchronized). Go to non-starred state, + * increment snd_una for ACK of SYN, and check if + * we can do window scaling. 
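+ * (The SYN consumed one octet of sequence space, hence the snd_una + * increment below; e.g. with an initial send sequence of 100, snd_una + * moves from 100 to 101 once our SYN is acknowledged.)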
+ */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + /* Do window scaling? */ + if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == + (TF_RCVD_SCALE|TF_REQ_SCALE)) { + tp->rcv_scale = tp->request_r_scale; + /* Send window already scaled. */ + } + } + +process_ACK: + INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_input: process_ACK ti_locked %d", ti_locked)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + acked = th->th_ack - tp->snd_una; + TCPSTAT_INC(tcps_rcvackpack); + TCPSTAT_ADD(tcps_rcvackbyte, acked); + + /* + * If we just performed our first retransmit, and the ACK + * arrives within our recovery window, then it was a mistake + * to do the retransmit in the first place. Recover our + * original cwnd and ssthresh, and proceed to transmit where + * we left off. + */ + if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) { + TCPSTAT_INC(tcps_sndrexmitbad); + tp->snd_cwnd = tp->snd_cwnd_prev; + tp->snd_ssthresh = tp->snd_ssthresh_prev; + tp->snd_recover = tp->snd_recover_prev; + if (tp->t_flags & TF_WASFRECOVERY) + ENTER_FASTRECOVERY(tp); + tp->snd_nxt = tp->snd_max; + tp->t_badrxtwin = 0; /* XXX probably not required */ + } + + /* + * If we have a timestamp reply, update smoothed + * round trip time. If no timestamp is present but + * transmit timer is running and timed sequence + * number was acked, update smoothed round trip time. + * Since we now have an rtt measurement, cancel the + * timer backoff (cf., Phil Karn's retransmit alg.). + * Recompute the initial retransmit timer. + * + * Some boxes send broken timestamp replies + * during the SYN+ACK phase, ignore + * timestamps of 0 or we could calculate a + * huge RTT and blow up the retransmit timer. + */ + if ((to.to_flags & TOF_TS) != 0 && + to.to_tsecr) { + if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr) + tp->t_rttlow = ticks - to.to_tsecr; + tcp_xmit_timer(tp, ticks - to.to_tsecr + 1); + } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { + if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) + tp->t_rttlow = ticks - tp->t_rtttime; + tcp_xmit_timer(tp, ticks - tp->t_rtttime); + } + tcp_xmit_bandwidth_limit(tp, th->th_ack); + + /* + * If all outstanding data is acked, stop retransmit + * timer and remember to restart (more output or persist). + * If there is more data to be acked, restart retransmit + * timer, using current (possibly backed-off) value. + */ + if (th->th_ack == tp->snd_max) { + tcp_timer_activate(tp, TT_REXMT, 0); + needoutput = 1; + } else if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + + /* + * If no data (only SYN) was ACK'd, + * skip rest of ACK processing. + */ + if (acked == 0) + goto step6; + + /* + * When new data is acked, open the congestion window. + * Method depends on which congestion control state we're + * in (slow start or cong avoid) and if ABC (RFC 3465) is + * enabled. + * + * slow start: cwnd <= ssthresh + * cong avoid: cwnd > ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 2581): + * Grow cwnd exponentially by maxseg per ACK. + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 2581): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. 
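+ * + * Worked example (invented numbers: maxseg = 1460, abc_l_var = 2): + * slow start + ABC: an ACK newly covering 4380 bytes grows cwnd + * by min(4380, 2 * 1460) = 2920 bytes; + * cong avoid + ABC: t_bytes_acked accrues per ACK and, once it + * reaches cwnd, cwnd grows by one maxseg (about once per RTT); + * cong avoid, no ABC: cwnd grows by max(1460*1460 / cwnd, 1) per ACK. + *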
+ * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. + */ + if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) || + !IN_FASTRECOVERY(tp)) { + u_int cw = tp->snd_cwnd; + u_int incr = tp->t_maxseg; + /* In congestion avoidance? */ + if (cw > tp->snd_ssthresh) { + if (V_tcp_do_rfc3465) { + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked >= tp->snd_cwnd) + tp->t_bytes_acked -= cw; + else + incr = 0; + } + else + incr = max((incr * incr / cw), 1); + /* + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after an + * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt == + * snd_max check is sufficient to handle this). + */ + } else if (V_tcp_do_rfc3465 && + tp->snd_nxt == tp->snd_max) + incr = min(acked, + V_tcp_abc_l_var * tp->t_maxseg); + /* ABC is on by default, so (incr == 0) frequently. */ + if (incr > 0) + tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); + } + SOCKBUF_LOCK(&so->so_snd); + if (acked > so->so_snd.sb_cc) { + tp->snd_wnd -= so->so_snd.sb_cc; + sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); + ourfinisacked = 1; + } else { + sbdrop_locked(&so->so_snd, acked); + tp->snd_wnd -= acked; + ourfinisacked = 0; + } + /* NB: sowwakeup_locked() does an implicit unlock. */ + sowwakeup_locked(so); + /* Detect una wraparound. */ + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + !IN_FASTRECOVERY(tp) && + SEQ_GT(tp->snd_una, tp->snd_recover) && + SEQ_LEQ(th->th_ack, tp->snd_recover)) + tp->snd_recover = th->th_ack - 1; + if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) && + IN_FASTRECOVERY(tp) && + SEQ_GEQ(th->th_ack, tp->snd_recover)) { + EXIT_FASTRECOVERY(tp); + tp->t_bytes_acked = 0; + } + tp->snd_una = th->th_ack; + if (tp->t_flags & TF_SACK_PERMIT) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + + switch (tp->t_state) { + + /* + * In FIN_WAIT_1 STATE, in addition to the processing + * for the ESTABLISHED state, if our FIN is now acknowledged + * then enter FIN_WAIT_2. + */ + case TCPS_FIN_WAIT_1: + if (ourfinisacked) { + /* + * If we can't receive any more + * data, then closing user can proceed. + * Starting the timer is contrary to the + * specification, but if we don't get a FIN + * we'll hang forever. + * + * XXXjl: + * we should release the tp also, and use a + * compressed state. + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + int timeout; + + soisdisconnected(so); + timeout = (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + tp->t_state = TCPS_FIN_WAIT_2; + } + break; + + /* + * In CLOSING STATE, in addition to the processing for + * the ESTABLISHED state, if the ACK acknowledges our FIN + * then enter the TIME-WAIT state; otherwise ignore + * the segment. + */ + case TCPS_CLOSING: + if (ourfinisacked) { + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tcp_twstart(tp); + INP_INFO_WUNLOCK(&V_tcbinfo); + m_freem(m); + return; + } + break; + + /* + * In LAST_ACK, we may still be waiting for data to drain + * and/or to be acked, as well as for the ack of our FIN. + * If our FIN is now acknowledged, delete the TCB, + * enter the closed state and return.
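+ * Schematically, the ACK-of-FIN transitions handled here are: + * FIN_WAIT_1 -> FIN_WAIT_2 (above) + * CLOSING -> TIME_WAIT via tcp_twstart() (above) + * LAST_ACK -> closed via tcp_close() (below)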
+ */ + case TCPS_LAST_ACK: + if (ourfinisacked) { + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + tp = tcp_close(tp); + goto drop; + } + break; + } + } + +step6: + INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: step6 ti_locked %d", ti_locked)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Update window information. + * Don't look at window if no ACK: TAC's send garbage on first SYN. + */ + if ((thflags & TH_ACK) && + (SEQ_LT(tp->snd_wl1, th->th_seq) || + (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || + (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { + /* keep track of pure window updates */ + if (tlen == 0 && + tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) + TCPSTAT_INC(tcps_rcvwinupd); + tp->snd_wnd = tiwin; + tp->snd_wl1 = th->th_seq; + tp->snd_wl2 = th->th_ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + needoutput = 1; + } + + /* + * Process segments with URG. + */ + if ((thflags & TH_URG) && th->th_urp && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + /* + * This is a kludge, but if we receive and accept + * random urgent pointers, we'll crash in + * soreceive. It's hard to imagine someone + * actually wanting to send this much urgent data. + */ + SOCKBUF_LOCK(&so->so_rcv); + if (th->th_urp + so->so_rcv.sb_cc > sb_max) { + th->th_urp = 0; /* XXX */ + thflags &= ~TH_URG; /* XXX */ + SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ + goto dodata; /* XXX */ + } + /* + * If this segment advances the known urgent pointer, + * then mark the data stream. This should not happen + * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since + * a FIN has been received from the remote side. + * In these states we ignore the URG. + * + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section as the original + * spec states (in one of two places). + */ + if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { + tp->rcv_up = th->th_seq + th->th_urp; + so->so_oobmark = so->so_rcv.sb_cc + + (tp->rcv_up - tp->rcv_nxt) - 1; + if (so->so_oobmark == 0) + so->so_rcv.sb_state |= SBS_RCVATMARK; + sohasoutofband(so); + tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); + } + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * Remove out of band data so doesn't get presented to user. + * This can happen independent of advancing the URG pointer, + * but if two URG's are pending at once, some out-of-band + * data may creep in... ick. + */ + if (th->th_urp <= (u_long)tlen && + !(so->so_options & SO_OOBINLINE)) { + /* hdr drop is delayed */ + tcp_pulloutofband(so, th, m, drop_hdrlen); + } + } else { + /* + * If no out of band data is expected, + * pull receive urgent pointer along + * with the receive window. + */ + if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) + tp->rcv_up = tp->rcv_nxt; + } +dodata: /* XXX */ + INP_INFO_LOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: dodata ti_locked %d", ti_locked)); + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Process the segment text, merging it into the TCP sequencing queue, + * and arranging for acknowledgment of receipt if necessary. + * This process logically involves adjusting tp->rcv_wnd as data + * is presented to the user (this happens in tcp_usrreq.c, + * case PRU_RCVD). If a FIN has already been received on this + * connection then we just ignore the text. 
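+ * Schematically, the common case below is: + * th_seq == rcv_nxt && reassembly queue empty && ESTABLISHED + * -> append to so_rcv and possibly delay the ACK; + * anything else -> tcp_reass() and ACK at once, so the + * peer's fast retransmit can work.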
+ */ + if ((tlen || (thflags & TH_FIN)) && + TCPS_HAVERCVDFIN(tp->t_state) == 0) { + tcp_seq save_start = th->th_seq; + m_adj(m, drop_hdrlen); /* delayed header drop */ + /* + * Insert segment which includes th into TCP reassembly queue + * with control block tp. Set thflags to whether reassembly now + * includes a segment with FIN. This handles the common case + * inline (segment is the next to be received on an established + * connection, and the queue is empty), avoiding linkage into + * and removal from the queue and repetition of various + * conversions. + * Set DELACK for segments received in order, but ack + * immediately when segments are out of order (so + * fast retransmit can work). + */ + if (th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + if (DELAY_ACK(tp)) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt += tlen; + thflags = th->th_flags & TH_FIN; + TCPSTAT_INC(tcps_rcvpack); + TCPSTAT_ADD(tcps_rcvbyte, tlen); + ND6_HINT(tp); + SOCKBUF_LOCK(&so->so_rcv); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + m_freem(m); + else + sbappendstream_locked(&so->so_rcv, m); + /* NB: sorwakeup_locked() does an implicit unlock. */ + sorwakeup_locked(so); + } else { + /* + * XXX: Due to the header drop above "th" is + * theoretically invalid by now. Fortunately + * m_adj() doesn't actually frees any mbufs + * when trimming from the head. + */ + thflags = tcp_reass(tp, th, &tlen, m); + tp->t_flags |= TF_ACKNOW; + } + if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) + tcp_update_sack_list(tp, save_start, save_start + tlen); +#if 0 + /* + * Note the amount of data that peer has sent into + * our window, in order to estimate the sender's + * buffer size. + * XXX: Unused. + */ + len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); +#endif + } else { + m_freem(m); + thflags &= ~TH_FIN; + } + + /* + * If FIN is received ACK the FIN and let the user know + * that the connection is closing. + */ + if (thflags & TH_FIN) { + if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { + socantrcvmore(so); + /* + * If connection is half-synchronized + * (ie NEEDSYN flag on) then delay ACK, + * so it may be piggybacked when SYN is sent. + * Otherwise, since we received a FIN then no + * more input can be expected, send ACK now. + */ + if (tp->t_flags & TF_NEEDSYN) + tp->t_flags |= TF_DELACK; + else + tp->t_flags |= TF_ACKNOW; + tp->rcv_nxt++; + } + switch (tp->t_state) { + + /* + * In SYN_RECEIVED and ESTABLISHED STATES + * enter the CLOSE_WAIT state. + */ + case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; + /* FALLTHROUGH */ + case TCPS_ESTABLISHED: + tp->t_state = TCPS_CLOSE_WAIT; + break; + + /* + * If still in FIN_WAIT_1 STATE FIN has not been acked so + * enter the CLOSING state. + */ + case TCPS_FIN_WAIT_1: + tp->t_state = TCPS_CLOSING; + break; + + /* + * In FIN_WAIT_2 state enter the TIME_WAIT state, + * starting the time-wait timer, turning off the other + * standard timers. 
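+ * (Illustrative: assuming the stock TCPTV_MSL of 30 seconds, the + * time-wait state entered via tcp_twstart() below lasts 2 * MSL = + * 60 seconds before the compressed timewait block is recycled.)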
+ */ + case TCPS_FIN_WAIT_2: + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata " + "TCP_FIN_WAIT_2 ti_locked: %d", __func__, + ti_locked)); + + tcp_twstart(tp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return; + } + } + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: dodata epilogue ti_locked %d", __func__, + ti_locked); + ti_locked = TI_UNLOCKED; + +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + + /* + * Return any desired output. + */ + if (needoutput || (tp->t_flags & TF_ACKNOW)) + (void) tcp_output(tp); + +check_delack: + KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", + __func__, ti_locked)); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (tp->t_flags & TF_DELACK) { + tp->t_flags &= ~TF_DELACK; + tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + } + INP_WUNLOCK(tp->t_inpcb); + return; + +dropafterack: + KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED, + ("tcp_do_segment: dropafterack ti_locked %d", ti_locked)); + + /* + * Generate an ACK dropping incoming segment if it occupies + * sequence space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all + * paths to this code happen after packets containing + * RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the + * segment we received passes the SYN-RECEIVED ACK test. + * If it fails send a RST. This breaks the loop in the + * "LAND" DoS attack, and also prevents an ACK storm + * between two listening ports that have been sent forged + * SYN segments, each with the source address of the other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max)) ) { + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } +#ifdef TCPDEBUG + if (so->so_options & SO_DEBUG) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: dropafterack epilogue ti_locked %d", __func__, + ti_locked); + ti_locked = TI_UNLOCKED; + + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + INP_WUNLOCK(tp->t_inpcb); + m_freem(m); + return; + +dropwithreset: + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); + else + panic("%s: dropwithreset ti_locked %d", __func__, ti_locked); + ti_locked = TI_UNLOCKED; + + if (tp != NULL) { + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); + } else + tcp_dropwithreset(m, th, NULL, tlen, rstreason); + return; + +drop: + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); + else if (ti_locked == TI_WLOCKED) + INP_INFO_WUNLOCK(&V_tcbinfo); +#ifdef INVARIANTS + else + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); +#endif + ti_locked = TI_UNLOCKED; + + /* + * Drop space held by incoming segment and return. + */ +#ifdef TCPDEBUG + if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, + &tcp_savetcp, 0); +#endif + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + m_freem(m); +} + +/* + * Issue RST and make ACK acceptable to originator of segment. 
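+ * Per RFC 793 (an illustrative summary of the code below): a segment + * carrying ACK is answered with <SEQ=SEG.ACK><CTL=RST>; otherwise the + * reply is <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>.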
+ * The mbuf must still include the original packet header. + * tp may be NULL. + */ +static void +tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, + int tlen, int rstreason) +{ + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + + if (tp != NULL) { + INP_WLOCK_ASSERT(tp->t_inpcb); + } + + /* Don't bother if destination was broadcast/multicast. */ + if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) + goto drop; +#ifdef INET6 + if (mtod(m, struct ip *)->ip_v == 6) { + ip6 = mtod(m, struct ip6_hdr *); + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || + IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) + goto drop; + /* IPv6 anycast check is done at tcp6_input() */ + } else +#endif + { + ip = mtod(m, struct ip *); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || + ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || + in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + goto drop; + } + + /* Perform bandwidth limiting. */ + if (badport_bandlim(rstreason) < 0) + goto drop; + + /* tcp_respond consumes the mbuf chain. */ + if (th->th_flags & TH_ACK) { + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, + th->th_ack, TH_RST); + } else { + if (th->th_flags & TH_SYN) + tlen++; + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, + (tcp_seq)0, TH_RST|TH_ACK); + } + return; +drop: + m_freem(m); +} + +/* + * Parse TCP options and place in tcpopt. + */ +static void +tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) +{ + int opt, optlen; + + to->to_flags = 0; + for (; cnt > 0; cnt -= optlen, cp += optlen) { + opt = cp[0]; + if (opt == TCPOPT_EOL) + break; + if (opt == TCPOPT_NOP) + optlen = 1; + else { + if (cnt < 2) + break; + optlen = cp[1]; + if (optlen < 2 || optlen > cnt) + break; + } + switch (opt) { + case TCPOPT_MAXSEG: + if (optlen != TCPOLEN_MAXSEG) + continue; + if (!(flags & TO_SYN)) + continue; + to->to_flags |= TOF_MSS; + bcopy((char *)cp + 2, + (char *)&to->to_mss, sizeof(to->to_mss)); + to->to_mss = ntohs(to->to_mss); + break; + case TCPOPT_WINDOW: + if (optlen != TCPOLEN_WINDOW) + continue; + if (!(flags & TO_SYN)) + continue; + to->to_flags |= TOF_SCALE; + to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); + break; + case TCPOPT_TIMESTAMP: + if (optlen != TCPOLEN_TIMESTAMP) + continue; + to->to_flags |= TOF_TS; + bcopy((char *)cp + 2, + (char *)&to->to_tsval, sizeof(to->to_tsval)); + to->to_tsval = ntohl(to->to_tsval); + bcopy((char *)cp + 6, + (char *)&to->to_tsecr, sizeof(to->to_tsecr)); + to->to_tsecr = ntohl(to->to_tsecr); + break; +#ifdef TCP_SIGNATURE + /* + * XXX In order to reply to a host which has set the + * TCP_SIGNATURE option in its initial SYN, we have to + * record the fact that the option was observed here + * for the syncache code to perform the correct response. 
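+ * (Illustrative layout note: the signature option, like the others + * parsed above, is encoded as <kind, len, data>; e.g. an MSS option + * is the 4 bytes 0x02 0x04 followed by the 16-bit value, which the + * loop above copies out with bcopy() and byte-swaps.)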
+ */ + case TCPOPT_SIGNATURE: + if (optlen != TCPOLEN_SIGNATURE) + continue; + to->to_flags |= TOF_SIGNATURE; + to->to_signature = cp + 2; + break; +#endif + case TCPOPT_SACK_PERMITTED: + if (optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (!(flags & TO_SYN)) + continue; + if (!V_tcp_do_sack) + continue; + to->to_flags |= TOF_SACKPERM; + break; + case TCPOPT_SACK: + if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) + continue; + if (flags & TO_SYN) + continue; + to->to_flags |= TOF_SACK; + to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; + to->to_sacks = cp + 2; + TCPSTAT_INC(tcps_sack_rcv_blocks); + break; + default: + continue; + } + } +} + +/* + * Pull out of band byte out of a segment so + * it doesn't appear in the user's data queue. + * It is still reflected in the segment length for + * sequencing purposes. + */ +static void +tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, + int off) +{ + int cnt = off + th->th_urp - 1; + + while (cnt >= 0) { + if (m->m_len > cnt) { + char *cp = mtod(m, caddr_t) + cnt; + struct tcpcb *tp = sototcpcb(so); + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tp->t_iobc = *cp; + tp->t_oobflags |= TCPOOB_HAVEDATA; + bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); + m->m_len--; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len--; + return; + } + cnt -= m->m_len; + m = m->m_next; + if (m == NULL) + break; + } + panic("tcp_pulloutofband"); +} + +/* + * Collect new round-trip time estimate + * and update averages and current timeout. + */ +static void +tcp_xmit_timer(struct tcpcb *tp, int rtt) +{ + int delta; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + TCPSTAT_INC(tcps_rttupdated); + tp->t_rttupdated++; + if (tp->t_srtt != 0) { + /* + * srtt is stored as fixed point with 5 bits after the + * binary point (i.e., scaled by 8). The following magic + * is equivalent to the smoothing algorithm in rfc793 with + * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed + * point). Adjust rtt to origin 0. + */ + delta = ((rtt - 1) << TCP_DELTA_SHIFT) + - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + + if ((tp->t_srtt += delta) <= 0) + tp->t_srtt = 1; + + /* + * We accumulate a smoothed rtt variance (actually, a + * smoothed mean difference), then set the retransmit + * timer to smoothed rtt + 4 times the smoothed variance. + * rttvar is stored as fixed point with 4 bits after the + * binary point (scaled by 16). The following is + * equivalent to rfc793 smoothing with an alpha of .75 + * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * rfc793's wired-in beta. + */ + if (delta < 0) + delta = -delta; + delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); + if ((tp->t_rttvar += delta) <= 0) + tp->t_rttvar = 1; + if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } else { + /* + * No rtt measurement yet - use the unsmoothed rtt. + * Set the variance to half the rtt (so our first + * retransmit happens at 3*rtt). + */ + tp->t_srtt = rtt << TCP_RTT_SHIFT; + tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + tp->t_rttbest = tp->t_srtt + tp->t_rttvar; + } + tp->t_rtttime = 0; + tp->t_rxtshift = 0; + + /* + * the retransmit should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. 
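(Worked example, invented + * numbers, assuming the stock TCP_RTT_SHIFT = 5 and TCP_DELTA_SHIFT + * = 2: a 14-tick sample against srtt = 320, i.e. 10 ticks << 5, + * gives delta = 52 - 40 = 12 and srtt = 332, about 10.4 ticks; with + * rttvar near 1.5 ticks the computed rexmt value below comes to + * roughly srtt + 4 * rttvar = 16 ticks.)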
But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). + */ + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + + /* + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + */ + tp->t_softerror = 0; +} + +/* + * Determine a reasonable value for maxseg size. + * If the route is known, check route for mtu. + * If none, use an mss that can be handled on the outgoing + * interface without forcing IP to fragment; if bigger than + * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES + * to utilize large mbufs. If no route is found, route has no mtu, + * or the destination isn't local, use a default, hopefully conservative + * size (usually 512 or the default IP max size, but no more than the mtu + * of the interface), as we can't discover anything about intervening + * gateways or networks. We also initialize the congestion/slow start + * window to be a single segment if the destination isn't local. + * While looking at the routing entry, we also initialize other path-dependent + * parameters from pre-set or cached values in the routing entry. + * + * Also take into account the space needed for options that we + * send regularly. Make maxseg shorter by that amount to assure + * that we can send maxseg amount of data even when the options + * are present. Store the upper limit of the length of options plus + * data in maxopd. + * + * In case of T/TCP, we call this routine during implicit connection + * setup as well (offer = -1), to initialize maxseg from the cached + * MSS of our peer. + * + * NOTE that this routine is only called when we process an incoming + * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). + */ +void +tcp_mss_update(struct tcpcb *tp, int offer, + struct hc_metrics_lite *metricptr, int *mtuflags) +{ + int mss; + u_long maxmtu; + struct inpcb *inp = tp->t_inpcb; + struct hc_metrics_lite metrics; + int origoffer = offer; +#ifdef INET6 + int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + size_t min_protoh = isipv6 ? + sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : + sizeof (struct tcpiphdr); +#else + const size_t min_protoh = sizeof(struct tcpiphdr); +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* Initialize. */ +#ifdef INET6 + if (isipv6) { + maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags); + tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt; + } else +#endif + { + maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags); + tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt; + } + + /* + * No route to sender, stay with default mss and return. + */ + if (maxmtu == 0) { + /* + * In case we return early we need to initialize metrics + * to a defined state as tcp_hc_get() would do for us + * if there was no cache hit. + */ + if (metricptr != NULL) + bzero(metricptr, sizeof(struct hc_metrics_lite)); + return; + } + + /* What have we got? */ + switch (offer) { + case 0: + /* + * Offer == 0 means that there was no MSS on the SYN + * segment, in this case we use tcp_mssdflt as + * already assigned to t_maxopd above. + */ + offer = tp->t_maxopd; + break; + + case -1: + /* + * Offer == -1 means that we didn't receive SYN yet. 
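+ * (Illustrative walk-through of the code below: IPv4, link MTU 1500, + * empty hostcache, offer 1460: mss = 1500 - 40 = 1460; if timestamps + * are in use, TCPOLEN_TSTAMP_APPA (12 bytes) is later shaved off, and + * values above MCLBYTES are rounded down to a cluster multiple.)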
+ */ + /* FALLTHROUGH */ + + default: + /* + * Prevent DoS attack with too small MSS. Round up + * to at least minmss. + */ + offer = max(offer, V_tcp_minmss); + } + + /* + * rmx information is now retrieved from tcp_hostcache. + */ + tcp_hc_get(&inp->inp_inc, &metrics); + if (metricptr != NULL) + bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite)); + + /* + * If there's a discovered mtu in the tcp hostcache, use it; + * else, use the link mtu. + */ + if (metrics.rmx_mtu) + mss = min(metrics.rmx_mtu, maxmtu) - min_protoh; + else { +#ifdef INET6 + if (isipv6) { + mss = maxmtu - min_protoh; + if (!V_path_mtu_discovery && + !in6_localaddr(&inp->in6p_faddr)) + mss = min(mss, V_tcp_v6mssdflt); + } else +#endif + { + mss = maxmtu - min_protoh; + if (!V_path_mtu_discovery && + !in_localaddr(inp->inp_faddr)) + mss = min(mss, V_tcp_mssdflt); + } + /* + * XXX - The above conditional (mss = maxmtu - min_protoh) + * probably violates the TCP spec. + * The problem is that, since we don't know the + * other end's MSS, we are supposed to use a conservative + * default. But, if we do that, then MTU discovery will + * never actually take place, because the conservative + * default is much less than the MTUs typically seen + * on the Internet today. For the moment, we'll sweep + * this under the carpet. + * + * The conservative default might not actually be a problem + * if the only case this occurs is when sending an initial + * SYN with options and data to a host we've never talked + * to before. Then, they will reply with an MSS value which + * will get recorded and the new parameters should get + * recomputed. For Further Study. + */ + } + mss = min(mss, offer); + + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even if all + * the option space is used (40 bytes). Otherwise + * funny things may happen in tcp_output. + */ + mss = max(mss, 64); + + /* + * maxopd stores the maximum length of data AND options + * in a segment; maxseg is the amount of data in a normal + * segment. We need to store this value (maxopd) apart + * from maxseg, because now every segment carries options + * and thus we normally have somewhat less data in segments. + */ + tp->t_maxopd = mss; + + /* + * origoffer==-1 indicates that no segments were received yet. + * In this case we just guess. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP && + (origoffer == -1 || + (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) + mss -= TCPOLEN_TSTAMP_APPA; + +#if (MCLBYTES & (MCLBYTES - 1)) == 0 + if (mss > MCLBYTES) + mss &= ~(MCLBYTES-1); +#else + if (mss > MCLBYTES) + mss = mss / MCLBYTES * MCLBYTES; +#endif + tp->t_maxseg = mss; +} + +void +tcp_mss(struct tcpcb *tp, int offer) +{ + int rtt, mss; + u_long bufsize; + struct inpcb *inp; + struct socket *so; + struct hc_metrics_lite metrics; + int mtuflags = 0; +#ifdef INET6 + int isipv6; +#endif + KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); + + tcp_mss_update(tp, offer, &metrics, &mtuflags); + + mss = tp->t_maxseg; + inp = tp->t_inpcb; +#ifdef INET6 + isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; +#endif + + /* + * If there's a pipesize, change the socket buffer to that size, + * don't change if sb_hiwat is different from the default (then it + * has been changed on purpose with setsockopt). + * Make the socket buffers an integral number of mss units; + * if the mss is larger than the socket buffer, decrease the mss.
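+ * (Illustrative arithmetic: mss = 1460 and rmx_sendpipe = 65536 round + * up to 45 * 1460 = 65700 bytes below, subject to the sb_max cap.)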
+ */ + so = inp->inp_socket; + SOCKBUF_LOCK(&so->so_snd); + if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) + bufsize = metrics.rmx_sendpipe; + else + bufsize = so->so_snd.sb_hiwat; + if (bufsize < mss) + mss = bufsize; + else { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + if (bufsize > so->so_snd.sb_hiwat) + (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL); + } + SOCKBUF_UNLOCK(&so->so_snd); + tp->t_maxseg = mss; + + SOCKBUF_LOCK(&so->so_rcv); + if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) + bufsize = metrics.rmx_recvpipe; + else + bufsize = so->so_rcv.sb_hiwat; + if (bufsize > mss) { + bufsize = roundup(bufsize, mss); + if (bufsize > sb_max) + bufsize = sb_max; + if (bufsize > so->so_rcv.sb_hiwat) + (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL); + } + SOCKBUF_UNLOCK(&so->so_rcv); + /* + * While we're here, check the others too. + */ + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + TCPSTAT_INC(tcps_usedrtt); + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + TCPSTAT_INC(tcps_usedrttvar); + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); + TCPSTAT_INC(tcps_usedssthresh); + } + if (metrics.rmx_bandwidth) + tp->snd_bandwidth = metrics.rmx_bandwidth; + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not to make cwnd bigger than the remote receive window or our own + * send socket buffer. Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so the next connection doesn't + * overload the path again. + * + * XXXAO: Initializing the CWND from the hostcache is broken + * and in its current form not RFC conformant. It is disabled + * until fixed or removed entirely. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't get lost. + * We currently check only in syncache_socket for that. + */ +/* #define TCP_METRICS_CWND */ +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(mss, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif + if (V_tcp_do_rfc3390) + tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); +#ifdef INET6 + else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || + (!isipv6 && in_localaddr(inp->inp_faddr))) +#else + else if (in_localaddr(inp->inp_faddr)) +#endif + tp->snd_cwnd = mss * V_ss_fltsz_local; + else + tp->snd_cwnd = mss * V_ss_fltsz; + + /* Check the interface for TSO capabilities. */ + if (mtuflags & CSUM_TSO) + tp->t_flags |= TF_TSO; +} + +/* + * Determine the MSS option to send on an outgoing SYN.
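+ * (Illustrative: for IPv4 with interface MTU 1500 and no hostcache + * entry, the code below offers max(1500, 0) - 40 = 1460.)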
+ */ +int +tcp_mssopt(struct in_conninfo *inc) +{ + int mss = 0; + u_long maxmtu = 0; + u_long thcmtu = 0; + size_t min_protoh; + + KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); + +#ifdef INET6 + if (inc->inc_flags & INC_ISIPV6) { + mss = V_tcp_v6mssdflt; + maxmtu = tcp_maxmtu6(inc, NULL); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else +#endif + { + mss = V_tcp_mssdflt; + maxmtu = tcp_maxmtu(inc, NULL); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct tcpiphdr); + } + if (maxmtu && thcmtu) + mss = min(maxmtu, thcmtu) - min_protoh; + else if (maxmtu || thcmtu) + mss = max(maxmtu, thcmtu) - min_protoh; + + return (mss); +} + + +/* + * When a partial ack arrives, force the retransmission of the + * next unacknowledged segment. Do not clear tp->t_dupacks. + * By setting snd_nxt to th_ack, this forces the retransmission timer to + * be started again. + */ +static void +tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) +{ + tcp_seq onxt = tp->snd_nxt; + u_long ocwnd = tp->snd_cwnd; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rtttime = 0; + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset. + * (tp->snd_una has not yet been updated when this function is called.) + */ + tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + /* + * Partial window deflation. Relies on the fact that tp->snd_una + * is not updated yet. + */ + if (tp->snd_cwnd > th->th_ack - tp->snd_una) + tp->snd_cwnd -= th->th_ack - tp->snd_una; + else + tp->snd_cwnd = 0; + tp->snd_cwnd += tp->t_maxseg; +} diff --git a/freebsd/sys/netinet/tcp_lro.c b/freebsd/sys/netinet/tcp_lro.c new file mode 100644 index 00000000..6aaff4a5 --- /dev/null +++ b/freebsd/sys/netinet/tcp_lro.c @@ -0,0 +1,389 @@ +#include + +/****************************************************************************** + +Copyright (c) 2007, Myricom Inc. +Copyright (c) 2008, Intel Corporation. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Myricom Inc, nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + 3. Neither the name of the Intel Corporation, nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ +***************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + + +static uint16_t do_csum_data(uint16_t *raw, int len) +{ + uint32_t csum; + csum = 0; + while (len > 0) { + csum += *raw; + raw++; + csum += *raw; + raw++; + len -= 4; + } + csum = (csum >> 16) + (csum & 0xffff); + csum = (csum >> 16) + (csum & 0xffff); + return (uint16_t)csum; +} + +/* + * Allocate and init the LRO data structures + */ +int +tcp_lro_init(struct lro_ctrl *cntl) +{ + struct lro_entry *lro; + int i, error = 0; + + SLIST_INIT(&cntl->lro_free); + SLIST_INIT(&cntl->lro_active); + + cntl->lro_bad_csum = 0; + cntl->lro_queued = 0; + cntl->lro_flushed = 0; + + for (i = 0; i < LRO_ENTRIES; i++) { + lro = (struct lro_entry *) malloc(sizeof (struct lro_entry), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (lro == NULL) { + if (i == 0) + error = ENOMEM; + break; + } + cntl->lro_cnt = i; + SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); + } + + return (error); +} + +void +tcp_lro_free(struct lro_ctrl *cntl) +{ + struct lro_entry *entry; + + while (!SLIST_EMPTY(&cntl->lro_free)) { + entry = SLIST_FIRST(&cntl->lro_free); + SLIST_REMOVE_HEAD(&cntl->lro_free, next); + free(entry, M_DEVBUF); + } +} + +void +tcp_lro_flush(struct lro_ctrl *cntl, struct lro_entry *lro) +{ + struct ifnet *ifp; + struct ip *ip; + struct tcphdr *tcp; + uint32_t *ts_ptr; + uint32_t tcplen, tcp_csum; + + + if (lro->append_cnt) { + /* incorporate the new len into the ip header and + * re-calculate the checksum */ + ip = lro->ip; + ip->ip_len = htons(lro->len - ETHER_HDR_LEN); + ip->ip_sum = 0; + ip->ip_sum = 0xffff ^ + do_csum_data((uint16_t*)ip, + sizeof (*ip)); + + lro->m_head->m_pkthdr.csum_flags = CSUM_IP_CHECKED | + CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + lro->m_head->m_pkthdr.csum_data = 0xffff; + lro->m_head->m_pkthdr.len = lro->len; + + /* incorporate the latest ack into the tcp header */ + tcp = (struct tcphdr *) (ip + 1); + tcp->th_ack = lro->ack_seq; + tcp->th_win = lro->window; + /* incorporate latest timestamp into the tcp header */ + if (lro->timestamp) { + ts_ptr = (uint32_t *)(tcp + 1); + ts_ptr[1] = htonl(lro->tsval); + ts_ptr[2] = lro->tsecr; + } + /* + * update checksum in tcp header by re-calculating the + * tcp pseudoheader checksum, and adding it to the checksum + * of the tcp payload data + */ + tcp->th_sum = 0; + tcplen = lro->len - sizeof(*ip) - ETHER_HDR_LEN; + tcp_csum = lro->data_csum; + tcp_csum += in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(tcplen + IPPROTO_TCP)); + tcp_csum += do_csum_data((uint16_t*)tcp, + tcp->th_off << 2); + tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); + tcp_csum = (tcp_csum & 0xffff) + (tcp_csum >> 16); + tcp->th_sum = 0xffff ^ tcp_csum; + } + ifp = cntl->ifp; + (*ifp->if_input)(cntl->ifp, lro->m_head); + cntl->lro_queued += lro->append_cnt + 1; + cntl->lro_flushed++; + 
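/* + * Illustrative aside on do_csum_data() above (invented numbers): it + * folds the 32-bit sum into 16 bits twice, e.g. a running sum of + * 0x1ffff folds first to 0x10000 and only the second fold yields + * 0x0001. It also consumes two 16-bit words per iteration, so callers + * pass lengths that are multiples of 4 (the 20-byte IP header, and + * TCP headers are 4-byte multiples by construction). + */ +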
lro->m_head = NULL; + lro->timestamp = 0; + lro->append_cnt = 0; + SLIST_INSERT_HEAD(&cntl->lro_free, lro, next); +} + +int +tcp_lro_rx(struct lro_ctrl *cntl, struct mbuf *m_head, uint32_t csum) +{ + struct ether_header *eh; + struct ip *ip; + struct tcphdr *tcp; + uint32_t *ts_ptr; + struct mbuf *m_nxt, *m_tail; + struct lro_entry *lro; + int hlen, ip_len, tcp_hdr_len, tcp_data_len, tot_len; + int opt_bytes, trim, csum_flags; + uint32_t seq, tmp_csum, device_mtu; + + + eh = mtod(m_head, struct ether_header *); + if (eh->ether_type != htons(ETHERTYPE_IP)) + return 1; + ip = (struct ip *) (eh + 1); + if (ip->ip_p != IPPROTO_TCP) + return 1; + + /* ensure there are no options */ + if ((ip->ip_hl << 2) != sizeof (*ip)) + return -1; + + /* .. and the packet is not fragmented */ + if (ip->ip_off & htons(IP_MF|IP_OFFMASK)) + return -1; + + /* verify that the IP header checksum is correct */ + csum_flags = m_head->m_pkthdr.csum_flags; + if (csum_flags & CSUM_IP_CHECKED) { + if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { + cntl->lro_bad_csum++; + return -1; + } + } else { + tmp_csum = do_csum_data((uint16_t *)ip, sizeof (*ip)); + if (__predict_false((tmp_csum ^ 0xffff) != 0)) { + cntl->lro_bad_csum++; + return -1; + } + } + + /* find the TCP header */ + tcp = (struct tcphdr *) (ip + 1); + + /* Get the TCP checksum if we dont have it */ + if (!csum) + csum = tcp->th_sum; + + /* ensure no bits set besides ack or psh */ + if ((tcp->th_flags & ~(TH_ACK | TH_PUSH)) != 0) + return -1; + + /* check for timestamps. Since the only option we handle are + timestamps, we only have to handle the simple case of + aligned timestamps */ + + opt_bytes = (tcp->th_off << 2) - sizeof (*tcp); + tcp_hdr_len = sizeof (*tcp) + opt_bytes; + ts_ptr = (uint32_t *)(tcp + 1); + if (opt_bytes != 0) { + if (__predict_false(opt_bytes != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP))) + return -1; + } + + ip_len = ntohs(ip->ip_len); + tcp_data_len = ip_len - (tcp->th_off << 2) - sizeof (*ip); + + + /* + * If frame is padded beyond the end of the IP packet, + * then we must trim the extra bytes off the end. 
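+ * (Illustrative: a minimum 60-byte Ethernet frame carrying a 40-byte + * pure ACK has ip_len = 40, so trim = 60 - (40 + 14) = 6 bytes of + * padding to strip below.)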
+ */ + tot_len = m_head->m_pkthdr.len; + trim = tot_len - (ip_len + ETHER_HDR_LEN); + if (trim != 0) { + if (trim < 0) { + /* truncated packet */ + return -1; + } + m_adj(m_head, -trim); + tot_len = m_head->m_pkthdr.len; + } + + m_nxt = m_head; + m_tail = NULL; /* -Wuninitialized */ + while (m_nxt != NULL) { + m_tail = m_nxt; + m_nxt = m_tail->m_next; + } + + hlen = ip_len + ETHER_HDR_LEN - tcp_data_len; + seq = ntohl(tcp->th_seq); + + SLIST_FOREACH(lro, &cntl->lro_active, next) { + if (lro->source_port == tcp->th_sport && + lro->dest_port == tcp->th_dport && + lro->source_ip == ip->ip_src.s_addr && + lro->dest_ip == ip->ip_dst.s_addr) { + /* Try to append it */ + + if (__predict_false(seq != lro->next_seq)) { + /* out of order packet */ + SLIST_REMOVE(&cntl->lro_active, lro, + lro_entry, next); + tcp_lro_flush(cntl, lro); + return -1; + } + + if (opt_bytes) { + uint32_t tsval = ntohl(*(ts_ptr + 1)); + /* make sure timestamp values are increasing */ + if (__predict_false(lro->tsval > tsval || + *(ts_ptr + 2) == 0)) { + return -1; + } + lro->tsval = tsval; + lro->tsecr = *(ts_ptr + 2); + } + + lro->next_seq += tcp_data_len; + lro->ack_seq = tcp->th_ack; + lro->window = tcp->th_win; + lro->append_cnt++; + if (tcp_data_len == 0) { + m_freem(m_head); + return 0; + } + /* subtract off the checksum of the tcp header + * from the hardware checksum, and add it to the + * stored tcp data checksum. Byteswap the checksum + * if the total length so far is odd + */ + tmp_csum = do_csum_data((uint16_t*)tcp, + tcp_hdr_len); + csum = csum + (tmp_csum ^ 0xffff); + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + if (lro->len & 0x1) { + /* Odd number of bytes so far, flip bytes */ + csum = ((csum << 8) | (csum >> 8)) & 0xffff; + } + csum = csum + lro->data_csum; + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + lro->data_csum = csum; + + lro->len += tcp_data_len; + + /* adjust mbuf so that m->m_data points to + the first byte of the payload */ + m_adj(m_head, hlen); + /* append mbuf chain */ + lro->m_tail->m_next = m_head; + /* advance the last pointer */ + lro->m_tail = m_tail; + /* flush packet if required */ + device_mtu = cntl->ifp->if_mtu; + if (lro->len > (65535 - device_mtu)) { + SLIST_REMOVE(&cntl->lro_active, lro, + lro_entry, next); + tcp_lro_flush(cntl, lro); + } + return 0; + } + } + + if (SLIST_EMPTY(&cntl->lro_free)) + return -1; + + /* start a new chain */ + lro = SLIST_FIRST(&cntl->lro_free); + SLIST_REMOVE_HEAD(&cntl->lro_free, next); + SLIST_INSERT_HEAD(&cntl->lro_active, lro, next); + lro->source_port = tcp->th_sport; + lro->dest_port = tcp->th_dport; + lro->source_ip = ip->ip_src.s_addr; + lro->dest_ip = ip->ip_dst.s_addr; + lro->next_seq = seq + tcp_data_len; + lro->mss = tcp_data_len; + lro->ack_seq = tcp->th_ack; + lro->window = tcp->th_win; + + /* save the checksum of just the TCP payload by + * subtracting off the checksum of the TCP header from + * the entire hardware checksum + * Since IP header checksum is correct, checksum over + * the IP header is -0. Substracting -0 is unnecessary. 
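+ * An illustrative note on the odd-length flip used above: the + * Internet checksum is endian-neutral only at 16-bit granularity, so + * when the bytes accumulated so far end at an odd offset the next + * chunk's words are misaligned by one byte, which is corrected by + * rotating the running sum: csum = ((csum << 8) | (csum >> 8)) & 0xffff.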
+ */ + tmp_csum = do_csum_data((uint16_t*)tcp, tcp_hdr_len); + csum = csum + (tmp_csum ^ 0xffff); + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + lro->data_csum = csum; + + lro->ip = ip; + /* record timestamp if it is present */ + if (opt_bytes) { + lro->timestamp = 1; + lro->tsval = ntohl(*(ts_ptr + 1)); + lro->tsecr = *(ts_ptr + 2); + } + lro->len = tot_len; + lro->m_head = m_head; + lro->m_tail = m_tail; + return 0; +} diff --git a/freebsd/sys/netinet/tcp_lro.h b/freebsd/sys/netinet/tcp_lro.h new file mode 100644 index 00000000..20cfb7cf --- /dev/null +++ b/freebsd/sys/netinet/tcp_lro.h @@ -0,0 +1,85 @@ +/******************************************************************************* + +Copyright (c) 2006, Myricom Inc. +Copyright (c) 2008, Intel Corporation. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Myricom Inc, nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + 2. Neither the name of the Intel Corporation, nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ + +$FreeBSD$ + +***************************************************************************/ +#ifndef _TCP_LRO_HH_ +#define _TCP_LRO_HH_ + +struct lro_entry; +struct lro_entry +{ + SLIST_ENTRY(lro_entry) next; + struct mbuf *m_head; + struct mbuf *m_tail; + int timestamp; + struct ip *ip; + uint32_t tsval; + uint32_t tsecr; + uint32_t source_ip; + uint32_t dest_ip; + uint32_t next_seq; + uint32_t ack_seq; + uint32_t len; + uint32_t data_csum; + uint16_t window; + uint16_t source_port; + uint16_t dest_port; + uint16_t append_cnt; + uint16_t mss; + +}; +SLIST_HEAD(lro_head, lro_entry); + +struct lro_ctrl { + struct ifnet *ifp; + int lro_queued; + int lro_flushed; + int lro_bad_csum; + int lro_cnt; + + struct lro_head lro_active; + struct lro_head lro_free; +}; + + +int tcp_lro_init(struct lro_ctrl *); +void tcp_lro_free(struct lro_ctrl *); +void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); +int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); + +/* Number of LRO entries - these are per rx queue */ +#define LRO_ENTRIES 8 + +#endif /* _TCP_LRO_HH_ */ diff --git a/freebsd/sys/netinet/tcp_offload.c b/freebsd/sys/netinet/tcp_offload.c new file mode 100644 index 00000000..9c73992b --- /dev/null +++ b/freebsd/sys/netinet/tcp_offload.c @@ -0,0 +1,147 @@ +#include + +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +uint32_t toedev_registration_count; + +int +tcp_offload_connect(struct socket *so, struct sockaddr *nam) +{ + struct ifnet *ifp; + struct toedev *tdev; + struct rtentry *rt; + int error; + + if (toedev_registration_count == 0) + return (EINVAL); + + /* + * Look up the route used for the connection to + * determine if it uses an interface capable of + * offloading the connection. 
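+ * (Illustrative flow, not from the original comment: a route out an + * interface with IFCAP_TOE set in if_capenable proceeds to the + * driver's tod_can_offload() check below; any other route returns an + * error, presumably leaving the caller to fall back to the software + * stack.)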
+ */ + rt = rtalloc1(nam, 0 /*report*/, 0 /*ignflags*/); + if (rt) + RT_UNLOCK(rt); + else + return (EHOSTUNREACH); + + ifp = rt->rt_ifp; + if ((ifp->if_capenable & IFCAP_TOE) == 0) { + error = EINVAL; + goto fail; + } + + tdev = TOEDEV(ifp); + if (tdev == NULL) { + error = EPERM; + goto fail; + } + + if (tdev->tod_can_offload(tdev, so) == 0) { + error = EPERM; + goto fail; + } + + return (tdev->tod_connect(tdev, so, rt, nam)); +fail: + RTFREE(rt); + return (error); +} + + +/* + * This file contains code as a short-term staging area before it is moved in + * to sys/netinet/tcp_offload.c + */ + +void +tcp_offload_twstart(struct tcpcb *tp) +{ + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(tp->t_inpcb); + tcp_twstart(tp); + INP_INFO_WUNLOCK(&V_tcbinfo); +} + +struct tcpcb * +tcp_offload_close(struct tcpcb *tp) +{ + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(tp->t_inpcb); + tp = tcp_close(tp); + INP_INFO_WUNLOCK(&V_tcbinfo); + if (tp) + INP_WUNLOCK(tp->t_inpcb); + + return (tp); +} + +struct tcpcb * +tcp_offload_drop(struct tcpcb *tp, int error) +{ + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(tp->t_inpcb); + tp = tcp_drop(tp, error); + INP_INFO_WUNLOCK(&V_tcbinfo); + if (tp) + INP_WUNLOCK(tp->t_inpcb); + + return (tp); +} + diff --git a/freebsd/sys/netinet/tcp_offload.h b/freebsd/sys/netinet/tcp_offload.h new file mode 100644 index 00000000..f2a35a58 --- /dev/null +++ b/freebsd/sys/netinet/tcp_offload.h @@ -0,0 +1,354 @@ +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_OFFLOAD_HH_ +#define _NETINET_TCP_OFFLOAD_HH_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +/* + * A driver publishes that it provides offload services + * by setting IFCAP_TOE in the ifnet. The offload connect + * will bypass any further work if the interface that a + * connection would use does not support TCP offload. + * + * The TOE API assumes that the tcp offload engine can offload + * the entire connection from set up to teardown, with some provision + * being made to allow the software stack to handle time wait.
If + * the device does not meet these criteria, it is the driver's responsibility + * to overload the functions that it needs to in tcp_usrreqs and make + * its own calls to tcp_output if it needs to do so. + * + * There is currently no provision for the device advertising the congestion + * control algorithms it supports, as there is no API for querying + * an operating system for the protocols that it has loaded. This is a desirable + * future extension. + * + * It is assumed that individuals deploying TOE will want connections + * to be offloaded without software changes, so all connections on an + * interface providing TOE are offloaded unless the SO_NO_OFFLOAD + * flag is set on the socket. + * + * The toe_usrreqs structure constitutes the TOE driver's + * interface to the TCP stack for functionality that doesn't + * interact directly with userspace. If one wants to provide + * (optional) functionality to do zero-copy to/from + * userspace one still needs to override soreceive/sosend + * with functions that fault in and pin the user buffers. + * + * + tu_send + * - tells the driver that new data may have been added to the + * socket's send buffer - the driver should not fail if the + * buffer is in fact unchanged + * - the driver is responsible for providing credits (bytes in the send window) + * back to the socket by calling sbdrop() as segments are acknowledged. + * - The driver expects the inpcb lock to be held - the driver is expected + * not to drop the lock. Hence the driver is not allowed to acquire the + * pcbinfo lock during this call. + * + * + tu_rcvd + * - returns credits to the driver and triggers window updates + * to the peer (a credit as used here is a byte in the peer's receive window) + * - the driver is expected to determine how many bytes have been + * consumed and credit that back to the card so that it can grow + * the window again by maintaining its own state between invocations. + * - In principle this could be used to shrink the window as well as + * grow the window, although it is not used for that now. + * - this function needs to correctly handle being called any number of + * times without any bytes being consumed from the receive buffer. + * - The driver expects the inpcb lock to be held - the driver is expected + * not to drop the lock. Hence the driver is not allowed to acquire the + * pcbinfo lock during this call. + * + * + tu_disconnect + * - tells the driver to send FIN to peer + * - driver is expected to send the remaining data and then do a clean half close + * - disconnect implies at least half-close so only send, reset, and detach + * are legal + * - the driver is expected to handle transition through the shutdown + * state machine and allow the stack to support SO_LINGER. + * - The driver expects the inpcb lock to be held - the driver is expected + * not to drop the lock. Hence the driver is not allowed to acquire the + * pcbinfo lock during this call. + * + * + tu_reset + * - closes the connection and sends a RST to peer + * - driver is expected to trigger an RST and detach the toepcb + * - no further calls are legal after reset + * - The driver expects the inpcb lock to be held - the driver is expected + * not to drop the lock. Hence the driver is not allowed to acquire the + * pcbinfo lock during this call.
+ * + * The following fields in the tcpcb are expected to be referenced by the driver: + * + iss + * + rcv_nxt + * + rcv_wnd + * + snd_isn + * + snd_max + * + snd_nxt + * + snd_una + * + t_flags + * + t_inpcb + * + t_maxseg + * + t_toe + * + * The following fields in the inpcb are expected to be referenced by the driver: + * + inp_lport + * + inp_fport + * + inp_laddr + * + inp_faddr + * + inp_socket + * + inp_ip_tos + * + * The following fields in the socket are expected to be referenced by the + * driver: + * + so_comp + * + so_error + * + so_linger + * + so_options + * + so_rcv + * + so_snd + * + so_state + * + so_timeo + * + * These functions all return 0 on success and can return the following errors + * as appropriate: + * + EPERM: + * + ENOBUFS: memory allocation failed + * + EMSGSIZE: MTU changed during the call + * + EHOSTDOWN: + * + EHOSTUNREACH: + * + ENETDOWN: + * + ENETUNREACH: the peer is no longer reachable + * + * + tu_detach + * - tells driver that the socket is going away so disconnect + * the toepcb and free appropriate resources + * - allows the driver to cleanly handle the case of connection state + * outliving the socket + * - no further calls are legal after detach + * - the driver is expected to provide its own synchronization between + * detach and receiving new data. + * + * + tu_syncache_event + * - even if it is not actually needed, the driver is expected to + * call syncache_add for the initial SYN and then syncache_expand + * for the SYN,ACK + * - tells driver that a connection either has not been added or has + * been dropped from the syncache + * - the driver is expected to maintain state that lives outside the + * software stack so the syncache needs to be able to notify the + * toe driver that the software stack is not going to create a connection + * for a received SYN + * - The driver is responsible for any synchronization required between + * the syncache dropping an entry and the driver processing the SYN,ACK. + * + */ +struct toe_usrreqs { + int (*tu_send)(struct tcpcb *tp); + int (*tu_rcvd)(struct tcpcb *tp); + int (*tu_disconnect)(struct tcpcb *tp); + int (*tu_reset)(struct tcpcb *tp); + void (*tu_detach)(struct tcpcb *tp); + void (*tu_syncache_event)(int event, void *toep); +}; + +/* + * Proxy for struct tcpopt between TOE drivers and TCP functions. + */ +struct toeopt { + u_int64_t to_flags; /* see tcpopt in tcp_var.h */ + u_int16_t to_mss; /* maximum segment size */ + u_int8_t to_wscale; /* window scaling */ + + u_int8_t _pad1; /* explicit pad for 64bit alignment */ + u_int32_t _pad2; /* explicit pad for 64bit alignment */ + u_int64_t _pad3[4]; /* TBD */ +}; + +#define TOE_SC_ENTRY_PRESENT 1 /* 4-tuple already present */ +#define TOE_SC_DROP 2 /* connection was timed out */ + +/* + * Because listen is a one-to-many relationship (a socket can be listening + * on all interfaces on a machine some of which may be using different TCP + * offload devices), listen uses a publish/subscribe mechanism. The TCP + * offload driver registers a listen notification function with the stack. + * When a listen socket is created all TCP offload devices are notified + * so that they can do the appropriate set up to offload connections on the + * port to which the socket is bound. When the listen socket is closed, + * the offload devices are notified so that they will stop listening on that + * port and free any associated resources as well as sending RSTs on any + * connections in the SYN_RCVD state.
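To make the contract above concrete, here is a hedged sketch of how a TOE driver might publish these hooks; every example_* identifier is invented for this illustration, and the prototypes stand in for the driver's real handlers:

static int	example_send(struct tcpcb *tp);
static int	example_rcvd(struct tcpcb *tp);
static int	example_disconnect(struct tcpcb *tp);
static int	example_reset(struct tcpcb *tp);
static void	example_detach(struct tcpcb *tp);
static void	example_syncache_event(int event, void *toep);

static struct toe_usrreqs example_toe_usrreqs = {
	.tu_send	= example_send,
	.tu_rcvd	= example_rcvd,
	.tu_disconnect	= example_disconnect,
	.tu_reset	= example_reset,
	.tu_detach	= example_detach,
	.tu_syncache_event = example_syncache_event,
};

A driver would attach this table to a connection it has taken over by pointing tp->t_tu at it and setting TF_TOE in tp->t_flags, which is what the tp_offload() test below keys on.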
+ * + */ + +typedef void (*tcp_offload_listen_start_fn)(void *, struct tcpcb *); +typedef void (*tcp_offload_listen_stop_fn)(void *, struct tcpcb *); + +EVENTHANDLER_DECLARE(tcp_offload_listen_start, tcp_offload_listen_start_fn); +EVENTHANDLER_DECLARE(tcp_offload_listen_stop, tcp_offload_listen_stop_fn); + +/* + * Check if the socket can be offloaded by the following steps: + * - determine the egress interface + * - check the interface for TOE capability and TOE is enabled + * - check if the device has resources to offload the connection + */ +int tcp_offload_connect(struct socket *so, struct sockaddr *nam); + +/* + * The tcp_output_* routines are wrappers around the toe_usrreqs calls + * which trigger packet transmission. In the non-offloaded case they + * translate to tcp_output. The tcp_offload_* routines notify TOE + * of specific events. In the non-offloaded case they are no-ops. + * + * Listen is a special case because it is a 1 to many relationship + * and there can be more than one offload driver in the system. + */ + +/* + * Connection is offloaded + */ +#define tp_offload(tp) ((tp)->t_flags & TF_TOE) + +/* + * hackish way of allowing this file to also be included by TOE + * which needs to be kept ignorant of socket implementation details + */ +#ifdef _SYS_SOCKETVAR_HH_ +/* + * The socket has not been marked as "do not offload" + */ +#define SO_OFFLOADABLE(so) ((so->so_options & SO_NO_OFFLOAD) == 0) + +static __inline int +tcp_output_connect(struct socket *so, struct sockaddr *nam) +{ + struct tcpcb *tp = sototcpcb(so); + int error; + + /* + * If offload has been disabled for this socket or the + * connection cannot be offloaded just call tcp_output + * to start the TCP state machine. + */ +#ifndef TCP_OFFLOAD_DISABLE + if (!SO_OFFLOADABLE(so) || (error = tcp_offload_connect(so, nam)) != 0) +#endif + error = tcp_output(tp); + return (error); +} + +static __inline int +tcp_output_send(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (tp_offload(tp)) + return (tp->t_tu->tu_send(tp)); +#endif + return (tcp_output(tp)); +} + +static __inline int +tcp_output_rcvd(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (tp_offload(tp)) + return (tp->t_tu->tu_rcvd(tp)); +#endif + return (tcp_output(tp)); +} + +static __inline int +tcp_output_disconnect(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (tp_offload(tp)) + return (tp->t_tu->tu_disconnect(tp)); +#endif + return (tcp_output(tp)); +} + +static __inline int +tcp_output_reset(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (tp_offload(tp)) + return (tp->t_tu->tu_reset(tp)); +#endif + return (tcp_output(tp)); +} + +static __inline void +tcp_offload_detach(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (tp_offload(tp)) + tp->t_tu->tu_detach(tp); +#endif +} + +static __inline void +tcp_offload_listen_open(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + if (SO_OFFLOADABLE(tp->t_inpcb->inp_socket)) + EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp); +#endif +} + +static __inline void +tcp_offload_listen_close(struct tcpcb *tp) +{ + +#ifndef TCP_OFFLOAD_DISABLE + EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp); +#endif +} +#undef SO_OFFLOADABLE +#endif /* _SYS_SOCKETVAR_HH_ */ +#undef tp_offload + +void tcp_offload_twstart(struct tcpcb *tp); +struct tcpcb *tcp_offload_close(struct tcpcb *tp); +struct tcpcb *tcp_offload_drop(struct tcpcb *tp, int error); + +#endif /* _NETINET_TCP_OFFLOAD_HH_ */ diff --git a/freebsd/sys/netinet/tcp_output.c b/freebsd/sys/netinet/tcp_output.c new file mode 100644
index 00000000..bebab1f1 --- /dev/null +++ b/freebsd/sys/netinet/tcp_output.c @@ -0,0 +1,1485 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#endif +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +#ifdef IPSEC +#include +#endif /*IPSEC*/ + +#include + +#include + +#ifdef notyet +extern struct mbuf *m_copypack(); +#endif + +VNET_DEFINE(int, path_mtu_discovery) = 1; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, + &VNET_NAME(path_mtu_discovery), 1, + "Enable Path MTU Discovery"); + +VNET_DEFINE(int, ss_fltsz) = 1; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, + &VNET_NAME(ss_fltsz), 1, + "Slow start flight size"); + +VNET_DEFINE(int, ss_fltsz_local) = 4; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, + CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1, + "Slow start flight size for local networks"); + +VNET_DEFINE(int, tcp_do_newreno) = 1; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, + &VNET_NAME(tcp_do_newreno), 0, + "Enable NewReno Algorithms"); + +VNET_DEFINE(int, tcp_do_tso) = 1; +#define V_tcp_do_tso VNET(tcp_do_tso) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, + &VNET_NAME(tcp_do_tso), 0, + "Enable TCP Segmentation Offload"); + +VNET_DEFINE(int, tcp_do_autosndbuf) = 1; +#define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, + &VNET_NAME(tcp_do_autosndbuf), 0, + "Enable automatic send buffer sizing"); + 
+VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024; +#define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, + &VNET_NAME(tcp_autosndbuf_inc), 0, + "Incrementor step size of automatic send buffer"); + +VNET_DEFINE(int, tcp_autosndbuf_max) = 256*1024; +#define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, + &VNET_NAME(tcp_autosndbuf_max), 0, + "Max size of automatic send buffer"); + + +/* + * Tcp output routine: figure out what should be sent and send it. + */ +int +tcp_output(struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + long len, recwin, sendwin; + int off, flags, error, rw; + struct mbuf *m; + struct ip *ip = NULL; + struct ipovly *ipov = NULL; + struct tcphdr *th; + u_char opt[TCP_MAXOLEN]; + unsigned ipoptlen, optlen, hdrlen; +#ifdef IPSEC + unsigned ipsec_optlen = 0; +#endif + int idle, sendalot; + int sack_rxmit, sack_bytes_rxmt; + struct sackhole *p; + int tso; + struct tcpopt to; +#if 0 + int maxburst = TCP_MAXBURST; +#endif +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6; + + isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. + */ + idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); + if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) { + /* + * If we've been idle for more than one retransmit + * timeout the old congestion window is no longer + * current and we have to reduce it to the restart + * window before we can transmit again. + * + * The restart window is the initial window or the last + * CWND, whichever is smaller. + * + * This is done to prevent us from flooding the path with + * a full CWND at wirespeed, overloading router and switch + * buffers along the way. + * + * See RFC5681 Section 4.1. "Restarting Idle Connections". + */ + if (V_tcp_do_rfc3390) + rw = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); +#ifdef INET6 + else if ((isipv6 ? in6_localaddr(&tp->t_inpcb->in6p_faddr) : + in_localaddr(tp->t_inpcb->inp_faddr))) +#else + else if (in_localaddr(tp->t_inpcb->inp_faddr)) +#endif + rw = V_ss_fltsz_local * tp->t_maxseg; + else + rw = V_ss_fltsz * tp->t_maxseg; + + tp->snd_cwnd = min(rw, tp->snd_cwnd); + } + tp->t_flags &= ~TF_LASTIDLE; + if (idle) { + if (tp->t_flags & TF_MORETOCOME) { + tp->t_flags |= TF_LASTIDLE; + idle = 0; + } + } +again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + if ((tp->t_flags & TF_SACK_PERMIT) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); + sendalot = 0; + tso = 0; + off = tp->snd_nxt - tp->snd_una; + sendwin = min(tp->snd_wnd, tp->snd_cwnd); + sendwin = min(sendwin, tp->snd_bwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Send any SACK-generated retransmissions. If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. + * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). 
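A worked example of the SACK retransmission budget computed just below (invented but typical numbers): with snd_wnd = 65535, snd_cwnd = 8760 and sack_bytes_rxmt = 2920, the budget is

	cwin = min(65535, 8760) - 2920 = 5840 bytes,

and a hole whose end - rxmit span is 4380 bytes yields len = min(5840, 4380) = 4380, i.e. three 1460-byte segments' worth of data are eligible for retransmission from the scoreboard on this pass.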
+ */ + /* + * Still in sack recovery , reset rxmit flag to zero. + */ + sack_rxmit = 0; + sack_bytes_rxmt = 0; + len = 0; + p = NULL; + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && + (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { + long cwin; + + cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) { + /* + * (At least) part of sack hole extends beyond + * snd_recover. Check to see if we can rexmit data + * for this hole. + */ + if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { + /* + * Can't rexmit any more data for this hole. + * That data will be rexmitted in the next + * sack recovery episode, when snd_recover + * moves past p->rxmit. + */ + p = NULL; + goto after_sack_rexmit; + } else + /* Can rexmit part of the current hole */ + len = ((long)ulmin(cwin, + tp->snd_recover - p->rxmit)); + } else + len = ((long)ulmin(cwin, p->end - p->rxmit)); + off = p->rxmit - tp->snd_una; + KASSERT(off >= 0,("%s: sack block to the left of una : %d", + __func__, off)); + if (len > 0) { + sack_rxmit = 1; + sendalot = 1; + TCPSTAT_INC(tcps_sack_rexmits); + TCPSTAT_ADD(tcps_sack_rexmit_bytes, + min(len, tp->t_maxseg)); + } + } +after_sack_rexmit: + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + SOCKBUF_LOCK(&so->so_snd); + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_flags & TF_FORCEDATA) { + if (sendwin == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. However, + * if we have exactly 1 byte of unsent data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (off < so->so_snd.sb_cc) + flags &= ~TH_FIN; + sendwin = 1; + } else { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; + } + } + + /* + * If snd_nxt == snd_max and we have transmitted a FIN, the + * offset will be > 0 even if so_snd.sb_cc is 0, resulting in + * a negative length. This can also occur when TCP opens up + * its congestion window while receiving additional duplicate + * acks after fast-retransmit because TCP will reset snd_nxt + * to snd_max after the fast-retransmit. + * + * In the normal retransmit-FIN-only case, however, snd_nxt will + * be set to snd_una, the offset will be 0, and the length may + * wind up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. + */ + if (sack_rxmit == 0) { + if (sack_bytes_rxmt == 0) + len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); + else { + long cwin; + + /* + * We are inside of a SACK recovery episode and are + * sending new data, having retransmitted all the + * data possible in the scoreboard. + */ + len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) + - off); + /* + * Don't remove this (len > 0) check ! 
+ * We explicitly check for len > 0 here (although it + * isn't really necessary), to work around a gcc + * optimization issue - to force gcc to compute + * len above. Without this check, the computation + * of len is bungled by the optimizer. + */ + if (len > 0) { + cwin = tp->snd_cwnd - + (tp->snd_nxt - tp->sack_newdata) - + sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + len = lmin(len, cwin); + } + } + } + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + if (tp->t_state != TCPS_SYN_RECEIVED) + flags &= ~TH_SYN; + off--, len++; + } + + /* + * Be careful not to send data and/or FIN on SYN segments. + * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be < 0. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (sendwin == 0) { + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_setpersist(tp); + } + } + + /* len will be >= 0 after this point. */ + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + + /* + * Automatic sizing of send socket buffer. Often the send buffer + * size is not optimally adjusted to the actual network conditions + * at hand (delay bandwidth product). Setting the buffer size too + * small limits throughput on links with high bandwidth and high + * delay (e.g. trans-continental/oceanic links). Setting the + * buffer size too big consumes too much real kernel memory, + * especially with many connections on busy servers. + * + * The criteria to step up the send buffer one notch are: + * 1. receive window of remote host is larger than send buffer + * (with a fudge factor of 5/4th); + * 2. send buffer is filled to 7/8th with data (so we actually + * have data to make use of it); + * 3. send buffer fill has not hit maximal automatic size; + * 4. our send window (slow start and congestion controlled) is + * larger than sent but unacknowledged data in send buffer. + * + * The remote host receive window scaling factor may limit the + * growing of the send buffer before it reaches its allowed + * maximum. + * + * It scales directly with slow start or congestion window + * and does at most one step per received ACK. This fast + * scaling has the drawback of growing the send buffer beyond + * what is strictly necessary to make full use of a given + * delay*bandwidth product. However, testing has shown this not + * to be much of a problem. At worst we are trading wasting + * of available bandwidth (the non-use of it) for wasting some + * socket buffer memory. + * + * TODO: Shrink send buffer during idle periods together + * with congestion window. Requires another timer. Has to + * wait for upcoming tcp timer rewrite.
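A worked illustration of the four step-up criteria, with invented numbers (the code that follows implements them): for sb_hiwat = 32768 and V_tcp_autosndbuf_inc = 8 kB, the buffer grows to 40960 only once the peer's advertised window satisfies snd_wnd * 5/4 >= 32768 (roughly snd_wnd >= 26 kB), the buffer already holds at least 28672 bytes (7/8 full), the fill is still below V_tcp_autosndbuf_max, and the usable send window is at least as large as the not-yet-acknowledged portion of the buffer.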
+ */ + if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { + if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && + so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && + so->so_snd.sb_cc < V_tcp_autosndbuf_max && + sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + if (!sbreserve_locked(&so->so_snd, + min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc, + V_tcp_autosndbuf_max), so, curthread)) + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + } + } + + /* + * Truncate to the maximum segment length or enable TCP Segmentation + * Offloading (if supported by hardware) and ensure that FIN is removed + * if the length no longer contains the last data byte. + * + * TSO may only be used if we are in a pure bulk sending state. The + * presence of TCP-MD5, SACK retransmits, SACK advertisements and + * IP options prevent using TSO. With TSO the TCP header is the same + * (except for the sequence number) for all generated packets. This + * makes it impossible to transmit any options which vary per generated + * segment or packet. + * + * The length of TSO bursts is limited to TCP_MAXWIN. That limit and + * removal of FIN (if not already caught here) are handled later after + * the exact length of the TCP options is known. + */ +#ifdef IPSEC + /* + * Pre-calculate here as we save another lookup into the darknesses + * of IPsec that way and can actually decide if TSO is ok. + */ + ipsec_optlen = ipsec_hdrsiz_tcp(tp); +#endif + if (len > tp->t_maxseg) { + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && sack_rxmit == 0 && + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->in6p_options == NULL +#ifdef IPSEC + && ipsec_optlen == 0 +#endif + ) { + tso = 1; + } else { + len = tp->t_maxseg; + sendalot = 1; + } + } + + if (sack_rxmit) { + if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + } else { + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + } + + recwin = sbspace(&so->so_rcv); + + /* + * Sender silly window avoidance. We transmit under the following + * conditions when len is non-zero: + * + * - We have a full segment (or more with TSO) + * - This is the last buffer in a write()/send() and we are + * either idle or running NODELAY + * - we've timed out (e.g. persist timer) + * - we have more than 1/2 the maximum send window's worth of + * data (receiver may be limited by the window size) + * - we need to retransmit + */ + if (len) { + if (len >= tp->t_maxseg) + goto send; + /* + * NOTE! on localhost connections an 'ack' from the remote + * end may occur synchronously with the output and cause + * us to flush a buffer queued with moretocome. XXX + * + * note: the len + off check is almost certainly unnecessary. + */ + if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ + (idle || (tp->t_flags & TF_NODELAY)) && + len + off >= so->so_snd.sb_cc && + (tp->t_flags & TF_NOPUSH) == 0) { + goto send; + } + if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ + goto send; + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + goto send; + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ + goto send; + if (sack_rxmit) + goto send; + } + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then we want to send a window update to peer.
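(A worked illustration of the thresholds just stated, with invented numbers: for t_maxseg = 1460, a window update is sent once the window can reopen by adv >= 2 * 1460 = 2920 bytes; alternatively, for a socket with sb_hiwat = 65536, once 2 * adv >= 65536, i.e. the application has drained half the receive buffer.)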
+ * Skip this if the connection is in T/TCP half-open state. + * Don't send pure window updates when the peer has closed + * the connection and won't ever send more data. + */ + if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && + !TCPS_HAVERCVDFIN(tp->t_state)) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW + * is also a catch-all for the retransmit timer timeout case. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + /* + * In SACK, it is possible for tcp_output to fail to send a segment + * after the retransmission timer has been turned off. Make sure + * that the retransmission timer is set. + */ + if ((tp->t_flags & TF_SACK_PERMIT) && + SEQ_GT(tp->snd_max, tp->snd_una) && + !tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + goto just_return; + } + /* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * tcp_timer_active(tp, TT_PERSIST) + * is true when we are in persist state. + * (tp->t_flags & TF_FORCEDATA) + * is set when we are called to send a persist packet. + * tcp_timer_active(tp, TT_REXMT) + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ + if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) { + tp->t_rxtshift = 0; + tcp_setpersist(tp); + } + + /* + * No reason to send a segment, just return. + */ +just_return: + SOCKBUF_UNLOCK(&so->so_snd); + return (0); + +send: + SOCKBUF_LOCK_ASSERT(&so->so_snd); + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. + * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES + */ + optlen = 0; +#ifdef INET6 + if (isipv6) + hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + else +#endif + hdrlen = sizeof (struct tcpiphdr); + + /* + * Compute options for segment. + * We only have to care about SYN and established connection + * segments. Options for SYN-ACK segments are handled in TCP + * syncache. + */ + if ((tp->t_flags & TF_NOOPT) == 0) { + to.to_flags = 0; + /* Maximum segment size. 
*/ + if (flags & TH_SYN) { + tp->snd_nxt = tp->iss; + to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); + to.to_flags |= TOF_MSS; + } + /* Window scaling. */ + if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { + to.to_wscale = tp->request_r_scale; + to.to_flags |= TOF_SCALE; + } + /* Timestamps. */ + if ((tp->t_flags & TF_RCVD_TSTMP) || + ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { + to.to_tsval = ticks + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags |= TOF_TS; + /* Set receive buffer autosizing timestamp. */ + if (tp->rfbuf_ts == 0 && + (so->so_rcv.sb_flags & SB_AUTOSIZE)) + tp->rfbuf_ts = ticks; + } + /* Selective ACK's. */ + if (tp->t_flags & TF_SACK_PERMIT) { + if (flags & TH_SYN) + to.to_flags |= TOF_SACKPERM; + else if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && + tp->rcv_numsacks > 0) { + to.to_flags |= TOF_SACK; + to.to_nsacks = tp->rcv_numsacks; + to.to_sacks = (u_char *)tp->sackblks; + } + } +#ifdef TCP_SIGNATURE + /* TCP-MD5 (RFC2385). */ + if (tp->t_flags & TF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif /* TCP_SIGNATURE */ + + /* Processing the options. */ + hdrlen += optlen = tcp_addoptions(&to, opt); + } + +#ifdef INET6 + if (isipv6) + ipoptlen = ip6_optlen(tp->t_inpcb); + else +#endif + if (tp->t_inpcb->inp_options) + ipoptlen = tp->t_inpcb->inp_options->m_len - + offsetof(struct ipoption, ipopt_list); + else + ipoptlen = 0; +#ifdef IPSEC + ipoptlen += ipsec_optlen; +#endif + + /* + * Adjust data length if insertion of options will + * bump the packet length beyond the t_maxopd length. + * Clear the FIN bit because we cut off the tail of + * the segment. + * + * When doing TSO limit a burst to TCP_MAXWIN minus the + * IP, TCP and Options length to keep ip->ip_len from + * overflowing. Prevent the last segment from being + * fractional thus making them all equal sized and set + * the flag to continue sending. TSO is disabled when + * IP options or IPSEC are present. + */ + if (len + optlen + ipoptlen > tp->t_maxopd) { + flags &= ~TH_FIN; + if (tso) { + if (len > TCP_MAXWIN - hdrlen - optlen) { + len = TCP_MAXWIN - hdrlen - optlen; + len = len - (len % (tp->t_maxopd - optlen)); + sendalot = 1; + } else if (tp->t_flags & TF_NEEDFIN) + sendalot = 1; + } else { + len = tp->t_maxopd - optlen - ipoptlen; + sendalot = 1; + } + } + +/*#ifdef DIAGNOSTIC*/ +#ifdef INET6 + if (max_linkhdr + hdrlen > MCLBYTES) +#else + if (max_linkhdr + hdrlen > MHLEN) +#endif + panic("tcphdr too big"); +/*#endif*/ + + /* + * This KASSERT is here to catch edge cases at a well defined place. + * Before, those had triggered (random) panic conditions further down. + */ + KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); + + /* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ + if (len) { + struct mbuf *mb; + u_int moff; + + if ((tp->t_flags & TF_FORCEDATA) && len == 1) + TCPSTAT_INC(tcps_sndprobe); + else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { + TCPSTAT_INC(tcps_sndrexmitpack); + TCPSTAT_ADD(tcps_sndrexmitbyte, len); + } else { + TCPSTAT_INC(tcps_sndpack); + TCPSTAT_ADD(tcps_sndbyte, len); + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, off, + (int)len, max_linkhdr + hdrlen)) == 0) { + SOCKBUF_UNLOCK(&so->so_snd); + error = ENOBUFS; + goto out; + } + /* + * m_copypack left space for our hdr; use it. 
+ */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + SOCKBUF_UNLOCK(&so->so_snd); + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + SOCKBUF_UNLOCK(&so->so_snd); + m_freem(m); + error = ENOBUFS; + goto out; + } + } +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + + /* + * Start the m_copy functions from the closest mbuf + * to the offset in the socket buffer chain. + */ + mb = sbsndptr(&so->so_snd, off, len, &moff); + + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(mb, moff, (int)len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; + } else { + m->m_next = m_copy(mb, moff, (int)len); + if (m->m_next == NULL) { + SOCKBUF_UNLOCK(&so->so_snd); + (void) m_free(m); + error = ENOBUFS; + goto out; + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (off + len == so->so_snd.sb_cc) + flags |= TH_PUSH; + SOCKBUF_UNLOCK(&so->so_snd); + } else { + SOCKBUF_UNLOCK(&so->so_snd); + if (tp->t_flags & TF_ACKNOW) + TCPSTAT_INC(tcps_sndacks); + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + TCPSTAT_INC(tcps_sndctrl); + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + TCPSTAT_INC(tcps_sndurg); + else + TCPSTAT_INC(tcps_sndwinup); + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && + MHLEN >= hdrlen) { + MH_ALIGN(m, hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + } + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef MAC + mac_inpcb_create_mbuf(tp->t_inpcb, m); +#endif +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(tp->t_inpcb, ip6, th); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + ipov = (struct ipovly *)ip; + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(tp->t_inpcb, ip, th); + } + + /* + * Fill in fields, remembering maximum advertised + * window for use in delaying messages about window sizes. + * If resending a FIN, be sure not to use a new sequence number. + */ + if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && + tp->snd_nxt == tp->snd_max) + tp->snd_nxt--; + /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + if (tp->t_rxtshift >= 1) { + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) + flags |= TH_ECE|TH_CWR; + } else + flags |= TH_ECE|TH_CWR; + } + + if (tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & TF_ECN_PERMIT)) { + /* + * If the peer has ECN, mark data packets with + * ECN capable transmission (ECT). + * Ignore pure ack packets, retransmissions and window probes. + */ + if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && + !((tp->t_flags & TF_FORCEDATA) && len == 1)) { +#ifdef INET6 + if (isipv6) + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + else +#endif + ip->ip_tos |= IPTOS_ECN_ECT0; + TCPSTAT_INC(tcps_ecn_ect0); + } + + /* + * Reply with proper ECN notifications. 
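A brief note on the ECT marking above, added for illustration: IPTOS_ECN_ECT0 is 0x02, the low two bits of the TOS/traffic-class octet. For IPv4 it is OR'd into ip_tos directly; for IPv6 the traffic class occupies bits 20-27 of the 32-bit ip6_flow word, so the same constant is shifted left by 20 before being folded in:

	/* IPv4: ECT(0) lives in the low bits of the TOS byte. */
	ip->ip_tos |= IPTOS_ECN_ECT0;			/* 0x02 */
	/* IPv6: the same two bits, placed in the traffic-class field. */
	ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);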
+ */ + if (tp->t_flags & TF_ECN_SND_CWR) { + flags |= TH_CWR; + tp->t_flags &= ~TF_ECN_SND_CWR; + } + if (tp->t_flags & TF_ECN_SND_ECE) + flags |= TH_ECE; + } + + /* + * If we are doing retransmissions, then snd_nxt will + * not reflect the first unsent octet. For ACK only + * packets, we do not want the sequence number of the + * retransmitted packet, we want the sequence number + * of the next unsent octet. So, if there is no data + * (and no SYN or FIN), use snd_max instead of snd_nxt + * when filling in ti_seq. But if we are in persist + * state, snd_max might reflect one byte beyond the + * right edge of the window, so use snd_nxt in that + * case, since we know we aren't doing a retransmission. + * (retransmit and persist are mutually exclusive...) + */ + if (sack_rxmit == 0) { + if (len || (flags & (TH_SYN|TH_FIN)) || + tcp_timer_active(tp, TT_PERSIST)) + th->th_seq = htonl(tp->snd_nxt); + else + th->th_seq = htonl(tp->snd_max); + } else { + th->th_seq = htonl(p->rxmit); + p->rxmit += len; + tp->sackhint.sack_bytes_rexmit += len; + } + th->th_ack = htonl(tp->rcv_nxt); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; + } + th->th_flags = flags; + /* + * Calculate receive window. Don't shrink window, + * but avoid silly window syndrome. + */ + if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && + recwin < (long)tp->t_maxseg) + recwin = 0; + if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) + recwin = (long)TCP_MAXWIN << tp->rcv_scale; + + /* + * According to RFC1323 the window field in a SYN (i.e., a <SYN> + * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> + * case is handled in syncache. + */ + if (flags & TH_SYN) + th->th_win = htons((u_short) + (min(sbspace(&so->so_rcv), TCP_MAXWIN))); + else + th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); + + /* + * Adjust the RXWIN0SENT flag - indicate that we have advertised + * a 0 window. This may cause the remote transmitter to stall. This + * flag tells soreceive() to disable delayed acknowledgements when + * draining the buffer. This can occur if the receiver is attempting + * to read more data than can be buffered prior to transmitting on + * the connection. + */ + if (th->th_win == 0) + tp->t_flags |= TF_RXWIN0SENT; + else + tp->t_flags &= ~TF_RXWIN0SENT; + if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { + th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); + th->th_flags |= TH_URG; + } else + /* + * If no urgent pointer to send, then we pull + * the urgent pointer to the left edge of the send window + * so that it doesn't drift into the send window on sequence + * number wraparound. + */ + tp->snd_up = tp->snd_una; /* drag it along */ + +#ifdef TCP_SIGNATURE + if (tp->t_flags & TF_SIGNATURE) { + int sigoff = to.to_signature - opt; + tcp_signature_compute(m, 0, len, optlen, + (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); + } +#endif + + /* + * Put TCP length in extended header, and then + * checksum extended header and data. + */ + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ +#ifdef INET6 + if (isipv6) + /* + * ip6_plen does not need to be filled in now, and will be filled + * in ip6_output.
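For example (invented values): with a 256 kB receive buffer and rcv_scale = 3, an established-state segment advertises htons(262144 >> 3) = 32768, which the peer scales back up to 256 kB; the SYN itself instead carries min(sbspace, 65535) unscaled, as RFC 1323 requires.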
+ */ + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen + len); + else +#endif /* INET6 */ + { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); + + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } + + /* + * Enable TSO and specify the size of the segments. + * The TCP pseudo header checksum is always provided. + * XXX: Fixme: This is currently not the case for IPv6. + */ + if (tso) { + KASSERT(len > tp->t_maxopd - optlen, + ("%s: len <= tso_segsz", __func__)); + m->m_pkthdr.csum_flags |= CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; + } + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max. + */ + if ((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { + if (flags & TH_SYN) + tp->snd_nxt++; + if (flags & TH_FIN) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } + } + if (sack_rxmit) + goto timer; + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + TCPSTAT_INC(tcps_segstimed); + } + } + + /* + * Set retransmit timer if not currently set, + * and not doing a pure ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. + */ +timer: + if (!tcp_timer_active(tp, TT_REXMT) && + ((sack_rxmit && tp->snd_nxt != tp->snd_max) || + (tp->snd_nxt != tp->snd_una))) { + if (tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; + } + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } + } else { + /* + * Persist case, update snd_max but since we are in + * persist mode (no window) we do not update snd_nxt. + */ + int xlen = len; + if (flags & TH_SYN) + ++xlen; + if (flags & TH_FIN) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + } + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) { + u_short save = 0; +#ifdef INET6 + if (!isipv6) +#endif + { + save = ipov->ih_len; + ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); + } + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); +#ifdef INET6 + if (!isipv6) +#endif + ipov->ih_len = save; + } +#endif + + /* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ + /* + * m->m_pkthdr.len should have been set before the cksum calculation, + * because in6_cksum() needs it. + */ +#ifdef INET6 + if (isipv6) { + /* + * We set the hoplimit separately for every segment, since the + * user might want to change the value via setsockopt. + * Also, desired default hop limit might be changed via + * Neighbor Discovery.
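(To make the TSO arithmetic above concrete, with invented values: for t_maxopd = 1460 and 12 bytes of timestamp options on an established connection, tso_segsz = 1460 - 12 = 1448, so a 60000-byte chain handed down with CSUM_TSO is cut by the hardware into ceil(60000 / 1448) = 42 wire segments, each reusing the prototype TCP/IP header built here.)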
+ */ + ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); + + /* TODO: IPv6 IP6TOS_ECT bit on */ + error = ip6_output(m, + tp->t_inpcb->in6p_outputopts, NULL, + ((so->so_options & SO_DONTROUTE) ? + IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); + } else +#endif /* INET6 */ + { + ip->ip_len = m->m_pkthdr.len; +#ifdef INET6 + if (tp->t_inpcb->inp_vflag & INP_IPV6PROTO) + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); +#endif /* INET6 */ + /* + * If we do path MTU discovery, then we set DF on every packet. + * This might not be the best thing to do according to RFC3390 + * Section 2. However, the tcp hostcache mitigates the problem + * so it affects only the first tcp connection with a host. + * + * NB: Don't set DF on small MTU/MSS to have a safe fallback. + */ + if (V_path_mtu_discovery && tp->t_maxopd > V_tcp_minmss) + ip->ip_off |= IP_DF; + + error = ip_output(m, tp->t_inpcb->inp_options, NULL, + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, + tp->t_inpcb); + } + if (error) { + + /* + * We know that the packet was lost, so back out the + * sequence number advance, if any. + * + * If the error is EPERM the packet got blocked by the + * local firewall. Normally we should terminate the + * connection but the blocking may have been spurious + * due to a firewall reconfiguration cycle. So we treat + * it like a packet loss and let the retransmit timer and + * timeouts do their work over time. + * XXX: It is a POLA question whether calling tcp_drop right + * away would really be the correct behavior instead. + */ + if (((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) && + ((flags & TH_SYN) == 0) && + (error != EPERM)) { + if (sack_rxmit) { + p->rxmit -= len; + tp->sackhint.sack_bytes_rexmit -= len; + KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, + ("sackhint bytes rtx >= 0")); + } else + tp->snd_nxt -= len; + } +out: + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ + switch (error) { + case EPERM: + tp->t_softerror = error; + return (error); + case ENOBUFS: + if (!tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tp->snd_cwnd = tp->t_maxseg; + return (0); + case EMSGSIZE: + /* + * For some reason the interface we used initially + * to send segments changed to another or lowered + * its MTU. + * + * tcp_mtudisc() will find out the new MTU and as + * its last action, initiate retransmission, so it + * is important to not do so here. + * + * If TSO was active we either got an interface + * without TSO capabilities or TSO was turned off. + * Disable it for this connection too and + * immediately retry with MSS sized segments generated + * by this function. + */ + if (tso) + tp->t_flags &= ~TF_TSO; + tcp_mtudisc(tp->t_inpcb, 0); + return (0); + case EHOSTDOWN: + case EHOSTUNREACH: + case ENETDOWN: + case ENETUNREACH: + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); + } + /* FALLTHROUGH */ + default: + return (error); + } + } + TCPSTAT_INC(tcps_sndtotal); + + /* + * Data sent (as far as we can tell). + * If this advertises a larger window than any other segment, + * then remember the size of the advertised window. + * Any pending ACK has now been sent.
+ */ + if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + recwin; + tp->last_ack_sent = tp->rcv_nxt; + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + if (tcp_timer_active(tp, TT_DELACK)) + tcp_timer_activate(tp, TT_DELACK, 0); +#if 0 + /* + * This completely breaks TCP if newreno is turned on. What happens + * is that if delayed-acks are turned on on the receiver, this code + * on the transmitter effectively destroys the TCP window, forcing + * it to four packets (1.5Kx4 = 6K window). + */ + if (sendalot && (!V_tcp_do_newreno || --maxburst)) + goto again; +#endif + if (sendalot) + goto again; + return (0); +} + +void +tcp_setpersist(struct tcpcb *tp) +{ + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int tt; + + if (tcp_timer_active(tp, TT_REXMT)) + panic("tcp_setpersist: retransmit pending"); + /* + * Start/restart persistance timer. + */ + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + tcp_timer_activate(tp, TT_PERSIST, tt); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; +} + +/* + * Insert TCP options according to the supplied parameters to the place + * optp in a consistent way. Can handle unaligned destinations. + * + * The order of the option processing is crucial for optimal packing and + * alignment for the scarce option space. + * + * The optimal order for a SYN/SYN-ACK segment is: + * MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) + + * Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40. + * + * The SACK options should be last. SACK blocks consume 8*n+2 bytes. + * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks). + * At minimum we need 10 bytes (to generate 1 SACK block). If both + * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present, + * we only have 10 bytes for SACK options (40 - (12 + 18)). 
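A hedged usage sketch of the routine defined next (the function name and values here are invented; tcp_output() above performs the real equivalent): building the option block for an outgoing SYN that carries MSS, window scale and timestamps:

static int
example_syn_options(u_char opt[TCP_MAXOLEN])
{
	struct tcpopt to;

	bzero(&to, sizeof(to));
	to.to_mss = 1460;
	to.to_wscale = 6;
	to.to_tsval = 100;	/* normally ticks + ts_offset */
	to.to_tsecr = 0;
	to.to_flags = TOF_MSS | TOF_SCALE | TOF_TS;

	/* Emits MSS(4) + NOP(1) + WS(3) + NOP + NOP + TS(10) = 20 bytes. */
	return (tcp_addoptions(&to, opt));
}

The interleaved NOPs come from the alignment loops inside tcp_addoptions(), which place the window-scale option at an odd offset and the timestamp option at offset 2 modulo 4, exactly as the packing comment above describes.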
+ */ +int +tcp_addoptions(struct tcpopt *to, u_char *optp) +{ + u_int mask, optlen = 0; + + for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) { + if ((to->to_flags & mask) != mask) + continue; + if (optlen == TCP_MAXOLEN) + break; + switch (to->to_flags & mask) { + case TOF_MSS: + while (optlen % 4) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG) + continue; + optlen += TCPOLEN_MAXSEG; + *optp++ = TCPOPT_MAXSEG; + *optp++ = TCPOLEN_MAXSEG; + to->to_mss = htons(to->to_mss); + bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss)); + optp += sizeof(to->to_mss); + break; + case TOF_SCALE: + while (!optlen || optlen % 2 != 1) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW) + continue; + optlen += TCPOLEN_WINDOW; + *optp++ = TCPOPT_WINDOW; + *optp++ = TCPOLEN_WINDOW; + *optp++ = to->to_wscale; + break; + case TOF_SACKPERM: + while (optlen % 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED) + continue; + optlen += TCPOLEN_SACK_PERMITTED; + *optp++ = TCPOPT_SACK_PERMITTED; + *optp++ = TCPOLEN_SACK_PERMITTED; + break; + case TOF_TS: + while (!optlen || optlen % 4 != 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP) + continue; + optlen += TCPOLEN_TIMESTAMP; + *optp++ = TCPOPT_TIMESTAMP; + *optp++ = TCPOLEN_TIMESTAMP; + to->to_tsval = htonl(to->to_tsval); + to->to_tsecr = htonl(to->to_tsecr); + bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval)); + optp += sizeof(to->to_tsval); + bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr)); + optp += sizeof(to->to_tsecr); + break; + case TOF_SIGNATURE: + { + int siglen = TCPOLEN_SIGNATURE - 2; + + while (!optlen || optlen % 4 != 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) + continue; + optlen += TCPOLEN_SIGNATURE; + *optp++ = TCPOPT_SIGNATURE; + *optp++ = TCPOLEN_SIGNATURE; + to->to_signature = optp; + while (siglen--) + *optp++ = 0; + break; + } + case TOF_SACK: + { + int sackblks = 0; + struct sackblk *sack = (struct sackblk *)to->to_sacks; + tcp_seq sack_seq; + + while (!optlen || optlen % 4 != 2) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK) + continue; + optlen += TCPOLEN_SACKHDR; + *optp++ = TCPOPT_SACK; + sackblks = min(to->to_nsacks, + (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); + *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; + while (sackblks--) { + sack_seq = htonl(sack->start); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + sack_seq = htonl(sack->end); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + optlen += TCPOLEN_SACK; + sack++; + } + TCPSTAT_INC(tcps_sack_send_blocks); + break; + } + default: + panic("%s: unknown TCP option type", __func__); + break; + } + } + + /* Terminate and pad TCP options to a 4 byte boundary. */ + if (optlen % 4) { + optlen += TCPOLEN_EOL; + *optp++ = TCPOPT_EOL; + } + /* + * According to RFC 793 (STD0007): + * "The content of the header beyond the End-of-Option option + * must be header padding (i.e., zero)." + * and later: "The padding is composed of zeros." 
+ */ + while (optlen % 4) { + optlen += TCPOLEN_PAD; + *optp++ = TCPOPT_PAD; + } + + KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); + return (optlen); +} diff --git a/freebsd/sys/netinet/tcp_reass.c b/freebsd/sys/netinet/tcp_reass.c new file mode 100644 index 00000000..aea58740 --- /dev/null +++ b/freebsd/sys/netinet/tcp_reass.c @@ -0,0 +1,335 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ + +static int tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS); +static int tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS); + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, + "TCP Segment Reassembly Queue"); + +static VNET_DEFINE(int, tcp_reass_maxseg) = 0; +#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg) +SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, + &VNET_NAME(tcp_reass_maxseg), 0, &tcp_reass_sysctl_maxseg, "I", + "Global maximum number of TCP Segments in Reassembly Queue"); + +static VNET_DEFINE(int, tcp_reass_qsize) = 0; +#define V_tcp_reass_qsize VNET(tcp_reass_qsize) +SYSCTL_VNET_PROC(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, + &VNET_NAME(tcp_reass_qsize), 0, &tcp_reass_sysctl_qsize, "I", + "Global number of TCP Segments currently in Reassembly Queue"); + +static VNET_DEFINE(int, tcp_reass_overflows) = 0; +#define V_tcp_reass_overflows VNET(tcp_reass_overflows) +SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, + &VNET_NAME(tcp_reass_overflows), 0, + "Global number of TCP Segment Reassembly Queue Overflows"); + +static VNET_DEFINE(uma_zone_t, tcp_reass_zone); +#define V_tcp_reass_zone VNET(tcp_reass_zone) + +/* Initialize TCP reassembly queue */ +static void +tcp_reass_zone_change(void *tag) +{ + + V_tcp_reass_maxseg = nmbclusters / 16; + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); +} + +void +tcp_reass_init(void) +{ + + V_tcp_reass_maxseg = nmbclusters / 16; + TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", + &V_tcp_reass_maxseg); + V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_tcp_reass_zone, V_tcp_reass_maxseg); + EVENTHANDLER_REGISTER(nmbclusters_change, + tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); +} + +#ifdef VIMAGE +void +tcp_reass_destroy(void) +{ + + uma_zdestroy(V_tcp_reass_zone); +} +#endif + +void +tcp_reass_flush(struct tcpcb *tp) +{ + struct tseg_qent *qe; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) { + LIST_REMOVE(qe, tqe_q); + m_freem(qe->tqe_m); + uma_zfree(V_tcp_reass_zone, qe); + tp->t_segqlen--; + } + + KASSERT((tp->t_segqlen == 0), + ("TCP reass queue %p segment count is %d instead of 0 after flush.", + tp, tp->t_segqlen)); +} + +static int +tcp_reass_sysctl_maxseg(SYSCTL_HANDLER_ARGS) +{ + V_tcp_reass_maxseg = uma_zone_get_max(V_tcp_reass_zone); + return (sysctl_handle_int(oidp, arg1, arg2, req)); +} + +static int +tcp_reass_sysctl_qsize(SYSCTL_HANDLER_ARGS) +{ + V_tcp_reass_qsize = uma_zone_get_cur(V_tcp_reass_zone); + return (sysctl_handle_int(oidp, arg1, arg2, req)); +} + +int +tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) +{ + struct tseg_qent *q; + struct tseg_qent *p = NULL; + struct tseg_qent *nq; + struct tseg_qent *te = NULL; + struct socket *so = tp->t_inpcb->inp_socket; + int flags; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * XXX: tcp_reass() is rather inefficient with its data structures + * and 
should be rewritten (see NetBSD for optimizations). + */ + + /* + * Call with th==NULL after become established to + * force pre-ESTABLISHED data up to user socket. + */ + if (th == NULL) + goto present; + + /* + * Limit the number of segments that can be queued to reduce the + * potential for mbuf exhaustion. For best performance, we want to be + * able to queue a full window's worth of segments. The size of the + * socket receive buffer determines our advertised window and grows + * automatically when socket buffer autotuning is enabled. Use it as the + * basis for our queue limit. + * Always let the missing segment through which caused this queue. + * NB: Access to the socket buffer is left intentionally unlocked as we + * can tolerate stale information here. + * + * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat + * should work but causes packets to be dropped when they shouldn't. + * Investigate why and re-evaluate the below limit after the behaviour + * is understood. + */ + if (th->th_seq != tp->rcv_nxt && + tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + V_tcp_reass_overflows++; + TCPSTAT_INC(tcps_rcvmemdrop); + m_freem(m); + *tlenp = 0; + return (0); + } + + /* + * Allocate a new queue entry. If we can't, or hit the zone limit + * just drop the pkt. + */ + te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT); + if (te == NULL) { + TCPSTAT_INC(tcps_rcvmemdrop); + m_freem(m); + *tlenp = 0; + return (0); + } + tp->t_segqlen++; + + /* + * Find a segment which begins after this one does. + */ + LIST_FOREACH(q, &tp->t_segq, tqe_q) { + if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + break; + p = q; + } + + /* + * If there is a preceding segment, it may provide some of + * our data already. If so, drop the data from the incoming + * segment. If it provides all of our data, drop us. + */ + if (p != NULL) { + int i; + /* conversion to int (in i) handles seq wraparound */ + i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + if (i > 0) { + if (i >= *tlenp) { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); + m_freem(m); + uma_zfree(V_tcp_reass_zone, te); + tp->t_segqlen--; + /* + * Try to present any queued data + * at the left window edge to the user. + * This is needed after the 3-WHS + * completes. + */ + goto present; /* ??? */ + } + m_adj(m, i); + *tlenp -= i; + th->th_seq += i; + } + } + TCPSTAT_INC(tcps_rcvoopack); + TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); + + /* + * While we overlap succeeding segments trim them or, + * if they are completely covered, dequeue them. + */ + while (q) { + int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + if (i <= 0) + break; + if (i < q->tqe_len) { + q->tqe_th->th_seq += i; + q->tqe_len -= i; + m_adj(q->tqe_m, i); + break; + } + + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + m_freem(q->tqe_m); + uma_zfree(V_tcp_reass_zone, q); + tp->t_segqlen--; + q = nq; + } + + /* Insert the new segment queue entry into place. */ + te->tqe_m = m; + te->tqe_th = th; + te->tqe_len = *tlenp; + + if (p == NULL) { + LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + } else { + LIST_INSERT_AFTER(p, te, tqe_q); + } + +present: + /* + * Present data to user, advancing rcv_nxt through + * completed sequence space. 
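
The trimming arithmetic above depends on the conversion to a signed int: sequence numbers wrap modulo 2^32, so the signed difference stays meaningful across the wrap point. A minimal userspace sketch of the same prefix-trim step (standalone C with illustrative values, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* Signed difference handles 32-bit sequence wraparound, as in tcp_seq.h. */
static int
seq_diff(uint32_t a, uint32_t b)
{

	return ((int)(a - b));
}

int
main(void)
{
	/* A queued segment that straddles the wrap point. */
	uint32_t p_seq = 0xfffffff0u;
	int p_len = 32;				/* covers up to 0x00000010 */

	/* The incoming segment overlaps its tail. */
	uint32_t th_seq = 0x00000008u;
	int tlen = 24;

	/* i > 0: the queued segment already holds our first i bytes. */
	int i = seq_diff(p_seq + (uint32_t)p_len, th_seq);

	if (i > 0) {
		if (i >= tlen) {
			printf("complete duplicate, drop\n");
			return (0);
		}
		th_seq += (uint32_t)i;		/* trim, as m_adj(m, i) does */
		tlen -= i;
	}
	printf("keep %d bytes from 0x%08x\n", tlen, (unsigned)th_seq);
	return (0);
}
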
+ */ + if (!TCPS_HAVEESTABLISHED(tp->t_state)) + return (0); + q = LIST_FIRST(&tp->t_segq); + if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + return (0); + SOCKBUF_LOCK(&so->so_rcv); + do { + tp->rcv_nxt += q->tqe_len; + flags = q->tqe_th->th_flags & TH_FIN; + nq = LIST_NEXT(q, tqe_q); + LIST_REMOVE(q, tqe_q); + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) + m_freem(q->tqe_m); + else + sbappendstream_locked(&so->so_rcv, q->tqe_m); + uma_zfree(V_tcp_reass_zone, q); + tp->t_segqlen--; + q = nq; + } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + ND6_HINT(tp); + sorwakeup_locked(so); + return (flags); +} diff --git a/freebsd/sys/netinet/tcp_sack.c b/freebsd/sys/netinet/tcp_sack.c new file mode 100644 index 00000000..94bae57b --- /dev/null +++ b/freebsd/sys/netinet/tcp_sack.c @@ -0,0 +1,687 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 + */ + +/*- + * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 + * + * NRL grants permission for redistribution and use in source and binary + * forms, with or without modification, of the software and documentation + * created at NRL provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgements: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * This product includes software developed at the Information + * Technology Division, US Naval Research Laboratory. + * 4. 
Neither the name of the NRL nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS + * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation + * are those of the authors and should not be interpreted as representing + * official policies, either expressed or implied, of the US Naval + * Research Laboratory (NRL). + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* for proc0 declaration */ +#include +#include +#include +#include +#include + +#include /* before tcp_seq.h, for tcp_random18() */ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ + +#include + +VNET_DECLARE(struct uma_zone *, sack_hole_zone); +#define V_sack_hole_zone VNET(sack_hole_zone) + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); +VNET_DEFINE(int, tcp_do_sack) = 1; +#define V_tcp_do_sack VNET(tcp_do_sack) +SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, + &VNET_NAME(tcp_do_sack), 0, "Enable/Disable TCP SACK support"); + +VNET_DEFINE(int, tcp_sack_maxholes) = 128; +#define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) +SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, + &VNET_NAME(tcp_sack_maxholes), 0, + "Maximum number of TCP SACK holes allowed per connection"); + +VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536; +#define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) +SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_RW, + &VNET_NAME(tcp_sack_globalmaxholes), 0, + "Global maximum number of TCP SACK holes"); + +VNET_DEFINE(int, tcp_sack_globalholes) = 0; +#define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) +SYSCTL_VNET_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_RD, + &VNET_NAME(tcp_sack_globalholes), 0, + "Global number of TCP SACK holes currently allocated"); + +/* + * This function is called upon receipt of new valid data (while not in + * header prediction mode), and it updates the ordered list of sacks. + */ +void +tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) +{ + /* + * First reported block MUST be the most recent one. Subsequent + * blocks SHOULD be in the order in which they arrived at the + * receiver. These two conditions make the implementation fully + * compliant with RFC 2018. 
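
A hedged userspace sketch of that merge step follows. The sackblk layout and SEQ_* macros are restated locally, and unlike the kernel function the sketch unconditionally reports the new block first, eliding the in-order case that the kernel handles separately:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_SACK_BLKS	6
#define SEQ_LT(a, b)	((int)((a)-(b)) < 0)
#define SEQ_LEQ(a, b)	((int)((a)-(b)) <= 0)
#define SEQ_GT(a, b)	((int)((a)-(b)) > 0)
#define SEQ_GEQ(a, b)	((int)((a)-(b)) >= 0)

struct sackblk {
	uint32_t start;
	uint32_t end;
};

/*
 * Merge a newly received block into the reported list: the new block
 * becomes blks[0]; older blocks that touch it are absorbed into it,
 * the rest keep their arrival order (RFC 2018 receiver behaviour).
 * Returns the new block count.
 */
static int
sack_update(struct sackblk *blks, int n, uint32_t rcv_nxt,
    uint32_t rcv_start, uint32_t rcv_end)
{
	struct sackblk head = { rcv_start, rcv_end }, saved[MAX_SACK_BLKS];
	int i, nsaved = 0;

	for (i = 0; i < n; i++) {
		uint32_t s = blks[i].start, e = blks[i].end;

		if (SEQ_GEQ(s, e) || SEQ_LEQ(s, rcv_nxt))
			continue;			/* stale, drop */
		if (SEQ_LEQ(head.start, e) && SEQ_GEQ(head.end, s)) {
			if (SEQ_GT(head.start, s))	/* absorb overlap */
				head.start = s;
			if (SEQ_LT(head.end, e))
				head.end = e;
		} else
			saved[nsaved++] = blks[i];	/* keep unchanged */
	}
	if (nsaved >= MAX_SACK_BLKS)			/* drop the oldest */
		nsaved = MAX_SACK_BLKS - 1;
	blks[0] = head;
	memcpy(&blks[1], saved, (size_t)nsaved * sizeof(saved[0]));
	return (nsaved + 1);
}

int
main(void)
{
	struct sackblk blks[MAX_SACK_BLKS] = { { 200, 300 } };
	int n = sack_update(blks, 1, 100, 300, 400);

	/* The adjacent blocks coalesce into one: "1 block(s), [200, 400)". */
	printf("%d block(s), [%u, %u)\n", n, blks[0].start, blks[0].end);
	return (0);
}
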
+ */ + struct sackblk head_blk, saved_blks[MAX_SACK_BLKS]; + int num_head, num_saved, i; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* Check arguments. */ + KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end")); + + /* SACK block for the received segment. */ + head_blk.start = rcv_start; + head_blk.end = rcv_end; + + /* + * Merge updated SACK blocks into head_blk, and save unchanged SACK + * blocks into saved_blks[]. num_saved will have the number of the + * saved SACK blocks. + */ + num_saved = 0; + for (i = 0; i < tp->rcv_numsacks; i++) { + tcp_seq start = tp->sackblks[i].start; + tcp_seq end = tp->sackblks[i].end; + if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) { + /* + * Discard this SACK block. + */ + } else if (SEQ_LEQ(head_blk.start, end) && + SEQ_GEQ(head_blk.end, start)) { + /* + * Merge this SACK block into head_blk. This SACK + * block itself will be discarded. + */ + if (SEQ_GT(head_blk.start, start)) + head_blk.start = start; + if (SEQ_LT(head_blk.end, end)) + head_blk.end = end; + } else { + /* + * Save this SACK block. + */ + saved_blks[num_saved].start = start; + saved_blks[num_saved].end = end; + num_saved++; + } + } + + /* + * Update SACK list in tp->sackblks[]. + */ + num_head = 0; + if (SEQ_GT(head_blk.start, tp->rcv_nxt)) { + /* + * The received data segment is an out-of-order segment. Put + * head_blk at the top of SACK list. + */ + tp->sackblks[0] = head_blk; + num_head = 1; + /* + * If the number of saved SACK blocks exceeds its limit, + * discard the last SACK block. + */ + if (num_saved >= MAX_SACK_BLKS) + num_saved--; + } + if (num_saved > 0) { + /* + * Copy the saved SACK blocks back. + */ + bcopy(saved_blks, &tp->sackblks[num_head], + sizeof(struct sackblk) * num_saved); + } + + /* Save the number of SACK blocks. */ + tp->rcv_numsacks = num_head + num_saved; +} + +/* + * Delete all receiver-side SACK information. + */ +void +tcp_clean_sackreport(struct tcpcb *tp) +{ + int i; + + INP_WLOCK_ASSERT(tp->t_inpcb); + tp->rcv_numsacks = 0; + for (i = 0; i < MAX_SACK_BLKS; i++) + tp->sackblks[i].start = tp->sackblks[i].end=0; +} + +/* + * Allocate struct sackhole. + */ +static struct sackhole * +tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end) +{ + struct sackhole *hole; + + if (tp->snd_numholes >= V_tcp_sack_maxholes || + V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) { + TCPSTAT_INC(tcps_sack_sboverflow); + return NULL; + } + + hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT); + if (hole == NULL) + return NULL; + + hole->start = start; + hole->end = end; + hole->rxmit = start; + + tp->snd_numholes++; + atomic_add_int(&V_tcp_sack_globalholes, 1); + + return hole; +} + +/* + * Free struct sackhole. + */ +static void +tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole) +{ + + uma_zfree(V_sack_hole_zone, hole); + + tp->snd_numholes--; + atomic_subtract_int(&V_tcp_sack_globalholes, 1); + + KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0")); + KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0")); +} + +/* + * Insert new SACK hole into scoreboard. + */ +static struct sackhole * +tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end, + struct sackhole *after) +{ + struct sackhole *hole; + + /* Allocate a new SACK hole. */ + hole = tcp_sackhole_alloc(tp, start, end); + if (hole == NULL) + return NULL; + + /* Insert the new SACK hole into scoreboard. 
*/ + if (after != NULL) + TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink); + else + TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink); + + /* Update SACK hint. */ + if (tp->sackhint.nexthole == NULL) + tp->sackhint.nexthole = hole; + + return hole; +} + +/* + * Remove SACK hole from scoreboard. + */ +static void +tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole) +{ + + /* Update SACK hint. */ + if (tp->sackhint.nexthole == hole) + tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink); + + /* Remove this SACK hole. */ + TAILQ_REMOVE(&tp->snd_holes, hole, scblink); + + /* Free this SACK hole. */ + tcp_sackhole_free(tp, hole); +} + +/* + * Process cumulative ACK and the TCP SACK option to update the scoreboard. + * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of + * the sequence space). + */ +void +tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) +{ + struct sackhole *cur, *temp; + struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; + int i, j, num_sack_blks; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + num_sack_blks = 0; + /* + * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist, + * treat [SND.UNA, SEG.ACK) as if it is a SACK block. + */ + if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) { + sack_blocks[num_sack_blks].start = tp->snd_una; + sack_blocks[num_sack_blks++].end = th_ack; + } + /* + * Append received valid SACK blocks to sack_blocks[], but only if we + * received new blocks from the other side. + */ + if (to->to_flags & TOF_SACK) { + for (i = 0; i < to->to_nsacks; i++) { + bcopy((to->to_sacks + i * TCPOLEN_SACK), + &sack, sizeof(sack)); + sack.start = ntohl(sack.start); + sack.end = ntohl(sack.end); + if (SEQ_GT(sack.end, sack.start) && + SEQ_GT(sack.start, tp->snd_una) && + SEQ_GT(sack.start, th_ack) && + SEQ_LT(sack.start, tp->snd_max) && + SEQ_GT(sack.end, tp->snd_una) && + SEQ_LEQ(sack.end, tp->snd_max)) + sack_blocks[num_sack_blks++] = sack; + } + } + /* + * Return if SND.UNA is not advanced and no valid SACK block is + * received. + */ + if (num_sack_blks == 0) + return; + + /* + * Sort the SACK blocks so we can update the scoreboard with just one + * pass. The overhead of sorting upto 4+1 elements is less than + * making upto 4+1 passes over the scoreboard. + */ + for (i = 0; i < num_sack_blks; i++) { + for (j = i + 1; j < num_sack_blks; j++) { + if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { + sack = sack_blocks[i]; + sack_blocks[i] = sack_blocks[j]; + sack_blocks[j] = sack; + } + } + } + if (TAILQ_EMPTY(&tp->snd_holes)) + /* + * Empty scoreboard. Need to initialize snd_fack (it may be + * uninitialized or have a bogus value). Scoreboard holes + * (from the sack blocks received) are created later below + * (in the logic that adds holes to the tail of the + * scoreboard). + */ + tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack); + /* + * In the while-loop below, incoming SACK blocks (sack_blocks[]) and + * SACK holes (snd_holes) are traversed from their tails with just + * one pass in order to reduce the number of compares especially when + * the bandwidth-delay product is large. + * + * Note: Typically, in the first RTT of SACK recovery, the highest + * three or four SACK blocks with the same ack number are received. + * In the second RTT, if retransmitted data segments are not lost, + * the highest three or four SACK blocks with ack number advancing + * are received. 
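
The exchange sort used there is cheap because num_sack_blks is at most TCP_MAX_SACK + 1 = 5, as the comment notes. A standalone sketch showing the same sort ordering blocks correctly across a sequence-number wrap (values are illustrative):

#include <stdint.h>
#include <stdio.h>

#define SEQ_GT(a, b)	((int)((a)-(b)) > 0)

struct sackblk { uint32_t start, end; };

int
main(void)
{
	/* Ends straddle the wrap point: 0x...fff0 must sort before 0x10. */
	struct sackblk b[3] = {
		{ 0x00000002u, 0x00000010u },
		{ 0xffffffd0u, 0xfffffff0u },
		{ 0x00000020u, 0x00000030u },
	};
	int i, j, n = 3;

	/* Same O(n^2) exchange sort as tcp_sack_doack(); n is at most 5. */
	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (SEQ_GT(b[i].end, b[j].end)) {
				struct sackblk t = b[i];
				b[i] = b[j];
				b[j] = t;
			}
	for (i = 0; i < n; i++)
		printf("[0x%08x, 0x%08x)\n",
		    (unsigned)b[i].start, (unsigned)b[i].end);
	return (0);
}
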
+ */ + sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */ + if (SEQ_LT(tp->snd_fack, sblkp->start)) { + /* + * The highest SACK block is beyond fack. Append new SACK + * hole at the tail. If the second or later highest SACK + * blocks are also beyond the current fack, they will be + * inserted by way of hole splitting in the while-loop below. + */ + temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL); + if (temp != NULL) { + tp->snd_fack = sblkp->end; + /* Go to the previous sack block. */ + sblkp--; + } else { + /* + * We failed to add a new hole based on the current + * sack block. Skip over all the sack blocks that + * fall completely to the right of snd_fack and + * proceed to trim the scoreboard based on the + * remaining sack blocks. This also trims the + * scoreboard for th_ack (which is sack_blocks[0]). + */ + while (sblkp >= sack_blocks && + SEQ_LT(tp->snd_fack, sblkp->start)) + sblkp--; + if (sblkp >= sack_blocks && + SEQ_LT(tp->snd_fack, sblkp->end)) + tp->snd_fack = sblkp->end; + } + } else if (SEQ_LT(tp->snd_fack, sblkp->end)) + /* fack is advanced. */ + tp->snd_fack = sblkp->end; + /* We must have at least one SACK hole in scoreboard. */ + KASSERT(!TAILQ_EMPTY(&tp->snd_holes), + ("SACK scoreboard must not be empty")); + cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ + /* + * Since the incoming sack blocks are sorted, we can process them + * making one sweep of the scoreboard. + */ + while (sblkp >= sack_blocks && cur != NULL) { + if (SEQ_GEQ(sblkp->start, cur->end)) { + /* + * SACKs data beyond the current hole. Go to the + * previous sack block. + */ + sblkp--; + continue; + } + if (SEQ_LEQ(sblkp->end, cur->start)) { + /* + * SACKs data before the current hole. Go to the + * previous hole. + */ + cur = TAILQ_PREV(cur, sackhole_head, scblink); + continue; + } + tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start); + KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, + ("sackhint bytes rtx >= 0")); + if (SEQ_LEQ(sblkp->start, cur->start)) { + /* Data acks at least the beginning of hole. */ + if (SEQ_GEQ(sblkp->end, cur->end)) { + /* Acks entire hole, so delete hole. */ + temp = cur; + cur = TAILQ_PREV(cur, sackhole_head, scblink); + tcp_sackhole_remove(tp, temp); + /* + * The sack block may ack all or part of the + * next hole too, so continue onto the next + * hole. + */ + continue; + } else { + /* Move start of hole forward. */ + cur->start = sblkp->end; + cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); + } + } else { + /* Data acks at least the end of hole. */ + if (SEQ_GEQ(sblkp->end, cur->end)) { + /* Move end of hole backward. */ + cur->end = sblkp->start; + cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); + } else { + /* + * ACKs some data in middle of a hole; need + * to split current hole + */ + temp = tcp_sackhole_insert(tp, sblkp->end, + cur->end, cur); + if (temp != NULL) { + if (SEQ_GT(cur->rxmit, temp->rxmit)) { + temp->rxmit = cur->rxmit; + tp->sackhint.sack_bytes_rexmit + += (temp->rxmit + - temp->start); + } + cur->end = sblkp->start; + cur->rxmit = SEQ_MIN(cur->rxmit, + cur->end); + } + } + } + tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start); + /* + * Testing sblkp->start against cur->start tells us whether + * we're done with the sack block or the sack hole. + * Accordingly, we advance one or the other. + */ + if (SEQ_LEQ(sblkp->start, cur->start)) + cur = TAILQ_PREV(cur, sackhole_head, scblink); + else + sblkp--; + } +} + +/* + * Free all SACK holes to clear the scoreboard. 
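
The sweep that just ended distinguishes four ways a SACK block can intersect a hole: ack the whole hole, ack its start, ack its end, or split it in the middle. A compact userspace classification of those cases, with made-up sequence numbers:

#include <stdint.h>
#include <stdio.h>

#define SEQ_LEQ(a, b)	((int)((a)-(b)) <= 0)
#define SEQ_GEQ(a, b)	((int)((a)-(b)) >= 0)

/*
 * The four ways a SACK block [s, e) can intersect a hole [hs, he),
 * matching the cases in the scoreboard sweep above.
 */
static const char *
classify(uint32_t s, uint32_t e, uint32_t hs, uint32_t he)
{

	if (SEQ_LEQ(s, hs) && SEQ_GEQ(e, he))
		return ("covers hole: delete it");
	if (SEQ_LEQ(s, hs))
		return ("covers start: advance hole->start to e");
	if (SEQ_GEQ(e, he))
		return ("covers end: pull hole->end back to s");
	return ("middle: split into [hs, s) and [e, he)");
}

int
main(void)
{
	uint32_t hs = 1000, he = 2000;

	printf("%s\n", classify(900, 2100, hs, he));
	printf("%s\n", classify(900, 1500, hs, he));
	printf("%s\n", classify(1500, 2100, hs, he));
	printf("%s\n", classify(1200, 1800, hs, he));
	return (0);
}
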
+ */ +void +tcp_free_sackholes(struct tcpcb *tp) +{ + struct sackhole *q; + + INP_WLOCK_ASSERT(tp->t_inpcb); + while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) + tcp_sackhole_remove(tp, q); + tp->sackhint.sack_bytes_rexmit = 0; + + KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0")); + KASSERT(tp->sackhint.nexthole == NULL, + ("tp->sackhint.nexthole == NULL")); +} + +/* + * Partial ack handling within a sack recovery episode. Keeping this very + * simple for now. When a partial ack is received, force snd_cwnd to a value + * that will allow the sender to transmit no more than 2 segments. If + * necessary, a better scheme can be adopted at a later point, but for now, + * the goal is to prevent the sender from bursting a large amount of data in + * the midst of sack recovery. + */ +void +tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th) +{ + int num_segs = 1; + + INP_WLOCK_ASSERT(tp->t_inpcb); + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rtttime = 0; + /* Send one or 2 segments based on how much new data was acked. */ + if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) + num_segs = 2; + tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg); + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); +} + +#if 0 +/* + * Debug version of tcp_sack_output() that walks the scoreboard. Used for + * now to sanity check the hint. + */ +static struct sackhole * +tcp_sack_output_debug(struct tcpcb *tp, int *sack_bytes_rexmt) +{ + struct sackhole *p; + + INP_WLOCK_ASSERT(tp->t_inpcb); + *sack_bytes_rexmt = 0; + TAILQ_FOREACH(p, &tp->snd_holes, scblink) { + if (SEQ_LT(p->rxmit, p->end)) { + if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ + continue; + } + *sack_bytes_rexmt += (p->rxmit - p->start); + break; + } + *sack_bytes_rexmt += (p->rxmit - p->start); + } + return (p); +} +#endif + +/* + * Returns the next hole to retransmit and the number of retransmitted bytes + * from the scoreboard. We store both the next hole and the number of + * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK + * reception). This avoids scoreboard traversals completely. + * + * The loop here will traverse *at most* one link. Here's the argument. For + * the loop to traverse more than 1 link before finding the next hole to + * retransmit, we would need to have at least 1 node following the current + * hint with (rxmit == end). But, for all holes following the current hint, + * (start == rxmit), since we have not yet retransmitted from them. + * Therefore, in order to traverse more 1 link in the loop below, we need to + * have at least one node following the current hint with (start == rxmit == + * end). But that can't happen, (start == end) means that all the data in + * that hole has been sacked, in which case, the hole would have been removed + * from the scoreboard. + */ +struct sackhole * +tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) +{ + struct sackhole *hole = NULL; + + INP_WLOCK_ASSERT(tp->t_inpcb); + *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit; + hole = tp->sackhint.nexthole; + if (hole == NULL || SEQ_LT(hole->rxmit, hole->end)) + goto out; + while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) { + if (SEQ_LT(hole->rxmit, hole->end)) { + tp->sackhint.nexthole = hole; + break; + } + } +out: + return (hole); +} + +/* + * After a timeout, the SACK list may be rebuilt. 
This SACK information + * should be used to avoid retransmitting SACKed data. This function + * traverses the SACK list to see if snd_nxt should be moved forward. + */ +void +tcp_sack_adjust(struct tcpcb *tp) +{ + struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); + + INP_WLOCK_ASSERT(tp->t_inpcb); + if (cur == NULL) + return; /* No holes */ + if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) + return; /* We're already beyond any SACKed blocks */ + /*- + * Two cases for which we want to advance snd_nxt: + * i) snd_nxt lies between end of one hole and beginning of another + * ii) snd_nxt lies between end of last hole and snd_fack + */ + while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + if (SEQ_GEQ(tp->snd_nxt, p->start)) + cur = p; + else { + tp->snd_nxt = p->start; + return; + } + } + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + tp->snd_nxt = tp->snd_fack; +} diff --git a/freebsd/sys/netinet/tcp_seq.h b/freebsd/sys/netinet/tcp_seq.h new file mode 100644 index 00000000..8af7b0ab --- /dev/null +++ b/freebsd/sys/netinet/tcp_seq.h @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 1982, 1986, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_seq.h 8.3 (Berkeley) 6/21/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_SEQ_HH_ +#define _NETINET_TCP_SEQ_HH_ +/* + * TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. + */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? 
(a) : (b)) + +/* for modulo comparisons of timestamps */ +#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GT(a,b) ((int)((a)-(b)) > 0) +#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) + +/* + * Macros to initialize tcp sequence numbers for + * send and receive from initial send and receive + * sequence numbers. + */ +#define tcp_rcvseqinit(tp) \ + (tp)->rcv_adv = (tp)->rcv_nxt = (tp)->irs + 1 + +#define tcp_sendseqinit(tp) \ + (tp)->snd_una = (tp)->snd_nxt = (tp)->snd_max = (tp)->snd_up = \ + (tp)->snd_recover = (tp)->iss + +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * hz) + /* timestamp wrap-around time */ + +#endif /* _NETINET_TCP_SEQ_HH_ */ diff --git a/freebsd/sys/netinet/tcp_subr.c b/freebsd/sys/netinet/tcp_subr.c new file mode 100644 index 00000000..83777450 --- /dev/null +++ b/freebsd/sys/netinet/tcp_subr.c @@ -0,0 +1,2315 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
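
The SEQ_* macros from tcp_seq.h above are only meaningful while the two sequence numbers are within 2^31 of each other; inside that window the signed difference gives the right answer on both sides of the wrap, where a plain unsigned compare does not. A short demonstration (plain C, illustrative values):

#include <stdint.h>
#include <stdio.h>

#define SEQ_LT(a, b)	((int)((a)-(b)) < 0)

int
main(void)
{
	uint32_t a = 0xfffffffau;	/* just before the wrap */
	uint32_t b = 0x00000005u;	/* just after the wrap */

	/* Unsigned compare gets this wrong; the signed difference is -11. */
	printf("a < b (unsigned): %d\n", a < b);	/* prints 0 */
	printf("SEQ_LT(a, b):     %d\n", SEQ_LT(a, b));	/* prints 1 */
	return (0);
}
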
+ * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef INET6 +#include +#endif +#include +#include +#ifdef INET6 +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef TCPDEBUG +#include +#endif +#include + +#ifdef IPSEC +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#endif /*IPSEC*/ + +#include +#include + +#include + +VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; +#ifdef INET6 +VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; +#endif + +static int +sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) +{ + int error, new; + + new = V_tcp_mssdflt; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if (new < TCP_MINMSS) + error = EINVAL; + else + V_tcp_mssdflt = new; + } + return (error); +} + +SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_mssdflt), 0, + &sysctl_net_inet_tcp_mss_check, "I", + "Default TCP Maximum Segment Size"); + +#ifdef INET6 +static int +sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) +{ + int error, new; + + new = V_tcp_v6mssdflt; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if (new < TCP_MINMSS) + error = EINVAL; + else + V_tcp_v6mssdflt = new; + } + return (error); +} + +SYSCTL_VNET_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_v6mssdflt), 0, + &sysctl_net_inet_tcp_mss_v6_check, "I", + "Default TCP Maximum Segment Size for IPv6"); +#endif + +static int +vnet_sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS) +{ + + VNET_SYSCTL_ARG(req, arg1); + return (sysctl_msec_to_ticks(oidp, arg1, arg2, req)); +} + +/* + * Minimum MSS we accept and use. This prevents DoS attacks where + * we are forced to a ridiculous low MSS like 20 and send hundreds + * of packets instead of one. The effect scales with the available + * bandwidth and quickly saturates the CPU and network interface + * with packet generation and sending. Set to zero to disable MINMSS + * checking. This setting prevents us from sending too small packets. 
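
Rough numbers behind that rationale: the per-packet cost is what scales, so a forced 20-byte MSS inflates the segment count roughly 73x against an Ethernet-sized MSS. A quick standalone calculation (payload size is illustrative):

#include <stdio.h>

int
main(void)
{
	long payload = 10L * 1000 * 1000;	/* 10 MB to deliver */
	int mss_normal = 1460, mss_tiny = 20;

	/*
	 * A ridiculously small MSS multiplies the packet count (and the
	 * per-packet header and processing overhead) by ~73x; that is
	 * the DoS the minmss floor guards against.
	 */
	printf("segments at mss %d: %ld\n", mss_normal,
	    (payload + mss_normal - 1) / mss_normal);
	printf("segments at mss %d: %ld\n", mss_tiny,
	    (payload + mss_tiny - 1) / mss_tiny);
	return (0);
}
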
+ */ +VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, + &VNET_NAME(tcp_minmss), 0, + "Minmum TCP Maximum Segment Size"); + +VNET_DEFINE(int, tcp_do_rfc1323) = 1; +SYSCTL_VNET_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, + &VNET_NAME(tcp_do_rfc1323), 0, + "Enable rfc1323 (high performance TCP) extensions"); + +static int tcp_log_debug = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, + &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, + &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); + +static VNET_DEFINE(int, icmp_may_rst) = 1; +#define V_icmp_may_rst VNET(icmp_may_rst) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, + &VNET_NAME(icmp_may_rst), 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static VNET_DEFINE(int, tcp_isn_reseed_interval) = 0; +#define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, + &VNET_NAME(tcp_isn_reseed_interval), 0, + "Seconds between reseeding of ISN secret"); + +/* + * TCP bandwidth limiting sysctls. Note that the default lower bound of + * 1024 exists only for debugging. A good production default would be + * something like 6100. + */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, + "TCP inflight data limiting"); + +static VNET_DEFINE(int, tcp_inflight_enable) = 0; +#define V_tcp_inflight_enable VNET(tcp_inflight_enable) +SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, + &VNET_NAME(tcp_inflight_enable), 0, + "Enable automatic TCP inflight data limiting"); + +static int tcp_inflight_debug = 0; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, + &tcp_inflight_debug, 0, + "Debug TCP inflight calculations"); + +static VNET_DEFINE(int, tcp_inflight_rttthresh); +#define V_tcp_inflight_rttthresh VNET(tcp_inflight_rttthresh) +SYSCTL_VNET_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, + CTLTYPE_INT|CTLFLAG_RW, &VNET_NAME(tcp_inflight_rttthresh), 0, + vnet_sysctl_msec_to_ticks, "I", + "RTT threshold below which inflight will deactivate itself"); + +static VNET_DEFINE(int, tcp_inflight_min) = 6144; +#define V_tcp_inflight_min VNET(tcp_inflight_min) +SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, + &VNET_NAME(tcp_inflight_min), 0, + "Lower-bound for TCP inflight window"); + +static VNET_DEFINE(int, tcp_inflight_max) = TCP_MAXWIN << TCP_MAX_WINSHIFT; +#define V_tcp_inflight_max VNET(tcp_inflight_max) +SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, + &VNET_NAME(tcp_inflight_max), 0, + "Upper-bound for TCP inflight window"); + +static VNET_DEFINE(int, tcp_inflight_stab) = 20; +#define V_tcp_inflight_stab VNET(tcp_inflight_stab) +SYSCTL_VNET_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, + &VNET_NAME(tcp_inflight_stab), 0, + "Inflight Algorithm Stabilization 20 = 2 packets"); + +VNET_DEFINE(uma_zone_t, sack_hole_zone); +#define V_sack_hole_zone VNET(sack_hole_zone) + +static struct inpcb *tcp_notify(struct inpcb *, int); +static void 
tcp_isn_tick(void *); +static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, + void *ip4hdr, const void *ip6hdr); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * XXX + * Callouts should be moved into struct tcp directly. They are currently + * separate because the tcpcb structure is exported to userland for sysctl + * parsing purposes, which do not know about callouts. + */ +struct tcpcb_mem { + struct tcpcb tcb; + struct tcp_timer tt; +}; + +static VNET_DEFINE(uma_zone_t, tcpcb_zone); +#define V_tcpcb_zone VNET(tcpcb_zone) + +MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); +struct callout isn_callout; +static struct mtx isn_mtx; + +#define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) +#define ISN_LOCK() mtx_lock(&isn_mtx) +#define ISN_UNLOCK() mtx_unlock(&isn_mtx) + +/* + * TCP initialization. + */ +static void +tcp_zone_change(void *tag) +{ + + uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); + uma_zone_set_max(V_tcpcb_zone, maxsockets); + tcp_tw_zone_change(); +} + +static int +tcp_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp = mem; + + INP_LOCK_INIT(inp, "inp", "tcpinp"); + return (0); +} + +void +tcp_init(void) +{ + int hashsize; + + INP_INFO_LOCK_INIT(&V_tcbinfo, "tcp"); + LIST_INIT(&V_tcb); +#ifdef VIMAGE + V_tcbinfo.ipi_vnet = curvnet; +#endif + V_tcbinfo.ipi_listhead = &V_tcb; + hashsize = TCBHASHSIZE; + TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize); + if (!powerof2(hashsize)) { + printf("WARNING: TCB hash size not a power of 2\n"); + hashsize = 512; /* safe default */ + } + V_tcbinfo.ipi_hashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_hashmask); + V_tcbinfo.ipi_porthashbase = hashinit(hashsize, M_PCB, + &V_tcbinfo.ipi_porthashmask); + V_tcbinfo.ipi_zone = uma_zcreate("tcp_inpcb", sizeof(struct inpcb), + NULL, NULL, tcp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_tcbinfo.ipi_zone, maxsockets); + V_tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; + + /* + * These have to be type stable for the benefit of the timers. + */ + V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_tcpcb_zone, maxsockets); + + tcp_tw_init(); + syncache_init(); + tcp_hc_init(); + tcp_reass_init(); + + TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); + V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + + /* Skip initialization of globals for non-default instances. */ + if (!IS_DEFAULT_VNET(curvnet)) + return; + + /* XXX virtualize those bellow? 
*/ + tcp_delacktime = TCPTV_DELACK; + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + tcp_msl = TCPTV_MSL; + tcp_rexmit_min = TCPTV_MIN; + if (tcp_rexmit_min < 1) + tcp_rexmit_min = 1; + tcp_rexmit_slop = TCPTV_CPU_VAR; + tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; + tcp_tcbhashsize = hashsize; + +#ifdef INET6 +#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) +#else /* INET6 */ +#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr)) +#endif /* INET6 */ + if (max_protohdr < TCP_MINPROTOHDR) + max_protohdr = TCP_MINPROTOHDR; + if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) + panic("tcp_init"); +#undef TCP_MINPROTOHDR + + ISN_LOCK_INIT(); + callout_init(&isn_callout, CALLOUT_MPSAFE); + callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); + EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, + SHUTDOWN_PRI_DEFAULT); + EVENTHANDLER_REGISTER(maxsockets_change, tcp_zone_change, NULL, + EVENTHANDLER_PRI_ANY); +} + +#ifdef VIMAGE +void +tcp_destroy(void) +{ + + tcp_reass_destroy(); + tcp_hc_destroy(); + syncache_destroy(); + tcp_tw_destroy(); + + /* XXX check that hashes are empty! */ + hashdestroy(V_tcbinfo.ipi_hashbase, M_PCB, + V_tcbinfo.ipi_hashmask); + hashdestroy(V_tcbinfo.ipi_porthashbase, M_PCB, + V_tcbinfo.ipi_porthashmask); + + uma_zdestroy(V_sack_hole_zone); + uma_zdestroy(V_tcpcb_zone); + uma_zdestroy(V_tcbinfo.ipi_zone); + + INP_INFO_LOCK_DESTROY(&V_tcbinfo); +} +#endif + +void +tcp_fini(void *xtp) +{ + + callout_stop(&isn_callout); +} + +/* + * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. + * tcp_template used to store this data in mbufs, but we now recopy it out + * of the tcpcb each time to conserve mbufs. + */ +void +tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr) +{ + struct tcphdr *th = (struct tcphdr *)tcp_ptr; + + INP_WLOCK_ASSERT(inp); + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)ip_ptr; + ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | + (inp->inp_flow & IPV6_FLOWINFO_MASK); + ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | + (IPV6_VERSION & IPV6_VERSION_MASK); + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = htons(sizeof(struct tcphdr)); + ip6->ip6_src = inp->in6p_laddr; + ip6->ip6_dst = inp->in6p_faddr; + } else +#endif + { + struct ip *ip; + + ip = (struct ip *)ip_ptr; + ip->ip_v = IPVERSION; + ip->ip_hl = 5; + ip->ip_tos = inp->inp_ip_tos; + ip->ip_len = 0; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_ttl = inp->inp_ip_ttl; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = inp->inp_laddr; + ip->ip_dst = inp->inp_faddr; + } + th->th_sport = inp->inp_lport; + th->th_dport = inp->inp_fport; + th->th_seq = 0; + th->th_ack = 0; + th->th_x2 = 0; + th->th_off = 5; + th->th_flags = 0; + th->th_win = 0; + th->th_urp = 0; + th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ +} + +/* + * Create template to be used to send tcp packets on a connection. + * Allocates an mbuf and fills in a skeletal tcp/ip header. The only + * use for this function is in keepalives, which use tcp_respond. + */ +struct tcptemp * +tcpip_maketemplate(struct inpcb *inp) +{ + struct tcptemp *t; + + t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); + if (t == NULL) + return (NULL); + tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t); + return (t); +} + +/* + * Send a single message to the TCP at address specified by + * the given TCP/IP header. 
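
A userspace sketch of the same header-template idea as tcpip_fillheaders() above: the fixed per-connection fields are filled in, while lengths, sequence numbers and checksums stay zero for the caller. The structures and values here are illustrative stand-ins for the kernel ones, and byte-order conversion is elided for brevity:

#include <stdint.h>
#include <string.h>

/* Minimal stand-ins for the kernel header structures (illustrative). */
struct ip_min {
	uint8_t  vhl;		/* version << 4 | header length in words */
	uint8_t  tos;
	uint16_t len, id, off;
	uint8_t  ttl, p;
	uint16_t sum;
	uint32_t src, dst;
};
struct tcp_min {
	uint16_t sport, dport;
	uint32_t seq, ack;
	uint8_t  off;		/* data offset in 32-bit words << 4 */
	uint8_t  flags;
	uint16_t win, sum, urp;
};

/*
 * Fill skeletal headers from connection state, leaving length, sequence
 * numbers and checksums zero for the caller -- the same division of
 * labour as tcpip_fillheaders().
 */
static void
fillheaders(struct ip_min *ip, struct tcp_min *th,
    uint32_t laddr, uint32_t faddr, uint16_t lport, uint16_t fport)
{

	memset(ip, 0, sizeof(*ip));
	ip->vhl = (4 << 4) | 5;		/* IPv4, 20-byte header */
	ip->ttl = 64;			/* stand-in for inp_ip_ttl */
	ip->p = 6;			/* IPPROTO_TCP */
	ip->src = laddr;
	ip->dst = faddr;

	memset(th, 0, sizeof(*th));
	th->sport = lport;
	th->dport = fport;
	th->off = 5 << 4;		/* 20-byte TCP header, no options */
}

int
main(void)
{
	struct ip_min ip;
	struct tcp_min th;

	fillheaders(&ip, &th, 0x0a000001, 0x0a000002, 49152, 80);
	return (0);
}
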
If m == NULL, then we make a copy + * of the tcpiphdr at ti and send directly to the addressed host. + * This is used to force keep alive messages out using the TCP + * template for a connection. If flags are given then we send + * a message back to the TCP which originated the * segment ti, + * and discard the mbuf containing it and any other attached mbufs. + * + * In any case the ack and sequence number of the transmitted + * segment are as specified by the parameters. + * + * NOTE: If m != NULL, then ti must point to *inside* the mbuf. + */ +void +tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, + tcp_seq ack, tcp_seq seq, int flags) +{ + int tlen; + int win = 0; + struct ip *ip; + struct tcphdr *nth; +#ifdef INET6 + struct ip6_hdr *ip6; + int isipv6; +#endif /* INET6 */ + int ipflags = 0; + struct inpcb *inp; + + KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); + +#ifdef INET6 + isipv6 = ((struct ip *)ipgen)->ip_v == 6; + ip6 = ipgen; +#endif /* INET6 */ + ip = ipgen; + + if (tp != NULL) { + inp = tp->t_inpcb; + KASSERT(inp != NULL, ("tcp control block w/o inpcb")); + INP_WLOCK_ASSERT(inp); + } else + inp = NULL; + + if (tp != NULL) { + if (!(flags & TH_RST)) { + win = sbspace(&inp->inp_socket->so_rcv); + if (win > (long)TCP_MAXWIN << tp->rcv_scale) + win = (long)TCP_MAXWIN << tp->rcv_scale; + } + } + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + tlen = 0; + m->m_data += max_linkhdr; +#ifdef INET6 + if (isipv6) { + bcopy((caddr_t)ip6, mtod(m, caddr_t), + sizeof(struct ip6_hdr)); + ip6 = mtod(m, struct ip6_hdr *); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); + ip = mtod(m, struct ip *); + nth = (struct tcphdr *)(ip + 1); + } + bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); + flags = TH_ACK; + } else { + /* + * reuse the mbuf. + * XXX MRT We inherrit the FIB, which is lucky. + */ + m_freem(m->m_next); + m->m_next = NULL; + m->m_data = (caddr_t)ipgen; + /* m_len is set later */ + tlen = 0; +#define xchg(a,b,type) { type t; t=a; a=b; b=t; } +#ifdef INET6 + if (isipv6) { + xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); + nth = (struct tcphdr *)(ip6 + 1); + } else +#endif /* INET6 */ + { + xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); + nth = (struct tcphdr *)(ip + 1); + } + if (th != nth) { + /* + * this is usually a case when an extension header + * exists between the IPv6 header and the + * TCP header. + */ + nth->th_sport = th->th_sport; + nth->th_dport = th->th_dport; + } + xchg(nth->th_dport, nth->th_sport, uint16_t); +#undef xchg + } +#ifdef INET6 + if (isipv6) { + ip6->ip6_flow = 0; + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) + + tlen)); + tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + } else +#endif + { + tlen += sizeof (struct tcpiphdr); + ip->ip_len = tlen; + ip->ip_ttl = V_ip_defttl; + if (V_path_mtu_discovery) + ip->ip_off |= IP_DF; + } + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = NULL; +#ifdef MAC + if (inp != NULL) { + /* + * Packet is associated with a socket, so allow the + * label of the response to reflect the socket label. + */ + INP_WLOCK_ASSERT(inp); + mac_inpcb_create_mbuf(inp, m); + } else { + /* + * Packet is not associated with a socket, so possibly + * update the label in place. 
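
The in-place mbuf reuse rests on nothing more than swapping the two address/port pairs, as the local xchg() macro does. A standalone illustration (addresses are made up, byte order elided):

#include <stdint.h>
#include <stdio.h>

/* Same idiom as the local xchg() macro in tcp_respond(). */
#define xchg(a, b, type) do { type t = (a); (a) = (b); (b) = t; } while (0)

int
main(void)
{
	/* A received segment's addressing, reused in place for the reply. */
	uint32_t src = 0x0a000001, dst = 0x0a000002;	/* 10.0.0.1 -> .2 */
	uint16_t sport = 12345, dport = 80;

	xchg(src, dst, uint32_t);
	xchg(sport, dport, uint16_t);

	printf("reply: %#x:%u -> %#x:%u\n",
	    (unsigned)src, sport, (unsigned)dst, dport);
	return (0);
}
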
+ */ + mac_netinet_tcp_reply(m); + } +#endif + nth->th_seq = htonl(seq); + nth->th_ack = htonl(ack); + nth->th_x2 = 0; + nth->th_off = sizeof (struct tcphdr) >> 2; + nth->th_flags = flags; + if (tp != NULL) + nth->th_win = htons((u_short) (win >> tp->rcv_scale)); + else + nth->th_win = htons((u_short)win); + nth->th_urp = 0; +#ifdef INET6 + if (isipv6) { + nth->th_sum = 0; + nth->th_sum = in6_cksum(m, IPPROTO_TCP, + sizeof(struct ip6_hdr), + tlen - sizeof(struct ip6_hdr)); + ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb : + NULL, NULL); + } else +#endif /* INET6 */ + { + nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + } +#ifdef TCPDEBUG + if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); +#endif +#ifdef INET6 + if (isipv6) + (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); + else +#endif /* INET6 */ + (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); +} + +/* + * Create a new TCP control block, making an + * empty reassembly queue and hooking it to the argument + * protocol control block. The `inp' parameter must have + * come from the zone allocator set up in tcp_init(). + */ +struct tcpcb * +tcp_newtcpcb(struct inpcb *inp) +{ + struct tcpcb_mem *tm; + struct tcpcb *tp; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO); + if (tm == NULL) + return (NULL); + tp = &tm->tcb; +#ifdef VIMAGE + tp->t_vnet = inp->inp_vnet; +#endif + tp->t_timers = &tm->tt; + /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ + tp->t_maxseg = tp->t_maxopd = +#ifdef INET6 + isipv6 ? V_tcp_v6mssdflt : +#endif /* INET6 */ + V_tcp_mssdflt; + + /* Set up our timeouts. */ + callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); + callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); + + if (V_tcp_do_rfc1323) + tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); + if (V_tcp_do_sack) + tp->t_flags |= TF_SACK_PERMIT; + TAILQ_INIT(&tp->snd_holes); + tp->t_inpcb = inp; /* XXX */ + /* + * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no + * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives + * reasonable initial retransmit time. + */ + tp->t_srtt = TCPTV_SRTTBASE; + tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; + tp->t_rttmin = tcp_rexmit_min; + tp->t_rxtcur = TCPTV_RTOBASE; + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->t_rcvtime = ticks; + tp->t_bw_rtttime = ticks; + /* + * IPv4 TTL initialization is necessary for an IPv6 socket as well, + * because the socket may be bound to an IPv6 wildcard address, + * which may match an IPv4-mapped IPv6 address. + */ + inp->inp_ip_ttl = V_ip_defttl; + inp->inp_ppcb = tp; + return (tp); /* XXX */ +} + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. 
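
The rttvar seeding above is chosen so that the classic rto = srtt + 4 * rttvar comes out at exactly TCPTV_RTOBASE on a fresh connection. A sketch of that arithmetic, assuming the usual FreeBSD scaling (t_srtt stored x32, t_rttvar stored x16) and an illustrative hz:

#include <stdio.h>

#define TCP_RTT_SHIFT		5	/* t_srtt is scaled by 32 */
#define TCP_RTTVAR_SHIFT	4	/* t_rttvar is scaled by 16 */

int
main(void)
{
	int hz = 1000;			/* illustrative tick rate */
	int srttbase = 0;		/* TCPTV_SRTTBASE */
	int rtobase = 3 * hz;		/* TCPTV_RTOBASE: 3 s initial RTO */

	/* The seeding from tcp_newtcpcb() above. */
	int t_srtt = srttbase;
	int t_rttvar = ((rtobase - srttbase) << TCP_RTTVAR_SHIFT) / 4;

	/* Classic rto = srtt + 4 * rttvar, with the scaling undone. */
	int rto = (t_srtt >> TCP_RTT_SHIFT) +
	    4 * (t_rttvar >> TCP_RTTVAR_SHIFT);

	printf("initial rto = %d ticks (expected %d)\n", rto, rtobase);
	return (0);
}
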
+ */ +struct tcpcb * +tcp_drop(struct tcpcb *tp, int errno) +{ + struct socket *so = tp->t_inpcb->inp_socket; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_output_reset(tp); + TCPSTAT_INC(tcps_drops); + } else + TCPSTAT_INC(tcps_conndrops); + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (tcp_close(tp)); +} + +void +tcp_discardcb(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; +#ifdef INET6 + int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif /* INET6 */ + + INP_WLOCK_ASSERT(inp); + + /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(&tp->t_timers->tt_rexmt); + callout_stop(&tp->t_timers->tt_persist); + callout_stop(&tp->t_timers->tt_keep); + callout_stop(&tp->t_timers->tt_2msl); + callout_stop(&tp->t_timers->tt_delack); + + /* + * If we got enough samples through the srtt filter, + * save the rtt and rttvar in the routing entry. + * 'Enough' is arbitrarily defined as 4 rtt samples. + * 4 samples is enough for the srtt filter to converge + * to within enough % of the correct value; fewer samples + * and we could save a bogus rtt. The danger is not high + * as tcp quickly recovers from everything. + * XXX: Works very well but needs some more statistics! + */ + if (tp->t_rttupdated >= 4) { + struct hc_metrics_lite metrics; + u_long ssthresh; + + bzero(&metrics, sizeof(metrics)); + /* + * Update the ssthresh always when the conditions below + * are satisfied. This gives us better new start value + * for the congestion avoidance for new connections. + * ssthresh is only set if packet loss occured on a session. + * + * XXXRW: 'so' may be NULL here, and/or socket buffer may be + * being torn down. Ideally this code would not use 'so'. + */ + ssthresh = tp->snd_ssthresh; + if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { + /* + * convert the limit from user data bytes to + * packets then to packet data bytes. + */ + ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; + if (ssthresh < 2) + ssthresh = 2; + ssthresh *= (u_long)(tp->t_maxseg + +#ifdef INET6 + (isipv6 ? sizeof (struct ip6_hdr) + + sizeof (struct tcphdr) : +#endif + sizeof (struct tcpiphdr) +#ifdef INET6 + ) +#endif + ); + } else + ssthresh = 0; + metrics.rmx_ssthresh = ssthresh; + + metrics.rmx_rtt = tp->t_srtt; + metrics.rmx_rttvar = tp->t_rttvar; + /* XXX: This wraps if the pipe is more than 4 Gbit per second */ + metrics.rmx_bandwidth = tp->snd_bandwidth; + metrics.rmx_cwnd = tp->snd_cwnd; + metrics.rmx_sendpipe = 0; + metrics.rmx_recvpipe = 0; + + tcp_hc_update(&inp->inp_inc, &metrics); + } + + /* free the reassembly queue, if any */ + tcp_reass_flush(tp); + /* Disconnect offload device, if any. */ + tcp_offload_detach(tp); + + tcp_free_sackholes(tp); + inp->inp_ppcb = NULL; + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); +} + +/* + * Attempt to close a TCP control block, marking it as dropped, and freeing + * the socket if we hold the only reference. 
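
The ssthresh caching in tcp_discardcb() above rounds the byte count to the nearest whole segment and then re-expands it with per-segment header overhead included. The same arithmetic standalone, with illustrative sizes:

#include <stdio.h>

int
main(void)
{
	unsigned long ssthresh = 19000;	/* bytes of user data */
	unsigned long maxseg = 1460;	/* MSS */
	unsigned long hdrlen = 40;	/* IPv4 + TCP header */

	/* Round to the nearest whole segment, as tcp_discardcb() does. */
	unsigned long segs = (ssthresh + maxseg / 2) / maxseg;
	if (segs < 2)
		segs = 2;

	/* Back to wire bytes, now counting headers per segment. */
	printf("cache ssthresh = %lu bytes (%lu segments)\n",
	    segs * (maxseg + hdrlen), segs);
	return (0);
}
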
+ */ +struct tcpcb * +tcp_close(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + /* Notify any offload devices of listener close */ + if (tp->t_state == TCPS_LISTEN) + tcp_offload_listen_close(tp); + in_pcbdrop(inp); + TCPSTAT_INC(tcps_closed); + KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); + so = inp->inp_socket; + soisdisconnected(so); + if (inp->inp_flags & INP_SOCKREF) { + KASSERT(so->so_state & SS_PROTOREF, + ("tcp_close: !SS_PROTOREF")); + inp->inp_flags &= ~INP_SOCKREF; + INP_WUNLOCK(inp); + ACCEPT_LOCK(); + SOCK_LOCK(so); + so->so_state &= ~SS_PROTOREF; + sofree(so); + return (NULL); + } + return (tp); +} + +void +tcp_drain(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + if (!do_tcpdrain) + return; + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + struct inpcb *inpb; + struct tcpcb *tcpb; + + /* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * XXX: The "Net/3" implementation doesn't imply that the TCP + * reassembly queue should be flushed, but in a situation + * where we're really low on mbufs, this is potentially + * usefull. + */ + INP_INFO_RLOCK(&V_tcbinfo); + LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { + if (inpb->inp_flags & INP_TIMEWAIT) + continue; + INP_WLOCK(inpb); + if ((tcpb = intotcpcb(inpb)) != NULL) { + tcp_reass_flush(tcpb); + tcp_clean_sackreport(tcpb); + } + INP_WUNLOCK(inpb); + } + INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static struct inpcb * +tcp_notify(struct inpcb *inp, int error) +{ + struct tcpcb *tp; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_flags & INP_TIMEWAIT) || + (inp->inp_flags & INP_DROPPED)) + return (inp); + + tp = intotcpcb(inp); + KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return (inp); + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) { + tp = tcp_drop(tp, error); + if (tp != NULL) + return (inp); + else + return (NULL); + } else { + tp->t_softerror = error; + return (inp); + } +#if 0 + wakeup( &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +static int +tcp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, m, n, pcb_count; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the TCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == NULL) { + n = V_tcbinfo.ipi_count + syncache_pcbcount(); + n += imax(n / 8, 10); + req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); + return (0); + } + + if (req->newptr != NULL) + return (EPERM); + + /* + * OK, now we're committed to doing something. 
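
The size estimate in tcp_pcblist() pads the PCB count so that connections created between the size probe and the later copy still fit in the user's buffer. The padding rule, standalone (record sizes are illustrative):

#include <stdio.h>

static int
imax(int a, int b)
{

	return (a > b ? a : b);
}

int
main(void)
{
	int n = 1000;			/* PCBs at estimation time */
	size_t xig = 64, xtcpcb = 768;	/* illustrative record sizes */

	/*
	 * Pad the count by 12.5% (at least 10) so entries added while
	 * the caller allocates its buffer still fit, as tcp_pcblist()
	 * does before reporting oldidx.
	 */
	n += imax(n / 8, 10);
	printf("reserve %zu bytes for up to %d entries\n",
	    2 * xig + (size_t)n * xtcpcb, n);
	return (0);
}
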
+ */ + INP_INFO_RLOCK(&V_tcbinfo); + gencnt = V_tcbinfo.ipi_gencnt; + n = V_tcbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_tcbinfo); + + m = syncache_pcbcount(); + + error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + + (n + m) * sizeof(struct xtcpcb)); + if (error != 0) + return (error); + + xig.xig_len = sizeof xig; + xig.xig_count = n + m; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return (error); + + error = syncache_pcblist(req, m, &pcb_count); + if (error) + return (error); + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == NULL) + return (ENOMEM); + + INP_INFO_RLOCK(&V_tcbinfo); + for (inp = LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; + inp != NULL && i < n; inp = LIST_NEXT(inp, inp_list)) { + INP_WLOCK(inp); + if (inp->inp_gencnt <= gencnt) { + /* + * XXX: This use of cr_cansee(), introduced with + * TCP state changes, is not quite right, but for + * now, better than nothing. + */ + if (inp->inp_flags & INP_TIMEWAIT) { + if (intotw(inp) != NULL) + error = cr_cansee(req->td->td_ucred, + intotw(inp)->tw_cred); + else + error = EINVAL; /* Skip this inp. */ + } else + error = cr_canseeinpcb(req->td->td_ucred, inp); + if (error == 0) { + in_pcbref(inp); + inp_list[i++] = inp; + } + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_tcbinfo); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xtcpcb xt; + void *inp_ppcb; + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof xt; + /* XXX should avoid extra copy */ + bcopy(inp, &xt.xt_inp, sizeof *inp); + inp_ppcb = inp->inp_ppcb; + if (inp_ppcb == NULL) + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + else if (inp->inp_flags & INP_TIMEWAIT) { + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + xt.xt_tp.t_state = TCPS_TIME_WAIT; + } else + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); + if (inp->inp_socket != NULL) + sotoxsocket(inp->inp_socket, &xt.xt_socket); + else { + bzero(&xt.xt_socket, sizeof xt.xt_socket); + xt.xt_socket.xso_protocol = IPPROTO_TCP; + } + xt.xt_inp.inp_gencnt = inp->inp_gencnt; + INP_RUNLOCK(inp); + error = SYSCTL_OUT(req, &xt, sizeof xt); + } else + INP_RUNLOCK(inp); + } + INP_INFO_WLOCK(&V_tcbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_WLOCK(inp); + if (!in_pcbrele(inp)) + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_tcbinfo); + + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
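
What the final xig re-export enables is a reader-side retry: the consumer compares the generation count reported before and after the copy and retries when they differ. A minimal sketch of that pattern (the structure and names here are invented for illustration, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/*
 * Reader-side retry behind xig_gen: export the snapshot together with
 * the generation count seen before and after the copy; a mismatch
 * means the list changed mid-export.
 */
struct snapshot {
	uint64_t gen_before;
	uint64_t gen_after;
	/* ... exported records would follow ... */
};

static int
snapshot_is_stable(const struct snapshot *s)
{

	return (s->gen_before == s->gen_after);
}

int
main(void)
{
	struct snapshot s = { .gen_before = 41, .gen_after = 42 };

	if (!snapshot_is_stable(&s))
		printf("list changed during export; caller may retry\n");
	return (0);
}
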
+ */ + INP_INFO_RLOCK(&V_tcbinfo); + xig.xig_gen = V_tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = V_tcbinfo.ipi_count + pcb_count; + INP_INFO_RUNLOCK(&V_tcbinfo); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); + +static int +tcp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error; + + error = priv_check(req->td, PRIV_NETINET_GETCRED); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + INP_INFO_RLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, addrs[1].sin_addr, + addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, 0, NULL); + if (inp != NULL) { + INP_RLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (inp->inp_socket == NULL) + error = ENOENT; + if (error == 0) + error = cr_canseeinpcb(req->td->td_ucred, inp); + if (error == 0) + cru2x(inp->inp_cred, &xuc); + INP_RUNLOCK(inp); + } else { + INP_INFO_RUNLOCK(&V_tcbinfo); + error = ENOENT; + } + if (error == 0) + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); + +#ifdef INET6 +static int +tcp6_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in6 addrs[2]; + struct inpcb *inp; + int error, mapped = 0; + + error = priv_check(req->td, PRIV_NETINET_GETCRED); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || + (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { + return (error); + } + if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { + if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) + mapped = 1; + else + return (EINVAL); + } + + INP_INFO_RLOCK(&V_tcbinfo); + if (mapped == 1) + inp = in_pcblookup_hash(&V_tcbinfo, + *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], + addrs[1].sin6_port, + *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], + addrs[0].sin6_port, + 0, NULL); + else + inp = in6_pcblookup_hash(&V_tcbinfo, + &addrs[1].sin6_addr, addrs[1].sin6_port, + &addrs[0].sin6_addr, addrs[0].sin6_port, 0, NULL); + if (inp != NULL) { + INP_RLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (inp->inp_socket == NULL) + error = ENOENT; + if (error == 0) + error = cr_canseeinpcb(req->td->td_ucred, inp); + if (error == 0) + cru2x(inp->inp_cred, &xuc); + INP_RUNLOCK(inp); + } else { + INP_INFO_RUNLOCK(&V_tcbinfo); + error = ENOENT; + } + if (error == 0) + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); + return (error); +} + +SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); +#endif + + +void +tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct icmp *icp; + struct in_conninfo inc; + tcp_seq icmp_tcp_seq; + int mtu; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_MSGSIZE) + notify = 
tcp_mtudisc; + else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) + notify = tcp_drop_syn_sent; + /* + * Redirects don't need to be handled up here. + */ + else if (PRC_IS_REDIRECT(cmd)) + return; + /* + * Source quench is deprecated. + */ + else if (cmd == PRC_QUENCH) + return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. + */ + else if (cmd == PRC_HOSTDEAD) + ip = NULL; + else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip != NULL) { + icp = (struct icmp *)((caddr_t)ip + - offsetof(struct icmp, icmp_ip)); + th = (struct tcphdr *)((caddr_t)ip + + (ip->ip_hl << 2)); + INP_INFO_WLOCK(&V_tcbinfo); + inp = in_pcblookup_hash(&V_tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL) { + INP_WLOCK(inp); + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + bzero(&inc, sizeof(inc)); + inc.inc_faddr = faddr; + inc.inc_fibnum = + inp->inp_inc.inc_fibnum; + + mtu = ntohs(icp->icmp_nextmtu); + /* + * If no alternative MTU was + * proposed, try the next smaller + * one. ip->ip_len has already + * been swapped in icmp_input(). + */ + if (!mtu) + mtu = ip_next_mtu(ip->ip_len, + 1); + if (mtu < V_tcp_minmss + + sizeof(struct tcpiphdr)) + mtu = V_tcp_minmss + + sizeof(struct tcpiphdr); + /* + * Only cache the MTU if it + * is smaller than the interface + * or route MTU. tcp_mtudisc() + * will do the right thing by itself. + */ + if (mtu <= tcp_maxmtu(&inc, NULL)) + tcp_hc_updatemtu(&inc, mtu); + } + + inp = (*notify)(inp, inetctlerrmap[cmd]); + } + } + if (inp != NULL) + INP_WUNLOCK(inp); + } else { + bzero(&inc, sizeof(inc)); + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; + syncache_unreach(&inc, th); + } + INP_INFO_WUNLOCK(&V_tcbinfo); + } else + in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) +{ + struct tcphdr th; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct ip6_hdr *ip6; + struct mbuf *m; + struct ip6ctlparam *ip6cp = NULL; + const struct sockaddr_in6 *sa6_src = NULL; + int off; + struct tcp_portonly { + u_int16_t th_sport; + u_int16_t th_dport; + } *thp; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + /* Source quench is deprecated. */ + else if (cmd == PRC_QUENCH) + return; + + /* if the parameter is from icmp6, decode it.
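[Editor's note] The needfrag branch of tcp_ctlinput() above reduces to: take the router's suggested MTU if one was given, otherwise step down to the next smaller plateau below the size that failed, and never go below the smallest segment TCP is willing to use. A standalone sketch of that calculation, not part of the patch; the plateau table follows RFC 1191, and tcp_minmss_example and TCPIPHDR_LEN are stand-ins for V_tcp_minmss and sizeof(struct tcpiphdr):

#define TCPIPHDR_LEN 40U    /* IPv4 + TCP header, no options */
static const unsigned tcp_minmss_example = 216;    /* V_tcp_minmss default */

static unsigned
next_mtu_to_try(unsigned icmp_nextmtu, unsigned failed_pkt_len)
{
    /* RFC 1191 common MTU plateaus, largest first. */
    static const unsigned plateaus[] = {
        65535, 32000, 17914, 8166, 4352, 2002, 1492, 1006, 508, 296, 68
    };
    unsigned mtu = icmp_nextmtu;
    unsigned i;

    if (mtu == 0) {
        /* Router gave no hint: pick the first plateau that would fit. */
        mtu = 68;
        for (i = 0; i < sizeof(plateaus) / sizeof(plateaus[0]); i++) {
            if (plateaus[i] < failed_pkt_len) {
                mtu = plateaus[i];
                break;
            }
        }
    }
    /* Never go below the smallest segment TCP will use. */
    if (mtu < tcp_minmss_example + TCPIPHDR_LEN)
        mtu = tcp_minmss_example + TCPIPHDR_LEN;
    return (mtu);
}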
*/ + if (d != NULL) { + ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + sa6_src = ip6cp->ip6c_src; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + sa6_src = &sa6_any; + } + + if (ip6 != NULL) { + struct in_conninfo inc; + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid. + */ + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(*thp)) + return; + + bzero(&th, sizeof(th)); + m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + + in6_pcbnotify(&V_tcbinfo, sa, th.th_dport, + (struct sockaddr *)ip6cp->ip6c_src, + th.th_sport, cmd, NULL, notify); + + bzero(&inc, sizeof(inc)); + inc.inc_fport = th.th_dport; + inc.inc_lport = th.th_sport; + inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; + inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_flags |= INC_ISIPV6; + INP_INFO_WLOCK(&V_tcbinfo); + syncache_unreach(&inc, &th); + INP_INFO_WUNLOCK(&V_tcbinfo); + } else + in6_pcbnotify(&V_tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, + 0, cmd, NULL, notify); +} +#endif /* INET6 */ + + +/* + * Following is where TCP initial sequence number generation occurs. + * + * There are two places where we must use initial sequence numbers: + * 1. In SYN-ACK packets. + * 2. In SYN packets. + * + * All ISNs for SYN-ACK packets are generated by the syncache. See + * tcp_syncache.c for details. + * + * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling + * depends on this property. In addition, these ISNs should be + * unguessable so as to prevent connection hijacking. To satisfy + * the requirements of this situation, the algorithm outlined in + * RFC 1948 is used, with only small modifications. + * + * Implementation details: + * + * Time is based off the system timer, and is corrected so that it + * increases by one megabyte per second. This allows for proper + * recycling on high speed LANs while still leaving over an hour + * before rollover. + * + * As reading the *exact* system time is too expensive to be done + * whenever setting up a TCP connection, we increment the time + * offset in two ways. First, a small random positive increment + * is added to isn_offset for each connection that is set up. + * Second, the function tcp_isn_tick fires once per clock tick + * and increments isn_offset as necessary so that sequence numbers + * are incremented at approximately ISN_BYTES_PER_SECOND. The + * random positive increments serve only to ensure that the same + * exact sequence number is never sent out twice (as could otherwise + * happen when a port is recycled in less than the system tick + * interval.) + * + * net.inet.tcp.isn_reseed_interval controls the number of seconds + * between seeding of isn_secret. This is normally set to zero, + * as reseeding should not be necessary. + * + * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, + * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In + * general, this means holding an exclusive (write) lock. 
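[Editor's note] A quick check of the rollover claim in the comment above: a 32-bit sequence space consumed at ISN_BYTES_PER_SECOND (1 MiB per second) lasts 2^32 / 2^20 = 4096 seconds, a little over 68 minutes. A trivial verification, not part of the patch:

#include <stdio.h>

int
main(void)
{
    const unsigned long long seq_space = 1ULL << 32;
    const unsigned long long bytes_per_sec = 1048576;  /* ISN_BYTES_PER_SECOND */
    unsigned long long secs = seq_space / bytes_per_sec;

    /* Prints: rollover after 4096 s (68.3 min) */
    printf("rollover after %llu s (%.1f min)\n", secs, (double)secs / 60.0);
    return (0);
}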
+ */ + +#define ISN_BYTES_PER_SECOND 1048576 +#define ISN_STATIC_INCREMENT 4096 +#define ISN_RANDOM_INCREMENT (4096 - 1) + +static VNET_DEFINE(u_char, isn_secret[32]); +static VNET_DEFINE(int, isn_last_reseed); +static VNET_DEFINE(u_int32_t, isn_offset); +static VNET_DEFINE(u_int32_t, isn_offset_old); + +#define V_isn_secret VNET(isn_secret) +#define V_isn_last_reseed VNET(isn_last_reseed) +#define V_isn_offset VNET(isn_offset) +#define V_isn_offset_old VNET(isn_offset_old) + +tcp_seq +tcp_new_isn(struct tcpcb *tp) +{ + MD5_CTX isn_ctx; + u_int32_t md5_buffer[4]; + tcp_seq new_isn; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + ISN_LOCK(); + /* Seed if this is the first use, reseed if requested. */ + if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && + (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) + < (u_int)ticks))) { + read_random(&V_isn_secret, sizeof(V_isn_secret)); + V_isn_last_reseed = ticks; + } + + /* Compute the md5 hash and return the ISN. */ + MD5Init(&isn_ctx); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short)); +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr, + sizeof(struct in6_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr, + sizeof(struct in6_addr)); + } else +#endif + { + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr, + sizeof(struct in_addr)); + MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr, + sizeof(struct in_addr)); + } + MD5Update(&isn_ctx, (u_char *) &V_isn_secret, sizeof(V_isn_secret)); + MD5Final((u_char *) &md5_buffer, &isn_ctx); + new_isn = (tcp_seq) md5_buffer[0]; + V_isn_offset += ISN_STATIC_INCREMENT + + (arc4random() & ISN_RANDOM_INCREMENT); + new_isn += V_isn_offset; + ISN_UNLOCK(); + return (new_isn); +} + +/* + * Increment the offset to the next ISN_BYTES_PER_SECOND / 100 boundary + * to keep time flowing at a relatively constant rate. If the random + * increments have already pushed us past the projected offset, do nothing. + */ +static void +tcp_isn_tick(void *xtp) +{ + VNET_ITERATOR_DECL(vnet_iter); + u_int32_t projected_offset; + + VNET_LIST_RLOCK_NOSLEEP(); + ISN_LOCK(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS */ + projected_offset = + V_isn_offset_old + ISN_BYTES_PER_SECOND / 100; + + if (SEQ_GT(projected_offset, V_isn_offset)) + V_isn_offset = projected_offset; + + V_isn_offset_old = V_isn_offset; + CURVNET_RESTORE(); + } + ISN_UNLOCK(); + VNET_LIST_RUNLOCK_NOSLEEP(); + callout_reset(&isn_callout, hz/100, tcp_isn_tick, NULL); +} + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +struct inpcb * +tcp_drop_syn_sent(struct inpcb *inp, int errno) +{ + struct tcpcb *tp; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + if ((inp->inp_flags & INP_TIMEWAIT) || + (inp->inp_flags & INP_DROPPED)) + return (inp); + + tp = intotcpcb(inp); + if (tp->t_state != TCPS_SYN_SENT) + return (inp); + + tp = tcp_drop(tp, errno); + if (tp != NULL) + return (inp); + else + return (NULL); +} + +/* + * When `need fragmentation' ICMP is received, update our idea of the MSS + * based on the new value in the route. Also nudge TCP to send something, + * since we know the packet we just sent was dropped. 
+ * This duplicates some code in the tcp_mss() function in tcp_input.c. + */ +struct inpcb * +tcp_mtudisc(struct inpcb *inp, int errno) +{ + struct tcpcb *tp; + struct socket *so; + + INP_WLOCK_ASSERT(inp); + if ((inp->inp_flags & INP_TIMEWAIT) || + (inp->inp_flags & INP_DROPPED)) + return (inp); + + tp = intotcpcb(inp); + KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); + + tcp_mss_update(tp, -1, NULL, NULL); + + so = inp->inp_socket; + SOCKBUF_LOCK(&so->so_snd); + /* If the mss is larger than the socket buffer, decrease the mss. */ + if (so->so_snd.sb_hiwat < tp->t_maxseg) + tp->t_maxseg = so->so_snd.sb_hiwat; + SOCKBUF_UNLOCK(&so->so_snd); + + TCPSTAT_INC(tcps_mturesent); + tp->t_rtttime = 0; + tp->snd_nxt = tp->snd_una; + tcp_free_sackholes(tp); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_SACK_PERMIT) + EXIT_FASTRECOVERY(tp); + tcp_output_send(tp); + return (inp); +} + +/* + * Look-up the routing entry to the peer of this inpcb. If no route + * is found and it cannot be allocated, then return 0. This routine + * is called by TCP routines that access the rmx structure and by + * tcp_mss_update to get the peer/interface MTU. + */ +u_long +tcp_maxmtu(struct in_conninfo *inc, int *flags) +{ + struct route sro; + struct sockaddr_in *dst; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); + + bzero(&sro, sizeof(sro)); + if (inc->inc_faddr.s_addr != INADDR_ANY) { + dst = (struct sockaddr_in *)&sro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = inc->inc_faddr; + in_rtalloc_ign(&sro, 0, inc->inc_fibnum); + } + if (sro.ro_rt != NULL) { + ifp = sro.ro_rt->rt_ifp; + if (sro.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = ifp->if_mtu; + else + maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO4 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } + RTFREE(sro.ro_rt); + } + return (maxmtu); +} + +#ifdef INET6 +u_long +tcp_maxmtu6(struct in_conninfo *inc, int *flags) +{ + struct route_in6 sro6; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); + + bzero(&sro6, sizeof(sro6)); + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { + sro6.ro_dst.sin6_family = AF_INET6; + sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); + sro6.ro_dst.sin6_addr = inc->inc6_faddr; + rtalloc_ign((struct route *)&sro6, 0); + } + if (sro6.ro_rt != NULL) { + ifp = sro6.ro_rt->rt_ifp; + if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); + else + maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, + IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + + /* Report additional interface capabilities. */ + if (flags != NULL) { + if (ifp->if_capenable & IFCAP_TSO6 && + ifp->if_hwassist & CSUM_TSO) + *flags |= CSUM_TSO; + } + RTFREE(sro6.ro_rt); + } + + return (maxmtu); +} +#endif /* INET6 */ + +#ifdef IPSEC +/* compute ESP/AH header size for TCP, including outer IP header. 
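[Editor's note] Two clamps drive the code that follows: tcp_maxmtu() takes the smaller of the per-route and interface MTUs (a route MTU of zero means no override), and tcp_mtudisc() never lets the segment size exceed the send buffer high-water mark. A minimal sketch of both rules, not part of the patch, with hypothetical helper names:

static unsigned long
effective_mtu(unsigned long route_mtu, unsigned long if_mtu)
{
    if (route_mtu == 0)    /* no per-route override configured */
        return (if_mtu);
    return (route_mtu < if_mtu ? route_mtu : if_mtu);
}

static unsigned long
clamp_maxseg(unsigned long maxseg, unsigned long sb_hiwat)
{
    /* A segment larger than the send buffer could never be filled. */
    return (maxseg > sb_hiwat ? sb_hiwat : maxseg);
}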
*/ +size_t +ipsec_hdrsiz_tcp(struct tcpcb *tp) +{ + struct inpcb *inp; + struct mbuf *m; + size_t hdrsiz; + struct ip *ip; +#ifdef INET6 + struct ip6_hdr *ip6; +#endif + struct tcphdr *th; + + if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) + return (0); + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) + return (0); + +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV6) != 0) { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + m->m_pkthdr.len = m->m_len = + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + tcpip_fillheaders(inp, ip6, th); + hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } else +#endif /* INET6 */ + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr); + tcpip_fillheaders(inp, ip, th); + hdrsiz = ipsec_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp); + } + + m_free(m); + return (hdrsiz); +} +#endif /* IPSEC */ + +/* + * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING + * + * This code attempts to calculate the bandwidth-delay product as a + * means of determining the optimal window size to maximize bandwidth, + * minimize RTT, and avoid the over-allocation of buffers on interfaces and + * routers. This code also does a fairly good job keeping RTTs in check + * across slow links like modems. We implement an algorithm which is very + * similar to (but not meant to be) TCP/Vegas. The code operates on the + * transmitter side of a TCP connection and so only affects the transmit + * side of the connection. + * + * BACKGROUND: TCP makes no provision for the management of buffer space + * at the end points or at the intermediate routers and switches. A TCP + * stream, whether using NewReno or not, will eventually buffer as + * many packets as it is able and the only reason this typically works is + * due to the fairly small default buffers made available for a connection + * (typically 16K or 32K). As machines use larger windows and/or window + * scaling it is now fairly easy for even a single TCP connection to blow out + * all available buffer space not only on the local interface, but on + * intermediate routers and switches as well. NewReno makes a misguided + * attempt to 'solve' this problem by waiting for an actual failure to occur, + * then backing off, then steadily increasing the window again until another + * failure occurs, ad-infinitum. This results in terrible oscillation that + * is only made worse as network loads increase and the idea of intentionally + * blowing out network buffers is, frankly, a terrible way to manage network + * resources. + * + * It is far better to limit the transmit window prior to the failure + * condition being achieved. There are two general ways to do this: First + * you can 'scan' through different transmit window sizes and locate the + * point where the RTT stops increasing, indicating that you have filled the + * pipe, then scan backwards until you note that RTT stops decreasing, then + * repeat ad-infinitum. This method works in principle but has severe + * implementation issues due to RTT variances, timer granularity, and + * instability in the algorithm which can lead to many false positives and + * create oscillations as well as interact badly with other TCP streams + * implementing the same algorithm. + * + * The second method is to limit the window to the bandwidth delay product + * of the link. This is the method we implement. RTT variances and our + * own manipulation of the congestion window, bwnd, can potentially + * destabilize the algorithm.
For this reason we have to stabilize the + * elements used to calculate the window. We do this by using the minimum + * observed RTT, the long term average of the observed bandwidth, and + * by adding two segments worth of slop. It isn't perfect but it is able + * to react to changing conditions and gives us a very stable basis on + * which to extend the algorithm. + */ +void +tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) +{ + u_long bw; + u_long bwnd; + int save_ticks; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * If inflight_enable is disabled in the middle of a tcp connection, + * make sure snd_bwnd is effectively disabled. + */ + if (V_tcp_inflight_enable == 0 || + tp->t_rttlow < V_tcp_inflight_rttthresh) { + tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; + tp->snd_bandwidth = 0; + return; + } + + /* + * Figure out the bandwidth. Due to the tick granularity this + * is a very rough number and it MUST be averaged over a fairly + * long period of time. XXX we need to take into account a link + * that is not using all available bandwidth, but for now our + * slop will ramp us up if this case occurs and the bandwidth later + * increases. + * + * Note: if ticks rolls over 'bw' may wind up negative. We must + * effectively reset t_bw_rtttime for this case. + */ + save_ticks = ticks; + if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) + return; + + bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / + (save_ticks - tp->t_bw_rtttime); + tp->t_bw_rtttime = save_ticks; + tp->t_bw_rtseq = ack_seq; + if (tp->t_bw_rtttime == 0 || (int)bw < 0) + return; + bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; + + tp->snd_bandwidth = bw; + + /* + * Calculate the semi-static bandwidth delay product, plus two maximal + * segments. The additional slop puts us squarely in the sweet + * spot and also handles the bandwidth run-up case and stabilization. + * Without the slop we could be locking ourselves into a lower + * bandwidth. + * + * Situations Handled: + * (1) Prevents over-queueing of packets on LANs, especially on + * high speed LANs, allowing larger TCP buffers to be + * specified, and also does a good job preventing + * over-queueing of packets over choke points like modems + * (at least for the transmit side). + * + * (2) Is able to handle changing network loads (bandwidth + * drops so bwnd drops, bandwidth increases so bwnd + * increases). + * + * (3) Theoretically should stabilize in the face of multiple + * connections implementing the same algorithm (this may need + * a little work). + * + * (4) Stability value (defaults to 20 = 2 maximal packets) can + * be adjusted with a sysctl but typically only needs to be adjusted + * on very slow connections. A value no smaller than 5 + * should be used, but only reduce this default if you have + * no other choice.
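[Editor's note] The window computed below is essentially bwnd = bw * rtt + slop, where the kernel keeps srtt in scaled ticks. A worked sketch in plain seconds with illustrative numbers, not part of the patch and not the kernel's fixed-point arithmetic; the helper name is hypothetical:

#include <stdio.h>

static unsigned long
bdp_window(unsigned long bw_bytes_per_sec, double rtt_sec,
    unsigned long maxseg, unsigned stab_tenths_of_seg)
{
    /* bwnd = bw * rtt + (stab / 10) segments of slop */
    unsigned long bwnd = (unsigned long)(bw_bytes_per_sec * rtt_sec) +
        (unsigned long)maxseg * stab_tenths_of_seg / 10;

    if (bwnd < 2 * maxseg)    /* never below two maximal segments */
        bwnd = 2 * maxseg;
    return (bwnd);
}

int
main(void)
{
    /* 10 Mbit/s path, 40 ms RTT, 1460-byte segments, stab = 20. */
    printf("bwnd = %lu bytes\n", bdp_window(1250000, 0.040, 1460, 20));
    /* -> 50000 + 2920 = 52920 bytes, well above the 2920-byte floor */
    return (0);
}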
+ */ +#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) + bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + V_tcp_inflight_stab * tp->t_maxseg / 10; +#undef USERTT + + if (tcp_inflight_debug > 0) { + static int ltime; + if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { + ltime = ticks; + printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", + tp, + bw, + tp->t_rttbest, + tp->t_srtt, + bwnd + ); + } + } + if ((long)bwnd < V_tcp_inflight_min) + bwnd = V_tcp_inflight_min; + if (bwnd > V_tcp_inflight_max) + bwnd = V_tcp_inflight_max; + if ((long)bwnd < tp->t_maxseg * 2) + bwnd = tp->t_maxseg * 2; + tp->snd_bwnd = bwnd; +} + +#ifdef TCP_SIGNATURE +/* + * Callback function invoked by m_apply() to digest TCP segment data + * contained within an mbuf chain. + */ +static int +tcp_signature_apply(void *fstate, void *data, u_int len) +{ + + MD5Update(fstate, (u_char *)data, len); + return (0); +} + +/* + * Compute TCP-MD5 hash of a TCP segment. (RFC2385) + * + * Parameters: + * m pointer to head of mbuf chain + * _unused + * len length of TCP segment data, excluding options + * optlen length of TCP segment options + * buf pointer to storage for computed MD5 digest + * direction direction of flow (IPSEC_DIR_INBOUND or OUTBOUND) + * + * We do this over ip, tcphdr, segment data, and the key in the SADB. + * When called from tcp_input(), we can be sure that th_sum has been + * zeroed out and verified already. + * + * Return 0 if successful, otherwise return -1. + * + * XXX The key is retrieved from the system's PF_KEY SADB, by keying a + * search with the destination IP address, and a 'magic SPI' to be + * determined by the application. This is hardcoded elsewhere to 1179 + * right now. Another branch of this code exists which uses the SPD to + * specify per-application flows but it is unstable. + */ +int +tcp_signature_compute(struct mbuf *m, int _unused, int len, int optlen, + u_char *buf, u_int direction) +{ + union sockaddr_union dst; + struct ippseudo ippseudo; + MD5_CTX ctx; + int doff; + struct ip *ip; + struct ipovly *ipovly; + struct secasvar *sav; + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6; + struct in6_addr in6; + char ip6buf[INET6_ADDRSTRLEN]; + uint32_t plen; + uint16_t nhdr; +#endif + u_short savecsum; + + KASSERT(m != NULL, ("NULL mbuf chain")); + KASSERT(buf != NULL, ("NULL signature pointer")); + + /* Extract the destination from the IP header in the mbuf. */ + bzero(&dst, sizeof(union sockaddr_union)); + ip = mtod(m, struct ip *); +#ifdef INET6 + ip6 = NULL; /* Make the compiler happy. */ +#endif + switch (ip->ip_v) { + case IPVERSION: + dst.sa.sa_len = sizeof(struct sockaddr_in); + dst.sa.sa_family = AF_INET; + dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ? + ip->ip_src : ip->ip_dst; + break; +#ifdef INET6 + case (IPV6_VERSION >> 4): + ip6 = mtod(m, struct ip6_hdr *); + dst.sa.sa_len = sizeof(struct sockaddr_in6); + dst.sa.sa_family = AF_INET6; + dst.sin6.sin6_addr = (direction == IPSEC_DIR_INBOUND) ? + ip6->ip6_src : ip6->ip6_dst; + break; +#endif + default: + return (EINVAL); + /* NOTREACHED */ + break; + } + + /* Look up an SADB entry which matches the address of the peer. */ + sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI)); + if (sav == NULL) { + ipseclog((LOG_ERR, "%s: SADB lookup failed for %s\n", __func__, + (ip->ip_v == IPVERSION) ? inet_ntoa(dst.sin.sin_addr) : +#ifdef INET6 + (ip->ip_v == (IPV6_VERSION >> 4)) ? 
+ ip6_sprintf(ip6buf, &dst.sin6.sin6_addr) : +#endif + "(unsupported)")); + return (EINVAL); + } + + MD5Init(&ctx); + /* + * Step 1: Update MD5 hash with IP(v6) pseudo-header. + * + * XXX The ippseudo header MUST be digested in network byte order, + * or else we'll fail the regression test. Assume all fields we've + * been doing arithmetic on have been in host byte order. + * XXX One cannot depend on ipovly->ih_len here. When called from + * tcp_output(), the underlying ip_len member has not yet been set. + */ + switch (ip->ip_v) { + case IPVERSION: + ipovly = (struct ipovly *)ip; + ippseudo.ippseudo_src = ipovly->ih_src; + ippseudo.ippseudo_dst = ipovly->ih_dst; + ippseudo.ippseudo_pad = 0; + ippseudo.ippseudo_p = IPPROTO_TCP; + ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + + optlen); + MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo)); + + th = (struct tcphdr *)((u_char *)ip + sizeof(struct ip)); + doff = sizeof(struct ip) + sizeof(struct tcphdr) + optlen; + break; +#ifdef INET6 + /* + * RFC 2385, 2.0 Proposal + * For IPv6, the pseudo-header is as described in RFC 2460, namely the + * 128-bit source IPv6 address, 128-bit destination IPv6 address, zero- + * extended next header value (to form 32 bits), and 32-bit segment + * length. + * Note: Upper-Layer Packet Length comes before Next Header. + */ + case (IPV6_VERSION >> 4): + in6 = ip6->ip6_src; + in6_clearscope(&in6); + MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); + in6 = ip6->ip6_dst; + in6_clearscope(&in6); + MD5Update(&ctx, (char *)&in6, sizeof(struct in6_addr)); + plen = htonl(len + sizeof(struct tcphdr) + optlen); + MD5Update(&ctx, (char *)&plen, sizeof(uint32_t)); + nhdr = 0; + MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); + MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); + MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); + nhdr = IPPROTO_TCP; + MD5Update(&ctx, (char *)&nhdr, sizeof(uint8_t)); + + th = (struct tcphdr *)((u_char *)ip6 + sizeof(struct ip6_hdr)); + doff = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + optlen; + break; +#endif + default: + return (EINVAL); + /* NOTREACHED */ + break; + } + + + /* + * Step 2: Update MD5 hash with TCP header, excluding options. + * The TCP checksum must be set to zero. + */ + savecsum = th->th_sum; + th->th_sum = 0; + MD5Update(&ctx, (char *)th, sizeof(struct tcphdr)); + th->th_sum = savecsum; + + /* + * Step 3: Update MD5 hash with TCP segment data. + * Use m_apply() to avoid an early m_pullup(). + */ + if (len > 0) + m_apply(m, doff, len, tcp_signature_apply, &ctx); + + /* + * Step 4: Update MD5 hash with shared secret. + */ + MD5Update(&ctx, sav->key_auth->key_data, _KEYLEN(sav->key_auth)); + MD5Final(buf, &ctx); + + key_sa_recordxfer(sav, m); + KEY_FREESAV(&sav); + return (0); +} +#endif /* TCP_SIGNATURE */ + +static int +sysctl_drop(SYSCTL_HANDLER_ARGS) +{ + /* addrs[0] is a foreign socket, addrs[1] is a local one. 
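[Editor's note] The four MD5Update() steps above fix the RFC 2385 digest order: pseudo-header, TCP header with a zeroed checksum, segment payload, then the shared key. A hedged standalone sketch of the same order, not part of the patch, assuming FreeBSD's libmd (link with -lmd), a TCP header of at most 60 bytes, and all inputs already in network byte order:

#include <sys/types.h>
#include <md5.h>
#include <stdint.h>
#include <string.h>

struct pseudo_hdr {        /* mirrors struct ippseudo, 12 bytes */
    uint32_t src, dst;
    uint8_t pad, proto;
    uint16_t len;          /* TCP header + options + payload */
};

static void
tcp_md5_digest(const struct pseudo_hdr *ph, const uint8_t *tcp_hdr,
    size_t tcp_hdrlen, const uint8_t *payload, size_t paylen,
    const uint8_t *key, size_t keylen, uint8_t digest[16])
{
    MD5_CTX ctx;
    uint8_t th[60];        /* max TCP header incl. options */

    /* Digest a copy of the TCP header with the checksum zeroed. */
    memcpy(th, tcp_hdr, tcp_hdrlen);
    th[16] = th[17] = 0;   /* th_sum lives at offset 16 in the header */

    MD5Init(&ctx);
    MD5Update(&ctx, (const void *)ph, sizeof(*ph));    /* step 1 */
    MD5Update(&ctx, th, tcp_hdrlen);                   /* step 2 */
    if (paylen > 0)
        MD5Update(&ctx, payload, paylen);              /* step 3 */
    MD5Update(&ctx, key, keylen);                      /* step 4 */
    MD5Final(digest, &ctx);
}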
*/ + struct sockaddr_storage addrs[2]; + struct inpcb *inp; + struct tcpcb *tp; + struct tcptw *tw; + struct sockaddr_in *fin, *lin; +#ifdef INET6 + struct sockaddr_in6 *fin6, *lin6; +#endif + int error; + + inp = NULL; + fin = lin = NULL; +#ifdef INET6 + fin6 = lin6 = NULL; +#endif + error = 0; + + if (req->oldptr != NULL || req->oldlen != 0) + return (EINVAL); + if (req->newptr == NULL) + return (EPERM); + if (req->newlen < sizeof(addrs)) + return (ENOMEM); + error = SYSCTL_IN(req, &addrs, sizeof(addrs)); + if (error) + return (error); + + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + fin6 = (struct sockaddr_in6 *)&addrs[0]; + lin6 = (struct sockaddr_in6 *)&addrs[1]; + if (fin6->sin6_len != sizeof(struct sockaddr_in6) || + lin6->sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { + if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) + return (EINVAL); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + break; + } + error = sa6_embedscope(fin6, V_ip6_use_defzone); + if (error) + return (error); + error = sa6_embedscope(lin6, V_ip6_use_defzone); + if (error) + return (error); + break; +#endif + case AF_INET: + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + if (fin->sin_len != sizeof(struct sockaddr_in) || + lin->sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + break; + default: + return (EINVAL); + } + INP_INFO_WLOCK(&V_tcbinfo); + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + inp = in6_pcblookup_hash(&V_tcbinfo, &fin6->sin6_addr, + fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, 0, + NULL); + break; +#endif + case AF_INET: + inp = in_pcblookup_hash(&V_tcbinfo, fin->sin_addr, + fin->sin_port, lin->sin_addr, lin->sin_port, 0, NULL); + break; + } + if (inp != NULL) { + INP_WLOCK(inp); + if (inp->inp_flags & INP_TIMEWAIT) { + /* + * XXXRW: There currently exists a state where an + * inpcb is present, but its timewait state has been + * discarded. For now, don't allow dropping of this + * type of inpcb. + */ + tw = intotw(inp); + if (tw != NULL) + tcp_twclose(tw, 0); + else + INP_WUNLOCK(inp); + } else if (!(inp->inp_flags & INP_DROPPED) && + !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { + tp = intotcpcb(inp); + tp = tcp_drop(tp, ECONNABORTED); + if (tp != NULL) + INP_WUNLOCK(inp); + } else + INP_WUNLOCK(inp); + } else + error = ESRCH; + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, + CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, + 0, sysctl_drop, "", "Drop TCP connection"); + +/* + * Generate a standardized TCP log line for use throughout the + * tcp subsystem. Memory allocation is done with M_NOWAIT to + * allow use in the interrupt context. + * + * NB: The caller MUST free(s, M_TCPLOG) the returned string. + * NB: The function may return NULL if memory allocation failed. + * + * Due to header inclusion and ordering limitations the struct ip + * and ip6_hdr pointers have to be passed as void pointers. + */ +char * +tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, + const void *ip6hdr) +{ + + /* Is logging enabled? 
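[Editor's note] The sysctl_drop() handler above is what tcpdrop(8) drives from userland: it accepts no old data and expects two sockaddrs, foreign endpoint first, in the new buffer. A hedged sketch of such a caller, not part of the patch; IPv4 only, requires privilege, error handling abbreviated:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

static int
drop_connection(const char *faddr, uint16_t fport,
    const char *laddr, uint16_t lport)
{
    struct sockaddr_storage addrs[2];
    struct sockaddr_in *fin = (struct sockaddr_in *)&addrs[0];
    struct sockaddr_in *lin = (struct sockaddr_in *)&addrs[1];

    memset(addrs, 0, sizeof(addrs));
    fin->sin_family = lin->sin_family = AF_INET;
    fin->sin_len = lin->sin_len = sizeof(struct sockaddr_in);
    inet_pton(AF_INET, faddr, &fin->sin_addr);
    fin->sin_port = htons(fport);
    inet_pton(AF_INET, laddr, &lin->sin_addr);
    lin->sin_port = htons(lport);

    /* oldptr must be NULL: the handler is write-only. */
    return (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
        addrs, sizeof(addrs)));
}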
*/ + if (tcp_log_in_vain == 0) + return (NULL); + + return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); +} + +char * +tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, + const void *ip6hdr) +{ + + /* Is logging enabled? */ + if (tcp_log_debug == 0) + return (NULL); + + return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); +} + +static char * +tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, void *ip4hdr, + const void *ip6hdr) +{ + char *s, *sp; + size_t size; + struct ip *ip; +#ifdef INET6 + const struct ip6_hdr *ip6; + + ip6 = (const struct ip6_hdr *)ip6hdr; +#endif /* INET6 */ + ip = (struct ip *)ip4hdr; + + /* + * The log line looks like this: + * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" + */ + size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + + sizeof(PRINT_TH_FLAGS) + 1 + +#ifdef INET6 + 2 * INET6_ADDRSTRLEN; +#else + 2 * INET_ADDRSTRLEN; +#endif /* INET6 */ + + s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); + if (s == NULL) + return (NULL); + + strcat(s, "TCP: ["); + sp = s + strlen(s); + + if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { + inet_ntoa_r(inc->inc_faddr, sp); + sp = s + strlen(s); + sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); + sp = s + strlen(s); + inet_ntoa_r(inc->inc_laddr, sp); + sp = s + strlen(s); + sprintf(sp, "]:%i", ntohs(inc->inc_lport)); +#ifdef INET6 + } else if (inc) { + ip6_sprintf(sp, &inc->inc6_faddr); + sp = s + strlen(s); + sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); + sp = s + strlen(s); + ip6_sprintf(sp, &inc->inc6_laddr); + sp = s + strlen(s); + sprintf(sp, "]:%i", ntohs(inc->inc_lport)); + } else if (ip6 && th) { + ip6_sprintf(sp, &ip6->ip6_src); + sp = s + strlen(s); + sprintf(sp, "]:%i to [", ntohs(th->th_sport)); + sp = s + strlen(s); + ip6_sprintf(sp, &ip6->ip6_dst); + sp = s + strlen(s); + sprintf(sp, "]:%i", ntohs(th->th_dport)); +#endif /* INET6 */ + } else if (ip && th) { + inet_ntoa_r(ip->ip_src, sp); + sp = s + strlen(s); + sprintf(sp, "]:%i to [", ntohs(th->th_sport)); + sp = s + strlen(s); + inet_ntoa_r(ip->ip_dst, sp); + sp = s + strlen(s); + sprintf(sp, "]:%i", ntohs(th->th_dport)); + } else { + free(s, M_TCPLOG); + return (NULL); + } + sp = s + strlen(s); + if (th) + sprintf(sp, " tcpflags 0x%b", th->th_flags, PRINT_TH_FLAGS); + if (*(s + size - 1) != '\0') + panic("%s: string too long", __func__); + return (s); +} diff --git a/freebsd/sys/netinet/tcp_syncache.c b/freebsd/sys/netinet/tcp_syncache.c new file mode 100644 index 00000000..78790cc8 --- /dev/null +++ b/freebsd/sys/netinet/tcp_syncache.c @@ -0,0 +1,1823 @@ +#include + +/*- + * Copyright (c) 2001 McAfee, Inc. + * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Jonathan Lemon + * and McAfee Research, the Security Research Division of McAfee, Inc. under + * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the + * DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for proc0 declaration */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif + +#ifdef IPSEC +#include +#ifdef INET6 +#include +#endif +#include +#endif /*IPSEC*/ + +#include + +#include + +static VNET_DEFINE(int, tcp_syncookies) = 1; +#define V_tcp_syncookies VNET(tcp_syncookies) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, + &VNET_NAME(tcp_syncookies), 0, + "Use TCP SYN cookies if the syncache overflows"); + +static VNET_DEFINE(int, tcp_syncookiesonly) = 0; +#define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, + &VNET_NAME(tcp_syncookiesonly), 0, + "Use only TCP SYN cookies"); + +#ifdef TCP_OFFLOAD_DISABLE +#define TOEPCB_ISSET(sc) (0) +#else +#define TOEPCB_ISSET(sc) ((sc)->sc_toepcb != NULL) +#endif + +static void syncache_drop(struct syncache *, struct syncache_head *); +static void syncache_free(struct syncache *); +static void syncache_insert(struct syncache *, struct syncache_head *); +struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **); +static int syncache_respond(struct syncache *); +static struct socket *syncache_socket(struct syncache *, struct socket *, + struct mbuf *m); +static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, + int docallout); +static void syncache_timer(void *); +static void syncookie_generate(struct syncache_head *, struct syncache *, + u_int32_t *); +static struct syncache + *syncookie_lookup(struct in_conninfo *, struct syncache_head *, + struct syncache *, struct tcpopt *, struct tcphdr *, + struct socket *); + +/* + * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. + * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, + * the odds are that the user has given up attempting to connect by then. 
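[Editor's note] A quick check of the 45-second figure in the comment above, assuming the traditional 3-second TCPTV_RTOBASE and a doubling backoff per retransmit; not part of the patch:

#include <stdio.h>

int
main(void)
{
    const int rtobase_sec = 3;    /* TCPTV_RTOBASE in seconds */
    const int maxrexmts = 3;      /* SYNCACHE_MAXREXMTS */
    int total = 0, shift;

    /* Initial SYN|ACK plus 3 retransmits: 3 + 6 + 12 + 24 seconds. */
    for (shift = 0; shift <= maxrexmts; shift++)
        total += rtobase_sec << shift;
    printf("give up after %d s\n", total);    /* prints 45 */
    return (0);
}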
+ */ +#define SYNCACHE_MAXREXMTS 3 + +/* Arbitrary values */ +#define TCP_SYNCACHE_HASHSIZE 512 +#define TCP_SYNCACHE_BUCKETLIMIT 30 + +static VNET_DEFINE(struct tcp_syncache, tcp_syncache); +#define V_tcp_syncache VNET(tcp_syncache) + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); + +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, + &VNET_NAME(tcp_syncache.bucket_limit), 0, + "Per-bucket hash limit for syncache"); + +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, + &VNET_NAME(tcp_syncache.cache_limit), 0, + "Overall entry limit for syncache"); + +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD, + &VNET_NAME(tcp_syncache.cache_count), 0, + "Current number of entries in syncache"); + +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, + &VNET_NAME(tcp_syncache.hashsize), 0, + "Size of TCP syncache hashtable"); + +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, + &VNET_NAME(tcp_syncache.rexmt_limit), 0, + "Limit on SYN/ACK retransmissions"); + +VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; +SYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, + CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, + "Send reset on socket allocation failure"); + +static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); + +#define SYNCACHE_HASH(inc, mask) \ + ((V_tcp_syncache.hash_secret ^ \ + (inc)->inc_faddr.s_addr ^ \ + ((inc)->inc_faddr.s_addr >> 16) ^ \ + (inc)->inc_fport ^ (inc)->inc_lport) & mask) + +#define SYNCACHE_HASH6(inc, mask) \ + ((V_tcp_syncache.hash_secret ^ \ + (inc)->inc6_faddr.s6_addr32[0] ^ \ + (inc)->inc6_faddr.s6_addr32[3] ^ \ + (inc)->inc_fport ^ (inc)->inc_lport) & mask) + +#define ENDPTS_EQ(a, b) ( \ + (a)->ie_fport == (b)->ie_fport && \ + (a)->ie_lport == (b)->ie_lport && \ + (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ + (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ +) + +#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) + +#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) +#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) +#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) + +/* + * Requires the syncache entry to be already removed from the bucket list. + */ +static void +syncache_free(struct syncache *sc) +{ + + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); + if (sc->sc_cred) + crfree(sc->sc_cred); +#ifdef MAC + mac_syncache_destroy(&sc->sc_label); +#endif + + uma_zfree(V_tcp_syncache.zone, sc); +} + +void +syncache_init(void) +{ + int i; + + V_tcp_syncache.cache_count = 0; + V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; + V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; + V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; + V_tcp_syncache.hash_secret = arc4random(); + + TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", + &V_tcp_syncache.hashsize); + TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", + &V_tcp_syncache.bucket_limit); + if (!powerof2(V_tcp_syncache.hashsize) || + V_tcp_syncache.hashsize == 0) { + printf("WARNING: syncache hash size is not a power of 2.\n"); + V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; + } + V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; + + /* Set limits. */ + V_tcp_syncache.cache_limit = + V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; + TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", + &V_tcp_syncache.cache_limit); + + /* Allocate the hash table. 
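[Editor's note] The sizing rules applied in syncache_init() reduce to two constraints: the hash size must be a power of two, so the hash value can be masked instead of reduced modulo, and the overall cache limit defaults to hashsize * bucket_limit. A small sketch of those rules with the defaults used here, not part of the patch; the helper name is hypothetical:

#include <stdio.h>

static unsigned
checked_hashsize(unsigned requested, unsigned fallback)
{
    /* powerof2() equivalent: exactly one bit set. */
    if (requested == 0 || (requested & (requested - 1)) != 0)
        return (fallback);
    return (requested);
}

int
main(void)
{
    unsigned hashsize = checked_hashsize(512, 512);   /* TCP_SYNCACHE_HASHSIZE */
    unsigned bucket_limit = 30;                       /* TCP_SYNCACHE_BUCKETLIMIT */

    printf("hashmask 0x%x, cache_limit %u\n",
        hashsize - 1, hashsize * bucket_limit);       /* 0x1ff, 15360 */
    return (0);
}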
*/ + V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * + sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); + + /* Initialize the hash buckets. */ + for (i = 0; i < V_tcp_syncache.hashsize; i++) { +#ifdef VIMAGE + V_tcp_syncache.hashbase[i].sch_vnet = curvnet; +#endif + TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); + mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", + NULL, MTX_DEF); + callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, + &V_tcp_syncache.hashbase[i].sch_mtx, 0); + V_tcp_syncache.hashbase[i].sch_length = 0; + } + + /* Create the syncache entry zone. */ + V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); +} + +#ifdef VIMAGE +void +syncache_destroy(void) +{ + struct syncache_head *sch; + struct syncache *sc, *nsc; + int i; + + /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ + for (i = 0; i < V_tcp_syncache.hashsize; i++) { + + sch = &V_tcp_syncache.hashbase[i]; + callout_drain(&sch->sch_timer); + + SCH_LOCK(sch); + TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) + syncache_drop(sc, sch); + SCH_UNLOCK(sch); + KASSERT(TAILQ_EMPTY(&sch->sch_bucket), + ("%s: sch->sch_bucket not empty", __func__)); + KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", + __func__, sch->sch_length)); + mtx_destroy(&sch->sch_mtx); + } + + KASSERT(V_tcp_syncache.cache_count == 0, ("%s: cache_count %d not 0", + __func__, V_tcp_syncache.cache_count)); + + /* Free the allocated global resources. */ + uma_zdestroy(V_tcp_syncache.zone); + free(V_tcp_syncache.hashbase, M_SYNCACHE); +} +#endif + +/* + * Inserts a syncache entry into the specified bucket row. + * Locks and unlocks the syncache_head autonomously. + */ +static void +syncache_insert(struct syncache *sc, struct syncache_head *sch) +{ + struct syncache *sc2; + + SCH_LOCK(sch); + + /* + * Make sure that we don't overflow the per-bucket limit. + * If the bucket is full, toss the oldest element. + */ + if (sch->sch_length >= V_tcp_syncache.bucket_limit) { + KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), + ("sch->sch_length incorrect")); + sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); + syncache_drop(sc2, sch); + TCPSTAT_INC(tcps_sc_bucketoverflow); + } + + /* Put it into the bucket. */ + TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); + sch->sch_length++; + + /* Reinitialize the bucket row's timer. */ + if (sch->sch_length == 1) + sch->sch_nextc = ticks + INT_MAX; + syncache_timeout(sc, sch, 1); + + SCH_UNLOCK(sch); + + V_tcp_syncache.cache_count++; + TCPSTAT_INC(tcps_sc_added); +} + +/* + * Remove and free entry from syncache bucket row. + * Expects locked syncache head. + */ +static void +syncache_drop(struct syncache *sc, struct syncache_head *sch) +{ + + SCH_LOCK_ASSERT(sch); + + TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); + sch->sch_length--; + +#ifndef TCP_OFFLOAD_DISABLE + if (sc->sc_tu) + sc->sc_tu->tu_syncache_event(TOE_SC_DROP, sc->sc_toepcb); +#endif + syncache_free(sc); + V_tcp_syncache.cache_count--; +} + +/* + * Engage/reengage time on bucket row. 
+ */ +static void +syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) +{ + sc->sc_rxttime = ticks + + TCPTV_RTOBASE * (tcp_backoff[sc->sc_rxmits]); + sc->sc_rxmits++; + if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { + sch->sch_nextc = sc->sc_rxttime; + if (docallout) + callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, + syncache_timer, (void *)sch); + } +} + +/* + * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. + * If we have retransmitted an entry the maximum number of times, expire it. + * One separate timer for each bucket row. + */ +static void +syncache_timer(void *xsch) +{ + struct syncache_head *sch = (struct syncache_head *)xsch; + struct syncache *sc, *nsc; + int tick = ticks; + char *s; + + CURVNET_SET(sch->sch_vnet); + + /* NB: syncache_head has already been locked by the callout. */ + SCH_LOCK_ASSERT(sch); + + /* + * In the following cycle we may remove some entries and/or + * advance some timeouts, so re-initialize the bucket timer. + */ + sch->sch_nextc = tick + INT_MAX; + + TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { + /* + * We do not check if the listen socket still exists + * and accept the case where the listen socket may be + * gone by the time we resend the SYN/ACK. We do + * not expect this to happen often. If it does, + * then the RST will be sent by the time the remote + * host does the SYN/ACK->ACK. + */ + if (TSTMP_GT(sc->sc_rxttime, tick)) { + if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) + sch->sch_nextc = sc->sc_rxttime; + continue; + } + if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " + "giving up and removing syncache entry\n", + s, __func__); + free(s, M_TCPLOG); + } + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_stale); + continue; + } + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Response timeout, " + "retransmitting (%u) SYN|ACK\n", + s, __func__, sc->sc_rxmits); + free(s, M_TCPLOG); + } + + (void) syncache_respond(sc); + TCPSTAT_INC(tcps_sc_retransmitted); + syncache_timeout(sc, sch, 0); + } + if (!TAILQ_EMPTY(&(sch)->sch_bucket)) + callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, + syncache_timer, (void *)(sch)); + CURVNET_RESTORE(); +} + +/* + * Find an entry in the syncache. + * Always returns with the syncache_head locked, plus a matching entry or NULL. + */ +struct syncache * +syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) +{ + struct syncache *sc; + struct syncache_head *sch; + +#ifdef INET6 + if (inc->inc_flags & INC_ISIPV6) { + sch = &V_tcp_syncache.hashbase[ + SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; + *schp = sch; + + SCH_LOCK(sch); + + /* Circle through bucket row to find matching entry. */ + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { + if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) + return (sc); + } + } else +#endif + { + sch = &V_tcp_syncache.hashbase[ + SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; + *schp = sch; + + SCH_LOCK(sch); + + /* Circle through bucket row to find matching entry.
*/ + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) + continue; +#endif + if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) + return (sc); + } + } + SCH_LOCK_ASSERT(*schp); + return (NULL); /* always returns with locked sch */ +} + +/* + * This function is called when we get a RST for a + * non-existent connection, so that we can see if the + * connection is in the syn cache. If it is, zap it. + */ +void +syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) +{ + struct syncache *sc; + struct syncache_head *sch; + char *s = NULL; + + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + SCH_LOCK_ASSERT(sch); + + /* + * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. + * See RFC 793 page 65, section SEGMENT ARRIVES. + */ + if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " + "FIN flag set, segment ignored\n", s, __func__); + TCPSTAT_INC(tcps_badrst); + goto done; + } + + /* + * No corresponding connection was found in syncache. + * If syncookies are enabled and possibly exclusively + * used, or we are under memory pressure, a valid RST + * may not find a syncache entry. In that case we're + * done and no SYN|ACK retransmissions will happen. + * Otherwise the RST was misdirected or spoofed. + */ + if (sc == NULL) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Spurious RST without matching " + "syncache entry (possibly syncookie only), " + "segment ignored\n", s, __func__); + TCPSTAT_INC(tcps_badrst); + goto done; + } + + /* + * If the RST bit is set, check the sequence number to see + * if this is a valid reset segment. + * RFC 793 page 37: + * In all states except SYN-SENT, all reset (RST) segments + * are validated by checking their SEQ-fields. A reset is + * valid if its sequence number is in the window. + * + * The sequence number in the reset segment is normally an + * echo of our outgoing acknowledgement numbers, but some hosts + * send a reset with the sequence number at the rightmost edge + * of our receive window, and we have to handle this case.
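[Editor's note] The SEQ_GEQ()/SEQ_LEQ() tests applied below are serial-number arithmetic: a signed 32-bit subtraction keeps the comparison correct across sequence-space wraparound. A minimal sketch, not part of the patch, in the spirit of the macros in netinet/tcp_seq.h:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq_t;    /* stand-in for tcp_seq */

#define SEQ_GEQ(a, b)    ((int32_t)((a) - (b)) >= 0)
#define SEQ_LEQ(a, b)    ((int32_t)((a) - (b)) <= 0)

static int
rst_seq_in_window(tcp_seq_t seq, tcp_seq_t irs, uint32_t wnd)
{
    /* Accept the RST only inside [irs, irs + wnd], wrap-safe. */
    return (SEQ_GEQ(seq, irs) && SEQ_LEQ(seq, irs + wnd));
}

int
main(void)
{
    /* Works across the 2^32 wrap: irs sits near the top of the space. */
    printf("%d\n", rst_seq_in_window(0x00000010, 0xffffff00, 0x1000));
    return (0);    /* prints 1 */
}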
+ */ + if (SEQ_GEQ(th->th_seq, sc->sc_irs) && + SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { + syncache_drop(sc, sch); + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " + "connection attempt aborted by remote endpoint\n", + s, __func__); + TCPSTAT_INC(tcps_sc_reset); + } else { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " + "IRS %u (+WND %u), segment ignored\n", + s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); + TCPSTAT_INC(tcps_badrst); + } + +done: + if (s != NULL) + free(s, M_TCPLOG); + SCH_UNLOCK(sch); +} + +void +syncache_badack(struct in_conninfo *inc) +{ + struct syncache *sc; + struct syncache_head *sch; + + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + SCH_LOCK_ASSERT(sch); + if (sc != NULL) { + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_badack); + } + SCH_UNLOCK(sch); +} + +void +syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) +{ + struct syncache *sc; + struct syncache_head *sch; + + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + SCH_LOCK_ASSERT(sch); + if (sc == NULL) + goto done; + + /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ + if (ntohl(th->th_seq) != sc->sc_iss) + goto done; + + /* + * If we've retransmitted 3 times and this is our second error, + * we remove the entry. Otherwise, we allow it to continue on. + * This prevents us from incorrectly nuking an entry during a + * spurious network outage. + * + * See tcp_notify(). + */ + if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { + sc->sc_flags |= SCF_UNREACH; + goto done; + } + syncache_drop(sc, sch); + TCPSTAT_INC(tcps_sc_unreach); +done: + SCH_UNLOCK(sch); +} + +/* + * Build a new TCP socket structure from a syncache entry. + */ +static struct socket * +syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) +{ + struct inpcb *inp = NULL; + struct socket *so; + struct tcpcb *tp; + int error = 0; + char *s; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + /* + * Ok, create the full blown connection, and set things up + * as they would have been set up if we had created the + * connection when the SYN arrived. If we can't create + * the connection, abort it. + */ + so = sonewconn(lso, SS_ISCONNECTED); + if (so == NULL) { + /* + * Drop the connection; we will either send a RST or + * have the peer retransmit its SYN again after its + * RTO and try again. + */ + TCPSTAT_INC(tcps_listendrop); + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Socket create failed " + "due to limits or memory shortage\n", + s, __func__); + free(s, M_TCPLOG); + } + goto abort2; + } +#ifdef MAC + mac_socketpeer_set_from_mbuf(m, so); +#endif + + inp = sotoinpcb(so); + inp->inp_inc.inc_fibnum = so->so_fibnum; + INP_WLOCK(inp); + + /* Insert new socket into PCB hash list. */ + inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + inp->in6p_laddr = sc->sc_inc.inc6_laddr; + } else { + inp->inp_vflag &= ~INP_IPV6; + inp->inp_vflag |= INP_IPV4; +#endif + inp->inp_laddr = sc->sc_inc.inc_laddr; +#ifdef INET6 + } +#endif + inp->inp_lport = sc->sc_inc.inc_lport; + if ((error = in_pcbinshash(inp)) != 0) { + /* + * Undo the assignments above if we failed to + * put the PCB on the hash lists.
+ */ +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) + inp->in6p_laddr = in6addr_any; + else +#endif + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_lport = 0; + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " + "with error %i\n", + s, __func__, error); + free(s, M_TCPLOG); + } + goto abort; + } +#ifdef IPSEC + /* Copy old policy into new socket's. */ + if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) + printf("syncache_socket: could not copy policy\n"); +#endif +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + struct inpcb *oinp = sotoinpcb(lso); + struct in6_addr laddr6; + struct sockaddr_in6 sin6; + /* + * Inherit socket options from the listening socket. + * Note that in6p_inputopts are not (and should not be) + * copied, since it stores previously received options and is + * used to detect if each new option is different than the + * previous one and hence should be passed to a user. + * If we copied in6p_inputopts, a user would not be able to + * receive options just after calling the accept system call. + */ + inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; + if (oinp->in6p_outputopts) + inp->in6p_outputopts = + ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); + + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_addr = sc->sc_inc.inc6_faddr; + sin6.sin6_port = sc->sc_inc.inc_fport; + sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; + laddr6 = inp->in6p_laddr; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = sc->sc_inc.inc6_laddr; +#ifndef __rtems__ + if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, + thread0.td_ucred)) != 0) { +#else /* __rtems__ */ + if ((error = in6_pcbconnect(inp, (struct sockaddr *)&sin6, + rtems_bsd_thread0_ucred)) != 0) { +#endif /* __rtems__ */ + inp->in6p_laddr = laddr6; + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " + "with error %i\n", + s, __func__, error); + free(s, M_TCPLOG); + } + goto abort; + } + /* Override flowlabel from in6_pcbconnect. */ + inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; + inp->inp_flow |= sc->sc_flowlabel; + } else +#endif + { + struct in_addr laddr; + struct sockaddr_in sin; + + inp->inp_options = (m) ? 
ip_srcroute(m) : NULL; + + if (inp->inp_options == NULL) { + inp->inp_options = sc->sc_ipopts; + sc->sc_ipopts = NULL; + } + + sin.sin_family = AF_INET; + sin.sin_len = sizeof(sin); + sin.sin_addr = sc->sc_inc.inc_faddr; + sin.sin_port = sc->sc_inc.inc_fport; + bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); + laddr = inp->inp_laddr; + if (inp->inp_laddr.s_addr == INADDR_ANY) + inp->inp_laddr = sc->sc_inc.inc_laddr; +#ifndef __rtems__ + if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin, + thread0.td_ucred)) != 0) { +#else /* __rtems__ */ + if ((error = in_pcbconnect(inp, (struct sockaddr *)&sin, + rtems_bsd_thread0_ucred)) != 0) { +#endif /* __rtems__ */ + + inp->inp_laddr = laddr; + if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " + "with error %i\n", + s, __func__, error); + free(s, M_TCPLOG); + } + goto abort; + } + } + tp = intotcpcb(inp); + tp->t_state = TCPS_SYN_RECEIVED; + tp->iss = sc->sc_iss; + tp->irs = sc->sc_irs; + tcp_rcvseqinit(tp); + tcp_sendseqinit(tp); + tp->snd_wl1 = sc->sc_irs; + tp->snd_max = tp->iss + 1; + tp->snd_nxt = tp->iss + 1; + tp->rcv_up = sc->sc_irs + 1; + tp->rcv_wnd = sc->sc_wnd; + tp->rcv_adv += tp->rcv_wnd; + tp->last_ack_sent = tp->rcv_nxt; + + tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); + if (sc->sc_flags & SCF_NOOPT) + tp->t_flags |= TF_NOOPT; + else { + if (sc->sc_flags & SCF_WINSCALE) { + tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; + tp->snd_scale = sc->sc_requested_s_scale; + tp->request_r_scale = sc->sc_requested_r_scale; + } + if (sc->sc_flags & SCF_TIMESTAMP) { + tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; + tp->ts_recent = sc->sc_tsreflect; + tp->ts_recent_age = ticks; + tp->ts_offset = sc->sc_tsoff; + } +#ifdef TCP_SIGNATURE + if (sc->sc_flags & SCF_SIGNATURE) + tp->t_flags |= TF_SIGNATURE; +#endif + if (sc->sc_flags & SCF_SACK) + tp->t_flags |= TF_SACK_PERMIT; + } + + if (sc->sc_flags & SCF_ECN) + tp->t_flags |= TF_ECN_PERMIT; + + /* + * Set up MSS and get cached values from tcp_hostcache. + * This might overwrite some of the defaults we just set. + */ + tcp_mss(tp, sc->sc_peer_mss); + + /* + * If the SYN,ACK was retransmitted, reset cwnd to 1 segment. + * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. + */ + if (sc->sc_rxmits > 1) + tp->snd_cwnd = tp->t_maxseg; + tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); + + INP_WUNLOCK(inp); + + TCPSTAT_INC(tcps_accepts); + return (so); + +abort: + INP_WUNLOCK(inp); +abort2: + if (so != NULL) + soabort(so); + return (NULL); +} + +/* + * This function gets called when we receive an ACK for a + * socket in the LISTEN state. We look up the connection + * in the syncache, and if it's there, we pull it out of + * the cache and turn it into a full-blown connection in + * the SYN-RECEIVED state. + */ +int +syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct socket **lsop, struct mbuf *m) +{ + struct syncache *sc; + struct syncache_head *sch; + struct syncache scs; + char *s; + + /* + * Global TCP locks are held because we manipulate the PCB lists + * and create a new socket. + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, + ("%s: can handle only ACK", __func__)); + + sc = syncache_lookup(inc, &sch); /* returns locked sch */ + SCH_LOCK_ASSERT(sch); + if (sc == NULL) { + /* + * There is no syncache entry, so see if this ACK is + * a returning syncookie. To do this, first: + * A.
See if this socket has had a syncache entry dropped in + * the past. We don't want to accept a bogus syncookie + * if we've never received a SYN. + * B. Check that the syncookie is valid. If it is, then + * cobble up a fake syncache entry, and return. + */ + if (!V_tcp_syncookies) { + SCH_UNLOCK(sch); + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Spurious ACK, " + "segment rejected (syncookies disabled)\n", + s, __func__); + goto failed; + } + bzero(&scs, sizeof(scs)); + sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop); + SCH_UNLOCK(sch); + if (sc == NULL) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Segment failed " + "SYNCOOKIE authentication, segment rejected " + "(probably spoofed)\n", s, __func__); + goto failed; + } + } else { + /* Pull out the entry to unlock the bucket row. */ + TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); + sch->sch_length--; + V_tcp_syncache.cache_count--; + SCH_UNLOCK(sch); + } + + /* + * Segment validation: + * ACK must match our initial sequence number + 1 (the SYN|ACK). + */ + if (th->th_ack != sc->sc_iss + 1 && !TOEPCB_ISSET(sc)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " + "rejected\n", s, __func__, th->th_ack, sc->sc_iss); + goto failed; + } + + /* + * The SEQ must fall in the window starting at the received + * initial receive sequence number + 1 (the SYN). + */ + if ((SEQ_LEQ(th->th_seq, sc->sc_irs) || + SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) && + !TOEPCB_ISSET(sc)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " + "rejected\n", s, __func__, th->th_seq, sc->sc_irs); + goto failed; + } + + if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: Timestamp not expected, " + "segment rejected\n", s, __func__); + goto failed; + } + /* + * If timestamps were negotiated the reflected timestamp + * must be equal to what we actually sent in the SYN|ACK. + */ + if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts && + !TOEPCB_ISSET(sc)) { + if ((s = tcp_log_addrs(inc, th, NULL, NULL))) + log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " + "segment rejected\n", + s, __func__, to->to_tsecr, sc->sc_ts); + goto failed; + } + + *lsop = syncache_socket(sc, *lsop, m); + + if (*lsop == NULL) + TCPSTAT_INC(tcps_sc_aborted); + else + TCPSTAT_INC(tcps_sc_completed); + +/* how do we find the inp for the new socket? */ + if (sc != &scs) + syncache_free(sc); + return (1); +failed: + if (sc != NULL && sc != &scs) + syncache_free(sc); + if (s != NULL) + free(s, M_TCPLOG); + *lsop = NULL; + return (0); +} + +int +tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, + struct tcphdr *th, struct socket **lsop, struct mbuf *m) +{ + struct tcpopt to; + int rc; + + bzero(&to, sizeof(struct tcpopt)); + to.to_mss = toeo->to_mss; + to.to_wscale = toeo->to_wscale; + to.to_flags = toeo->to_flags; + + INP_INFO_WLOCK(&V_tcbinfo); + rc = syncache_expand(inc, &to, th, lsop, m); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (rc); +} + +/* + * Given a LISTEN socket and an inbound SYN request, add + * this to the syn cache, and send back a segment: + * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> + * to the source. + * + * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. + * Doing so would require that we hold onto the data and deliver it + * to the application.
However, if we are the target of a SYN-flood + * DoS attack, an attacker could send data which would eventually + * consume all available buffer space if it were ACKed. By not ACKing + * the data, we avoid this DoS scenario. + */ +static void +_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, struct socket **lsop, struct mbuf *m, + struct toe_usrreqs *tu, void *toepcb) +{ + struct tcpcb *tp; + struct socket *so; + struct syncache *sc = NULL; + struct syncache_head *sch; + struct mbuf *ipopts = NULL; + u_int32_t flowtmp; + int win, sb_hiwat, ip_ttl, ip_tos, noopt; + char *s; +#ifdef INET6 + int autoflowlabel = 0; +#endif +#ifdef MAC + struct label *maclabel; +#endif + struct syncache scs; + struct ucred *cred; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); /* listen socket */ + KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, + ("%s: unexpected tcp flags", __func__)); + + /* + * Combine all so/tp operations very early to drop the INP lock as + * soon as possible. + */ + so = *lsop; + tp = sototcpcb(so); + cred = crhold(so->so_cred); + +#ifdef INET6 + if ((inc->inc_flags & INC_ISIPV6) && + (inp->inp_flags & IN6P_AUTOFLOWLABEL)) + autoflowlabel = 1; +#endif + ip_ttl = inp->inp_ip_ttl; + ip_tos = inp->inp_ip_tos; + win = sbspace(&so->so_rcv); + sb_hiwat = so->so_rcv.sb_hiwat; + noopt = (tp->t_flags & TF_NOOPT); + + /* By the time we drop the lock these should no longer be used. */ + so = NULL; + tp = NULL; + +#ifdef MAC + if (mac_syncache_init(&maclabel) != 0) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + goto done; + } else + mac_syncache_create(maclabel, inp); +#endif + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + /* + * Remember the IP options, if any. + */ +#ifdef INET6 + if (!(inc->inc_flags & INC_ISIPV6)) +#endif + ipopts = (m) ? ip_srcroute(m) : NULL; + + /* + * See if we already have an entry for this connection. + * If we do, resend the SYN,ACK, and reset the retransmit timer. + * + * XXX: should the syncache be re-initialized with the contents + * of the new SYN here (which may have different options?) + * + * XXX: We do not check the sequence number to see if this is a + * real retransmit or a new connection attempt. The question is + * how to handle such a case; either ignore it as spoofed, or + * drop the current entry and create a new one? + */ + sc = syncache_lookup(inc, &sch); /* returns locked entry */ + SCH_LOCK_ASSERT(sch); + if (sc != NULL) { +#ifndef TCP_OFFLOAD_DISABLE + if (sc->sc_tu) + sc->sc_tu->tu_syncache_event(TOE_SC_ENTRY_PRESENT, + sc->sc_toepcb); +#endif + TCPSTAT_INC(tcps_sc_dupsyn); + if (ipopts) { + /* + * If we were remembering a previous source route, + * forget it and use the new one we've been given. + */ + if (sc->sc_ipopts) + (void) m_free(sc->sc_ipopts); + sc->sc_ipopts = ipopts; + } + /* + * Update timestamp if present. + */ + if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) + sc->sc_tsreflect = to->to_tsval; + else + sc->sc_flags &= ~SCF_TIMESTAMP; +#ifdef MAC + /* + * Since we have already unconditionally allocated label + * storage, free it up. The syncache entry will already + * have an initialized label we can use. + */ + mac_syncache_destroy(&maclabel); +#endif + /* Retransmit SYN|ACK and reset retransmit count. 
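+ * A duplicate SYN is handled just below by resending the SYN|ACK with + * syncache_respond() and restarting the expiry timer with + * syncache_timeout(); no new syncache entry is allocated for it.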
*/ + if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " + "resetting timer and retransmitting SYN|ACK\n", + s, __func__); + free(s, M_TCPLOG); + } + if (!TOEPCB_ISSET(sc) && syncache_respond(sc) == 0) { + sc->sc_rxmits = 0; + syncache_timeout(sc, sch, 1); + TCPSTAT_INC(tcps_sndacks); + TCPSTAT_INC(tcps_sndtotal); + } + SCH_UNLOCK(sch); + goto done; + } + + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); + if (sc == NULL) { + /* + * The zone allocator couldn't provide more entries. + * Treat this as if the cache was full; drop the oldest + * entry and insert the new one. + */ + TCPSTAT_INC(tcps_sc_zonefail); + if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) + syncache_drop(sc, sch); + sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); + if (sc == NULL) { + if (V_tcp_syncookies) { + bzero(&scs, sizeof(scs)); + sc = &scs; + } else { + SCH_UNLOCK(sch); + if (ipopts) + (void) m_free(ipopts); + goto done; + } + } + } + + /* + * Fill in the syncache values. + */ +#ifdef MAC + sc->sc_label = maclabel; +#endif + sc->sc_cred = cred; + cred = NULL; + sc->sc_ipopts = ipopts; + bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); +#ifdef INET6 + if (!(inc->inc_flags & INC_ISIPV6)) +#endif + { + sc->sc_ip_tos = ip_tos; + sc->sc_ip_ttl = ip_ttl; + } +#ifndef TCP_OFFLOAD_DISABLE + sc->sc_tu = tu; + sc->sc_toepcb = toepcb; +#endif + sc->sc_irs = th->th_seq; + sc->sc_iss = arc4random(); + sc->sc_flags = 0; + sc->sc_flowlabel = 0; + + /* + * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. + * win was derived from socket earlier in the function. + */ + win = imax(win, 0); + win = imin(win, TCP_MAXWIN); + sc->sc_wnd = win; + + if (V_tcp_do_rfc1323) { + /* + * A timestamp received in a SYN makes + * it ok to send timestamp requests and replies. + */ + if (to->to_flags & TOF_TS) { + sc->sc_tsreflect = to->to_tsval; + sc->sc_ts = ticks; + sc->sc_flags |= SCF_TIMESTAMP; + } + if (to->to_flags & TOF_SCALE) { + int wscale = 0; + + /* + * Pick the smallest possible scaling factor that + * will still allow us to scale up to sb_max, aka + * kern.ipc.maxsockbuf. + * + * We do this because there are broken firewalls that + * will corrupt the window scale option, leading to + * the other endpoint believing that our advertised + * window is unscaled. At scale factors larger than + * 5 the unscaled window will drop below 1500 bytes, + * leading to serious problems when traversing these + * broken firewalls. + * + * With the default maxsockbuf of 256K, a scale factor + * of 3 will be chosen by this algorithm. Those who + * choose a larger maxsockbuf should watch out + * for the compatibility problems mentioned above. + * + * RFC1323: The Window field in a SYN (i.e., a <SYN> + * or <SYN,ACK>) segment itself is never scaled. + */ + while (wscale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << wscale) < sb_max) + wscale++; + sc->sc_requested_r_scale = wscale; + sc->sc_requested_s_scale = to->to_wscale; + sc->sc_flags |= SCF_WINSCALE; + } + } +#ifdef TCP_SIGNATURE + /* + * If listening socket requested TCP digests, and received SYN + * contains the option, flag this in the syncache so that + * syncache_respond() will do the right thing with the SYN+ACK. + * XXX: Currently we always record the option by default and will + * attempt to use it in syncache_respond().
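+ * + * In syncache_respond() this shows up as TOF_SIGNATURE being set on + * the outgoing options and tcp_signature_compute() being run once + * the SYN|ACK segment has been assembled.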
+ */ + if (to->to_flags & TOF_SIGNATURE) + sc->sc_flags |= SCF_SIGNATURE; +#endif + if (to->to_flags & TOF_SACKPERM) + sc->sc_flags |= SCF_SACK; + if (to->to_flags & TOF_MSS) + sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ + if (noopt) + sc->sc_flags |= SCF_NOOPT; + if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) + sc->sc_flags |= SCF_ECN; + + if (V_tcp_syncookies) { + syncookie_generate(sch, sc, &flowtmp); +#ifdef INET6 + if (autoflowlabel) + sc->sc_flowlabel = flowtmp; +#endif + } else { +#ifdef INET6 + if (autoflowlabel) + sc->sc_flowlabel = + (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); +#endif + } + SCH_UNLOCK(sch); + + /* + * Do a standard 3-way handshake. + */ + if (TOEPCB_ISSET(sc) || syncache_respond(sc) == 0) { + if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) + syncache_free(sc); + else if (sc != &scs) + syncache_insert(sc, sch); /* locks and unlocks sch */ + TCPSTAT_INC(tcps_sndacks); + TCPSTAT_INC(tcps_sndtotal); + } else { + if (sc != &scs) + syncache_free(sc); + TCPSTAT_INC(tcps_sc_dropped); + } + +done: + if (cred != NULL) + crfree(cred); +#ifdef MAC + if (sc == &scs) + mac_syncache_destroy(&maclabel); +#endif + if (m) { + + *lsop = NULL; + m_freem(m); + } +} + +static int +syncache_respond(struct syncache *sc) +{ + struct ip *ip = NULL; + struct mbuf *m; + struct tcphdr *th; + int optlen, error; + u_int16_t hlen, tlen, mssopt; + struct tcpopt to; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif + + hlen = +#ifdef INET6 + (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : +#endif + sizeof(struct ip); + tlen = hlen + sizeof(struct tcphdr); + + /* Determine MSS we advertise to other end of connection. */ + mssopt = tcp_mssopt(&sc->sc_inc); + if (sc->sc_peer_mss) + mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); + + /* XXX: Assume that the entire packet will fit in a header mbuf. */ + KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, + ("syncache: mbuf too small")); + + /* Create the IP+TCP header from scratch. */ + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); +#ifdef MAC + mac_syncache_create_mbuf(sc->sc_label, m); +#endif + m->m_data += max_linkhdr; + m->m_len = tlen; + m->m_pkthdr.len = tlen; + m->m_pkthdr.rcvif = NULL; + +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + ip6 = mtod(m, struct ip6_hdr *); + ip6->ip6_vfc = IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_TCP; + ip6->ip6_src = sc->sc_inc.inc6_laddr; + ip6->ip6_dst = sc->sc_inc.inc6_faddr; + ip6->ip6_plen = htons(tlen - hlen); + /* ip6_hlim is set after checksum */ + ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; + ip6->ip6_flow |= sc->sc_flowlabel; + + th = (struct tcphdr *)(ip6 + 1); + } else +#endif + { + ip = mtod(m, struct ip *); + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(struct ip) >> 2; + ip->ip_len = tlen; + ip->ip_id = 0; + ip->ip_off = 0; + ip->ip_sum = 0; + ip->ip_p = IPPROTO_TCP; + ip->ip_src = sc->sc_inc.inc_laddr; + ip->ip_dst = sc->sc_inc.inc_faddr; + ip->ip_ttl = sc->sc_ip_ttl; + ip->ip_tos = sc->sc_ip_tos; + + /* + * See if we should do MTU discovery.
Route lookups are + * expensive, so we will only unset the DF bit if: + * + * 1) path_mtu_discovery is disabled + * 2) the SCF_UNREACH flag has been set + */ + if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) + ip->ip_off |= IP_DF; + + th = (struct tcphdr *)(ip + 1); + } + th->th_sport = sc->sc_inc.inc_lport; + th->th_dport = sc->sc_inc.inc_fport; + + th->th_seq = htonl(sc->sc_iss); + th->th_ack = htonl(sc->sc_irs + 1); + th->th_off = sizeof(struct tcphdr) >> 2; + th->th_x2 = 0; + th->th_flags = TH_SYN|TH_ACK; + th->th_win = htons(sc->sc_wnd); + th->th_urp = 0; + + if (sc->sc_flags & SCF_ECN) { + th->th_flags |= TH_ECE; + TCPSTAT_INC(tcps_ecn_shs); + } + + /* Tack on the TCP options. */ + if ((sc->sc_flags & SCF_NOOPT) == 0) { + to.to_flags = 0; + + to.to_mss = mssopt; + to.to_flags = TOF_MSS; + if (sc->sc_flags & SCF_WINSCALE) { + to.to_wscale = sc->sc_requested_r_scale; + to.to_flags |= TOF_SCALE; + } + if (sc->sc_flags & SCF_TIMESTAMP) { + /* Virgin timestamp or TCP cookie enhanced one. */ + to.to_tsval = sc->sc_ts; + to.to_tsecr = sc->sc_tsreflect; + to.to_flags |= TOF_TS; + } + if (sc->sc_flags & SCF_SACK) + to.to_flags |= TOF_SACKPERM; +#ifdef TCP_SIGNATURE + if (sc->sc_flags & SCF_SIGNATURE) + to.to_flags |= TOF_SIGNATURE; +#endif + optlen = tcp_addoptions(&to, (u_char *)(th + 1)); + + /* Adjust headers by option size. */ + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + m->m_len += optlen; + m->m_pkthdr.len += optlen; + +#ifdef TCP_SIGNATURE + if (sc->sc_flags & SCF_SIGNATURE) + tcp_signature_compute(m, 0, 0, optlen, + to.to_signature, IPSEC_DIR_OUTBOUND); +#endif +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) + ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); + else +#endif + ip->ip_len += optlen; + } else + optlen = 0; + + M_SETFIB(m, sc->sc_inc.inc_fibnum); +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + th->th_sum = 0; + th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, + tlen + optlen - hlen); + ip6->ip6_hlim = in6_selecthlim(NULL, NULL); + error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); + } else +#endif + { + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(tlen + optlen - hlen + IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); + } + return (error); +} + +void +syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, + struct inpcb *inp, struct socket **lsop, struct mbuf *m) +{ + _syncache_add(inc, to, th, inp, lsop, m, NULL, NULL); +} + +void +tcp_offload_syncache_add(struct in_conninfo *inc, struct toeopt *toeo, + struct tcphdr *th, struct inpcb *inp, struct socket **lsop, + struct toe_usrreqs *tu, void *toepcb) +{ + struct tcpopt to; + + bzero(&to, sizeof(struct tcpopt)); + to.to_mss = toeo->to_mss; + to.to_wscale = toeo->to_wscale; + to.to_flags = toeo->to_flags; + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + + _syncache_add(inc, &to, th, inp, lsop, NULL, tu, toepcb); +} + +/* + * The purpose of SYN cookies is to avoid keeping track of all SYN's we + * receive and to be able to handle SYN floods from bogus source addresses + * (where we will never receive any reply). SYN floods try to exhaust all + * our memory and available slots in the SYN cache table to cause a denial + * of service to legitimate users of the local host. 
+ * + * The idea of SYN cookies is to encode and include all necessary information + * about the connection setup state within the SYN-ACK we send back and thus + * to get along without keeping any local state until the ACK to the SYN-ACK + * arrives (if ever). Everything we need to know should be available from + * the information we encoded in the SYN-ACK. + * + * More information about the theory behind SYN cookies and their first + * discussion and specification can be found at: + * http://cr.yp.to/syncookies.html (overview) + * http://cr.yp.to/syncookies/archive (gory details) + * + * This implementation extends the original idea and first implementation + * of FreeBSD by using not only the initial sequence number field to store + * information but also the timestamp field if present. This way we can + * keep track of the entire state we need to know to recreate the session in + * its original form. Almost all TCP speakers implement RFC1323 timestamps + * these days. For those that do not, we still have to live with the known + * shortcomings of the ISN-only SYN cookies. + * + * Cookie layers: + * + * Initial sequence number we send: + * 31|................................|0 + * DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP + * D = MD5 Digest (first dword) + * M = MSS index + * R = Rotation of secret + * P = Odd or Even secret + * + * The MD5 Digest is computed over the following parameters: + * a) randomly rotated secret + * b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6) + * c) the received initial sequence number from remote host + * d) the rotation offset and odd/even bit + * + * Timestamp we send: + * 31|................................|0 + * DDDDDDDDDDDDDDDDDDDDDDSSSSRRRRA5 + * D = MD5 Digest (third dword) (only as filler) + * S = Requested send window scale + * R = Requested receive window scale + * A = SACK allowed + * 5 = TCP-MD5 enabled (not implemented yet) + * XORed with MD5 Digest (fourth dword) + * + * The timestamp isn't cryptographically secure and doesn't need to be. + * The double use of the MD5 digest dwords ties it to a specific remote/ + * local host/port, remote initial sequence number and our local time + * limited secret. A received timestamp is reverted (XORed) and then + * the contained MD5 dword is compared to the computed one to ensure the + * timestamp belongs to the SYN-ACK we sent. The other parameters may + * have been tampered with but this isn't different from supplying bogus + * values in the SYN in the first place. + * + * Some problems with SYN cookies remain, however: + * Consider the problem of a recreated (and retransmitted) cookie. If the + * original SYN was accepted, the connection is established. The second + * SYN is inflight, and if it arrives with an ISN that falls within the + * receive window, the connection is killed. + * + * Notes: + * A heuristic to determine when to accept syn cookies is not necessary. + * An ACK flood would cause the syncookie verification to be attempted, + * but a SYN flood causes syncookies to be generated. Both are of equal + * cost, so there's no point in trying to optimize the ACK flood case. + * Also, if you don't process certain ACKs for some reason, then all someone + * would have to do is launch a SYN and ACK flood at the same time, which + * would stop cookie verification and defeat the entire purpose of syncookies.
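+ * + * As a purely illustrative sketch (mirroring the bit operations in + * syncookie_generate() below, not an additional specification), the + * ISN packing works out to: + * + * data = odd_even & 0x1; P: parity selector, 1 bit + * data |= (rotation & 0x7) << 1; R: secret rotation, 3 bits + * data |= (mss_index & 0x7) << 4; M: MSS table index, 3 bits + * data |= md5_digest_dword0 << 7; D: digest, remaining 25 bits + * + * syncookie_lookup() reverses this by shifting and masking the ACKed + * ISN before recomputing and comparing the digest.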
+ */ +static int tcp_sc_msstab[] = { 0, 256, 468, 536, 996, 1452, 1460, 8960 }; + +static void +syncookie_generate(struct syncache_head *sch, struct syncache *sc, + u_int32_t *flowlabel) +{ + MD5_CTX ctx; + u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; + u_int32_t data; + u_int32_t *secbits; + u_int off, pmss, mss; + int i; + + SCH_LOCK_ASSERT(sch); + + /* Which of the two secrets to use. */ + secbits = sch->sch_oddeven ? + sch->sch_secbits_odd : sch->sch_secbits_even; + + /* Reseed secret if too old. */ + if (sch->sch_reseed < time_uptime) { + sch->sch_oddeven = sch->sch_oddeven ? 0 : 1; /* toggle */ + secbits = sch->sch_oddeven ? + sch->sch_secbits_odd : sch->sch_secbits_even; + for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++) + secbits[i] = arc4random(); + sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME; + } + + /* Secret rotation offset. */ + off = sc->sc_iss & 0x7; /* iss was randomized before */ + + /* Maximum segment size calculation. */ + pmss = + max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), V_tcp_minmss); + for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--) + if (tcp_sc_msstab[mss] <= pmss) + break; + + /* Fold parameters and MD5 digest into the ISN we will send. */ + data = sch->sch_oddeven;/* odd or even secret, 1 bit */ + data |= off << 1; /* secret offset, derived from iss, 3 bits */ + data |= mss << 4; /* mss, 3 bits */ + + MD5Init(&ctx); + MD5Update(&ctx, ((u_int8_t *)secbits) + off, + SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); + MD5Update(&ctx, secbits, off); + MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc)); + MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs)); + MD5Update(&ctx, &data, sizeof(data)); + MD5Final((u_int8_t *)&md5_buffer, &ctx); + + data |= (md5_buffer[0] << 7); + sc->sc_iss = data; + +#ifdef INET6 + *flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; +#endif + + /* Additional parameters are stored in the timestamp if present. */ + if (sc->sc_flags & SCF_TIMESTAMP) { + data = ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */ + data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */ + data |= sc->sc_requested_s_scale << 2; /* SWIN scale, 4 bits */ + data |= sc->sc_requested_r_scale << 6; /* RWIN scale, 4 bits */ + data |= md5_buffer[2] << 10; /* more digest bits */ + data ^= md5_buffer[3]; + sc->sc_ts = data; + sc->sc_tsoff = data - ticks; /* after XOR */ + } + + TCPSTAT_INC(tcps_sc_sendcookie); +} + +static struct syncache * +syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, + struct syncache *sc, struct tcpopt *to, struct tcphdr *th, + struct socket *so) +{ + MD5_CTX ctx; + u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)]; + u_int32_t data = 0; + u_int32_t *secbits; + tcp_seq ack, seq; + int off, mss, wnd, flags; + + SCH_LOCK_ASSERT(sch); + + /* + * Pull information out of SYN-ACK/ACK and + * revert sequence number advances. + */ + ack = th->th_ack - 1; + seq = th->th_seq - 1; + off = (ack >> 1) & 0x7; + mss = (ack >> 4) & 0x7; + flags = ack & 0x7f; + + /* Which of the two secrets to use. */ + secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even; + + /* + * The secret wasn't updated for the lifetime of a syncookie, + * so this SYN-ACK/ACK is either too old (replay) or totally bogus. + */ + if (sch->sch_reseed + SYNCOOKIE_LIFETIME < time_uptime) { + return (NULL); + } + + /* Recompute the digest so we can compare it. 
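+ * The inputs must match syncookie_generate() exactly: the rotated + * secret, the in_conninfo, the peer's initial sequence number (seq) + * and the flag bits recovered from the ACKed ISN.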
*/ + MD5Init(&ctx); + MD5Update(&ctx, ((u_int8_t *)secbits) + off, + SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off); + MD5Update(&ctx, secbits, off); + MD5Update(&ctx, inc, sizeof(*inc)); + MD5Update(&ctx, &seq, sizeof(seq)); + MD5Update(&ctx, &flags, sizeof(flags)); + MD5Final((u_int8_t *)&md5_buffer, &ctx); + + /* Does the digest part of our ACK'ed ISS match? */ + if ((ack & (~0x7f)) != (md5_buffer[0] << 7)) + return (NULL); + + /* Does the digest part of our reflected timestamp match? */ + if (to->to_flags & TOF_TS) { + data = md5_buffer[3] ^ to->to_tsecr; + if ((data & (~0x3ff)) != (md5_buffer[2] << 10)) + return (NULL); + } + + /* Fill in the syncache values. */ + bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); + sc->sc_ipopts = NULL; + + sc->sc_irs = seq; + sc->sc_iss = ack; + +#ifdef INET6 + if (inc->inc_flags & INC_ISIPV6) { + if (sotoinpcb(so)->inp_flags & IN6P_AUTOFLOWLABEL) + sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK; + } else +#endif + { + sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl; + sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos; + } + + /* Additional parameters that were encoded in the timestamp. */ + if (data) { + sc->sc_flags |= SCF_TIMESTAMP; + sc->sc_tsreflect = to->to_tsval; + sc->sc_ts = to->to_tsecr; + sc->sc_tsoff = to->to_tsecr - ticks; + sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0; + sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0; + sc->sc_requested_s_scale = min((data >> 2) & 0xf, + TCP_MAX_WINSHIFT); + sc->sc_requested_r_scale = min((data >> 6) & 0xf, + TCP_MAX_WINSHIFT); + if (sc->sc_requested_s_scale || sc->sc_requested_r_scale) + sc->sc_flags |= SCF_WINSCALE; + } else + sc->sc_flags |= SCF_NOOPT; + + wnd = sbspace(&so->so_rcv); + wnd = imax(wnd, 0); + wnd = imin(wnd, TCP_MAXWIN); + sc->sc_wnd = wnd; + + sc->sc_rxmits = 0; + sc->sc_peer_mss = tcp_sc_msstab[mss]; + + TCPSTAT_INC(tcps_sc_recvcookie); + return (sc); +} + +/* + * Returns the current number of syncache entries. This number + * will probably change before you get around to calling + * syncache_pcblist. + */ + +int +syncache_pcbcount(void) +{ + struct syncache_head *sch; + int count, i; + + for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { + /* No need to lock for a read. */ + sch = &V_tcp_syncache.hashbase[i]; + count += sch->sch_length; + } + return count; +} + +/* + * Exports the syncache entries to userland so that netstat can display + * them alongside the other sockets. This function is intended to be + * called only from tcp_pcblist. + * + * Due to concurrency on an active system, the number of pcbs exported + * may have no relation to max_pcbs. max_pcbs merely indicates the + * amount of space the caller allocated for this function to use.
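+ * + * A caller is expected to size its buffer using syncache_pcbcount() + * and to treat *pcbs_exported as the authoritative count on return.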
+ */ +int +syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) +{ + struct xtcpcb xt; + struct syncache *sc; + struct syncache_head *sch; + int count, error, i; + + for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { + sch = &V_tcp_syncache.hashbase[i]; + SCH_LOCK(sch); + TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { + if (count >= max_pcbs) { + SCH_UNLOCK(sch); + goto exit; + } + if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) + continue; + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof(xt); + if (sc->sc_inc.inc_flags & INC_ISIPV6) + xt.xt_inp.inp_vflag = INP_IPV6; + else + xt.xt_inp.inp_vflag = INP_IPV4; + bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); + xt.xt_tp.t_inpcb = &xt.xt_inp; + xt.xt_tp.t_state = TCPS_SYN_RECEIVED; + xt.xt_socket.xso_protocol = IPPROTO_TCP; + xt.xt_socket.xso_len = sizeof (struct xsocket); + xt.xt_socket.so_type = SOCK_STREAM; + xt.xt_socket.so_state = SS_ISCONNECTING; + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) { + SCH_UNLOCK(sch); + goto exit; + } + count++; + } + SCH_UNLOCK(sch); + } +exit: + *pcbs_exported = count; + return error; +} diff --git a/freebsd/sys/netinet/tcp_syncache.h b/freebsd/sys/netinet/tcp_syncache.h new file mode 100644 index 00000000..96ba1535 --- /dev/null +++ b/freebsd/sys/netinet/tcp_syncache.h @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_SYNCACHE_HH_ +#define _NETINET_TCP_SYNCACHE_HH_ +#ifdef _KERNEL + +struct toeopt; + +void syncache_init(void); +#ifdef VIMAGE +void syncache_destroy(void); +#endif +void syncache_unreach(struct in_conninfo *, struct tcphdr *); +int syncache_expand(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct socket **, struct mbuf *); +int tcp_offload_syncache_expand(struct in_conninfo *inc, struct toeopt *toeo, + struct tcphdr *th, struct socket **lsop, struct mbuf *m); +void syncache_add(struct in_conninfo *, struct tcpopt *, + struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *); +void tcp_offload_syncache_add(struct in_conninfo *, struct toeopt *, + struct tcphdr *, struct inpcb *, struct socket **, + struct toe_usrreqs *tu, void *toepcb); + +void syncache_chkrst(struct in_conninfo *, struct tcphdr *); +void syncache_badack(struct in_conninfo *); +int syncache_pcbcount(void); +int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); + +struct syncache { + TAILQ_ENTRY(syncache) sc_hash; + struct in_conninfo sc_inc; /* addresses */ + int sc_rxttime; /* retransmit time */ + u_int16_t sc_rxmits; /* retransmit counter */ + u_int32_t sc_tsreflect; /* timestamp to reflect */ + u_int32_t sc_ts; /* our timestamp to send */ + u_int32_t sc_tsoff; /* ts offset w/ syncookies */ + u_int32_t sc_flowlabel; /* IPv6 flowlabel */ + tcp_seq sc_irs; /* seq from peer */ + tcp_seq sc_iss; /* our ISS */ + struct mbuf *sc_ipopts; /* source route */ + u_int16_t sc_peer_mss; /* peer's MSS */ + u_int16_t sc_wnd; /* advertised window */ + u_int8_t sc_ip_ttl; /* IPv4 TTL */ + u_int8_t sc_ip_tos; /* IPv4 TOS */ + u_int8_t sc_requested_s_scale:4, + sc_requested_r_scale:4; + u_int16_t sc_flags; +#ifndef TCP_OFFLOAD_DISABLE + struct toe_usrreqs *sc_tu; /* TOE operations */ + void *sc_toepcb; /* TOE protocol block */ +#endif + struct label *sc_label; /* MAC label reference */ + struct ucred *sc_cred; /* cred cache for jail checks */ +}; + +/* + * Flags for the sc_flags field. 
+ */ +#define SCF_NOOPT 0x01 /* no TCP options */ +#define SCF_WINSCALE 0x02 /* negotiated window scaling */ +#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ + /* MSS is implicit */ +#define SCF_UNREACH 0x10 /* icmp unreachable received */ +#define SCF_SIGNATURE 0x20 /* send MD5 digests */ +#define SCF_SACK 0x80 /* send SACK option */ +#define SCF_ECN 0x100 /* send ECN setup packet */ + +#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ +#define SYNCOOKIE_LIFETIME 16 /* seconds */ + +struct syncache_head { + struct vnet *sch_vnet; + struct mtx sch_mtx; + TAILQ_HEAD(sch_head, syncache) sch_bucket; + struct callout sch_timer; + int sch_nextc; + u_int sch_length; + u_int sch_oddeven; + u_int32_t sch_secbits_odd[SYNCOOKIE_SECRET_SIZE]; + u_int32_t sch_secbits_even[SYNCOOKIE_SECRET_SIZE]; + u_int sch_reseed; /* time_uptime, seconds */ +}; + +struct tcp_syncache { + struct syncache_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; /* XXX: unprotected */ + u_int cache_limit; + u_int rexmt_limit; + u_int hash_secret; +}; + +#endif /* _KERNEL */ +#endif /* !_NETINET_TCP_SYNCACHE_HH_ */ diff --git a/freebsd/sys/netinet/tcp_timer.c b/freebsd/sys/netinet/tcp_timer.c new file mode 100644 index 00000000..36e2bec2 --- /dev/null +++ b/freebsd/sys/netinet/tcp_timer.c @@ -0,0 +1,660 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif + +int tcp_keepinit; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); + +int tcp_keepidle; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); + +int tcp_keepintvl; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); + +int tcp_delacktime; +SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, + &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", + "Time before a delayed ACK is sent"); + +int tcp_msl; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, + &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); + +int tcp_rexmit_min; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, + &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", + "Minimum Retransmission Timeout"); + +int tcp_rexmit_slop; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, + &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", + "Retransmission Timer Slop"); + +static int always_keepalive = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, + &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); + +int tcp_fast_finwait2_recycle = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, + &tcp_fast_finwait2_recycle, 0, + "Recycle closed FIN_WAIT_2 connections faster"); + +int tcp_finwait2_timeout; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, + &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); + + +static int tcp_keepcnt = TCPTV_KEEPCNT; + /* max idle probes */ +int tcp_maxpersistidle; + /* max idle time in persist */ +int tcp_maxidle; + +/* + * TCP protocol timeout routine called every 500 ms. + * Updates timestamps used for TCP and causes finite + * state machine actions if timers expire. + */ +void +tcp_slowtimo(void) +{ + VNET_ITERATOR_DECL(vnet_iter); + + VNET_LIST_RLOCK_NOSLEEP(); + VNET_FOREACH(vnet_iter) { + CURVNET_SET(vnet_iter); + tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + INP_INFO_WLOCK(&V_tcbinfo); + (void) tcp_tw_2msl_scan(0); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + } + VNET_LIST_RUNLOCK_NOSLEEP(); +} + +int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; + +int tcp_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; + +static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ + +static int tcp_timer_race; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, + 0, "Count of t_inpcb races on tcp_discardcb"); + +/* + * TCP timer processing.
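+ * + * Each handler below follows the same pattern: take the required pcb + * locks, bail out if the tcpcb has lost its inpcb (counted in + * tcp_timer_race), bail out if the callout was rescheduled or stopped + * while the locks were being acquired, deactivate the callout, and + * only then do the actual protocol work.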
+ */ + +void +tcp_timer_delack(void *xtp) +{ + struct tcpcb *tp = xtp; + struct inpcb *inp; + CURVNET_SET(tp->t_vnet); + + inp = tp->t_inpcb; + /* + * XXXRW: While this assert is in fact correct, bugs in the tcpcb + * tear-down mean we need it as a work-around for races between + * timers and tcp_discardcb(). + * + * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); + */ + if (inp == NULL) { + tcp_timer_race++; + CURVNET_RESTORE(); + return; + } + INP_WLOCK(inp); + if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_delack) + || !callout_active(&tp->t_timers->tt_delack)) { + INP_WUNLOCK(inp); + CURVNET_RESTORE(); + return; + } + callout_deactivate(&tp->t_timers->tt_delack); + + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_delack); + (void) tcp_output(tp); + INP_WUNLOCK(inp); + CURVNET_RESTORE(); +} + +void +tcp_timer_2msl(void *xtp) +{ + struct tcpcb *tp = xtp; + struct inpcb *inp; + CURVNET_SET(tp->t_vnet); +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + /* + * XXXRW: Does this actually happen? + */ + INP_INFO_WLOCK(&V_tcbinfo); + inp = tp->t_inpcb; + /* + * XXXRW: While this assert is in fact correct, bugs in the tcpcb + * tear-down mean we need it as a work-around for races between + * timers and tcp_discardcb(). + * + * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); + */ + if (inp == NULL) { + tcp_timer_race++; + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + INP_WLOCK(inp); + tcp_free_sackholes(tp); + if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_2msl) || + !callout_active(&tp->t_timers->tt_2msl)) { + INP_WUNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + callout_deactivate(&tp->t_timers->tt_2msl); + /* + * 2 MSL timeout in shutdown went off. If we're closed but + * still waiting for peer to close and connection has been idle + * too long, or if 2MSL time is up from TIME_WAIT, delete connection + * control block. Otherwise, check again in a bit. + * + * If fast recycling of FIN_WAIT_2 connections is enabled, we are in + * FIN_WAIT_2, and the receiver has closed, there's no point in + * hanging onto the FIN_WAIT_2 socket. Just close it, ignoring the + * fact that there were recent incoming segments. + */ + if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && + tp->t_inpcb && tp->t_inpcb->inp_socket && + (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { + TCPSTAT_INC(tcps_finwait2_drops); + tp = tcp_close(tp); + } else { + if (tp->t_state != TCPS_TIME_WAIT && + ticks - tp->t_rcvtime <= tcp_maxidle) + callout_reset(&tp->t_timers->tt_2msl, tcp_keepintvl, + tcp_timer_2msl, tp); + else + tp = tcp_close(tp); + } + +#ifdef TCPDEBUG + if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp != NULL) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); +} + +void +tcp_timer_keep(void *xtp) +{ + struct tcpcb *tp = xtp; + struct tcptemp *t_template; + struct inpcb *inp; + CURVNET_SET(tp->t_vnet); +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + INP_INFO_WLOCK(&V_tcbinfo); + inp = tp->t_inpcb; + /* + * XXXRW: While this assert is in fact correct, bugs in the tcpcb + * tear-down mean we need it as a work-around for races between + * timers and tcp_discardcb().
+ * + * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); + */ + if (inp == NULL) { + tcp_timer_race++; + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + INP_WLOCK(inp); + if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_keep) + || !callout_active(&tp->t_timers->tt_keep)) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + callout_deactivate(&tp->t_timers->tt_keep); + /* + * Keep-alive timer went off; send something + * or drop connection if idle for too long. + */ + TCPSTAT_INC(tcps_keeptimeo); + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if (ticks - tp->t_rcvtime >= tcp_keepidle + tcp_maxidle) + goto dropit; + /* + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. + */ + TCPSTAT_INC(tcps_keepprobe); + t_template = tcpip_maketemplate(inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + free(t_template, M_TEMP); + } + callout_reset(&tp->t_timers->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + } else + callout_reset(&tp->t_timers->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + +#ifdef TCPDEBUG + if (inp->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + +dropit: + TCPSTAT_INC(tcps_keepdrops); + tp = tcp_drop(tp, ETIMEDOUT); + +#ifdef TCPDEBUG + if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); +} + +void +tcp_timer_persist(void *xtp) +{ + struct tcpcb *tp = xtp; + struct inpcb *inp; + CURVNET_SET(tp->t_vnet); +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + INP_INFO_WLOCK(&V_tcbinfo); + inp = tp->t_inpcb; + /* + * XXXRW: While this assert is in fact correct, bugs in the tcpcb + * tear-down mean we need it as a work-around for races between + * timers and tcp_discardcb(). + * + * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); + */ + if (inp == NULL) { + tcp_timer_race++; + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + INP_WLOCK(inp); + if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_persist) + || !callout_active(&tp->t_timers->tt_persist)) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + callout_deactivate(&tp->t_timers->tt_persist); + /* + * Persistence timer into zero window. + * Force a byte to be output, if possible. + */ + TCPSTAT_INC(tcps_persisttimeo); + /* + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting.
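+ * + * As a worked example: tcp_totbackoff (2559) is the sum of the + * tcp_backoff[] table, so once the retransmit shift is maxed out the + * connection is dropped after no response has been seen for + * TCP_REXMTVAL(tp) * 2559 ticks, or for tcp_maxpersistidle ticks, + * whichever limit is reached first.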
+ */ + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + (ticks - tp->t_rcvtime >= tcp_maxpersistidle || + ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + TCPSTAT_INC(tcps_persistdrop); + tp = tcp_drop(tp, ETIMEDOUT); + goto out; + } + tcp_setpersist(tp); + tp->t_flags |= TF_FORCEDATA; + (void) tcp_output(tp); + tp->t_flags &= ~TF_FORCEDATA; + +out: +#ifdef TCPDEBUG + if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); +#endif + if (tp != NULL) + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); +} + +void +tcp_timer_rexmt(void * xtp) +{ + struct tcpcb *tp = xtp; + CURVNET_SET(tp->t_vnet); + int rexmt; + int headlocked; + struct inpcb *inp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + INP_INFO_WLOCK(&V_tcbinfo); + headlocked = 1; + inp = tp->t_inpcb; + /* + * XXXRW: While this assert is in fact correct, bugs in the tcpcb + * tear-down mean we need it as a work-around for races between + * timers and tcp_discardcb(). + * + * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); + */ + if (inp == NULL) { + tcp_timer_race++; + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + INP_WLOCK(inp); + if ((inp->inp_flags & INP_DROPPED) || callout_pending(&tp->t_timers->tt_rexmt) + || !callout_active(&tp->t_timers->tt_rexmt)) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } + callout_deactivate(&tp->t_timers->tt_rexmt); + tcp_free_sackholes(tp); + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + TCPSTAT_INC(tcps_timeoutdrop); + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + goto out; + } + INP_INFO_WUNLOCK(&V_tcbinfo); + headlocked = 0; + if (tp->t_rxtshift == 1) { + /* + * first retransmit; record ssthresh and cwnd so they can + * be recovered if this turns out to be a "bad" retransmit. + * A retransmit is considered "bad" if an ACK for this + * segment is received within RTT/2 interval; the assumption + * here is that the ACK was already in flight. See + * "On Estimating End-to-End Network Path Properties" by + * Allman and Paxson for more details. + */ + tp->snd_cwnd_prev = tp->snd_cwnd; + tp->snd_ssthresh_prev = tp->snd_ssthresh; + tp->snd_recover_prev = tp->snd_recover; + if (IN_FASTRECOVERY(tp)) + tp->t_flags |= TF_WASFRECOVERY; + else + tp->t_flags &= ~TF_WASFRECOVERY; + tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + } + TCPSTAT_INC(tcps_rexmttimeo); + if (tp->t_state == TCPS_SYN_SENT) + rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + TCPT_RANGESET(tp->t_rxtcur, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + /* + * Disable rfc1323 if we haven't got any response to + * our third SYN to work around some broken terminal servers + * (most of which have hopefully been retired) that have bad VJ + * header compression code which trashes TCP segments containing + * unknown-to-them TCP options. + */ + if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) + tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP); + /* + * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. + */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { +#ifdef INET6 + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + in6_losing(tp->t_inpcb); + else +#endif + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + tp->snd_recover = tp->snd_max; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtttime = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshold size. + * For a threshold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshold + * to go below this.) + */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; + } + EXIT_FASTRECOVERY(tp); + tp->t_bytes_acked = 0; + (void) tcp_output(tp); + +out: +#ifdef TCPDEBUG + if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, + PRU_SLOWTIMO); +#endif + if (tp != NULL) + INP_WUNLOCK(inp); + if (headlocked) + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); +} + +void +tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) +{ + struct callout *t_callout; + void *f_callout; + + switch (timer_type) { + case TT_DELACK: + t_callout = &tp->t_timers->tt_delack; + f_callout = tcp_timer_delack; + break; + case TT_REXMT: + t_callout = &tp->t_timers->tt_rexmt; + f_callout = tcp_timer_rexmt; + break; + case TT_PERSIST: + t_callout = &tp->t_timers->tt_persist; + f_callout = tcp_timer_persist; + break; + case TT_KEEP: + t_callout = &tp->t_timers->tt_keep; + f_callout = tcp_timer_keep; + break; + case TT_2MSL: + t_callout = &tp->t_timers->tt_2msl; + f_callout = tcp_timer_2msl; + break; + default: + panic("bad timer_type"); + } + if (delta == 0) { + callout_stop(t_callout); + } else { + callout_reset(t_callout, delta, f_callout, tp); + } +} + +int +tcp_timer_active(struct tcpcb *tp, int timer_type) +{ + struct callout *t_callout; + + switch (timer_type) { + case TT_DELACK: + t_callout = &tp->t_timers->tt_delack; + break; + case TT_REXMT: + t_callout = &tp->t_timers->tt_rexmt; + break; + case TT_PERSIST: + t_callout = &tp->t_timers->tt_persist; + break; + case TT_KEEP: + t_callout = &tp->t_timers->tt_keep; + break; + case TT_2MSL: + t_callout = &tp->t_timers->tt_2msl; + break; + default: + panic("bad timer_type"); + } + return callout_active(t_callout); +} diff --git
a/freebsd/sys/netinet/tcp_timer.h b/freebsd/sys/netinet/tcp_timer.h new file mode 100644 index 00000000..1514a293 --- /dev/null +++ b/freebsd/sys/netinet/tcp_timer.h @@ -0,0 +1,183 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_timer.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_TIMER_HH_ +#define _NETINET_TCP_TIMER_HH_ + +/* + * The TCPT_REXMT timer is used to force retransmissions. + * The TCP has the TCPT_REXMT timer set whenever segments + * have been sent for which ACKs are expected but not yet + * received. If an ACK is received which advances tp->snd_una, + * then the retransmit timer is cleared (if there are no more + * outstanding segments) or reset to the base value (if there + * are more ACKs expected). Whenever the retransmit timer goes off, + * we retransmit one unacknowledged segment, and do a backoff + * on the retransmit timer. + * + * The TCPT_PERSIST timer is used to keep window size information + * flowing even if the window goes shut. If all previous transmissions + * have been acknowledged (so that there are no retransmissions in progress), + * and the window is too small to bother sending anything, then we start + * the TCPT_PERSIST timer. When it expires, if the window is nonzero, + * we go to transmit state. Otherwise, at intervals send a single byte + * into the peer's window to force it to update our window information. + * We do this at most as often as TCPT_PERSMIN time intervals, + * but no more frequently than the current estimate of round-trip + * packet time. The TCPT_PERSIST timer is cleared whenever we receive + * a window update from the peer. + * + * The TCPT_KEEP timer is used to keep connections alive. If a + * connection is idle (no segments received) for TCPTV_KEEP_INIT amount of time, + * but not yet established, then we drop the connection.
Once the connection + * is established, if the connection is idle for TCPTV_KEEP_IDLE time + * (and keepalives have been enabled on the socket), we begin to probe + * the connection. We force the peer to send us a segment by sending: + * <SEQ=SND.UNA-1><ACK=RCV.NXT><CTL=ACK> + * This segment is (deliberately) outside the window, and should elicit + * an ack segment in response from the peer. If, despite the TCPT_KEEP + * initiated segments, we cannot elicit a response from a peer in TCPT_MAXIDLE + * amount of time probing, then we drop the connection. + */ + +/* + * Time constants. + */ +#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) */ +#define TCPTV_SRTTBASE 0 /* base roundtrip time; + if 0, no idea yet */ +#define TCPTV_RTOBASE ( 3*hz) /* assumed RTO if no info */ +#define TCPTV_SRTTDFLT ( 3*hz) /* assumed RTT if no info */ + +#define TCPTV_PERSMIN ( 5*hz) /* retransmit persistence */ +#define TCPTV_PERSMAX ( 60*hz) /* maximum persist interval */ + +#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ +#define TCPTV_KEEPCNT 8 /* max probes before drop */ + +#define TCPTV_INFLIGHT_RTTTHRESH (10*hz/1000) /* below which inflight + disengages, in msec */ + +#define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ + +/* + * Minimum retransmit timer is 3 ticks, for algorithmic stability. + * TCPT_RANGESET() will add another TCPTV_CPU_VAR to deal with + * the expected worst-case processing variances by the kernels + * representing the end points. Such variances do not always show + * up in the srtt because the timestamp is often calculated at + * the interface rather than at the TCP layer. This value is + * typically 50ms. However, it is also possible that delayed + * acks (typically 100ms) could create issues so we set the slop + * to 200ms to try to cover it. Note that, properly speaking, + * delayed-acks should not create a major issue for interactive + * environments which 'P'ush the last segment, at least as + * long as implementations do the required 'at least one ack + * for every two packets' for the non-interactive streaming case. + * (maybe the RTO calculation should use 2*RTT instead of RTT + * to handle the ack-every-other-packet case). + * + * The prior minimum of 1*hz (1 second) badly breaks throughput on any + * networks faster than a modem that has minor (e.g. 1%) packet loss. + */ +#define TCPTV_MIN ( hz/33 ) /* minimum allowable value */ +#define TCPTV_CPU_VAR ( hz/5 ) /* cpu variance allowed (200ms) */ +#define TCPTV_REXMTMAX ( 64*hz) /* max allowable REXMT value */ + +#define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ + +#define TCP_LINGERTIME 120 /* linger at most 2 minutes */ + +#define TCP_MAXRXTSHIFT 12 /* maximum retransmits */ + +#define TCPTV_DELACK (hz / PR_FASTHZ / 2) /* 100ms timeout */ + +#ifdef TCPTIMERS +static const char *tcptimers[] = + { "REXMT", "PERSIST", "KEEP", "2MSL" }; +#endif + +/* + * Force a time value to be in a certain range.
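+ * + * For example, tcp_timer_rexmt() uses + * TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + * which clamps rexmt + tcp_rexmit_slop into the range + * [tp->t_rttmin, TCPTV_REXMTMAX].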
+ */ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ + (tv) = (value) + tcp_rexmit_slop; \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while(0) + +#ifdef _KERNEL + +struct tcp_timer { + struct callout tt_rexmt; /* retransmit timer */ + struct callout tt_persist; /* retransmit persistence */ + struct callout tt_keep; /* keepalive */ + struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ + struct callout tt_delack; /* delayed ACK timer */ +}; +#define TT_DELACK 0x01 +#define TT_REXMT 0x02 +#define TT_PERSIST 0x04 +#define TT_KEEP 0x08 +#define TT_2MSL 0x10 + +extern int tcp_keepinit; /* time to establish connection */ +extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ +extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxpersistidle; +extern int tcp_rexmit_min; +extern int tcp_rexmit_slop; +extern int tcp_msl; +extern int tcp_ttl; /* time to live for TCP segs */ +extern int tcp_backoff[]; + +extern int tcp_finwait2_timeout; +extern int tcp_fast_finwait2_recycle; + +void tcp_timer_init(void); +void tcp_timer_2msl(void *xtp); +struct tcptw * + tcp_tw_2msl_scan(int _reuse); /* XXX temporary */ +void tcp_timer_keep(void *xtp); +void tcp_timer_persist(void *xtp); +void tcp_timer_rexmt(void *xtp); +void tcp_timer_delack(void *xtp); + +#endif /* _KERNEL */ + +#endif /* !_NETINET_TCP_TIMER_HH_ */ diff --git a/freebsd/sys/netinet/tcp_timewait.c b/freebsd/sys/netinet/tcp_timewait.c new file mode 100644 index 00000000..92643d0a --- /dev/null +++ b/freebsd/sys/netinet/tcp_timewait.c @@ -0,0 +1,618 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef INET6 +#include +#endif +#include +#include +#ifdef INET6 +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef TCPDEBUG +#include +#endif +#include + +#include + +#include + +static VNET_DEFINE(uma_zone_t, tcptw_zone); +#define V_tcptw_zone VNET(tcptw_zone) +static int maxtcptw; + +/* + * The timed wait queue contains references to each of the TCP sessions + * currently in the TIME_WAIT state. The queue pointers, including the + * queue pointers in each tcptw structure, are protected using the global + * tcbinfo lock, which must be held over queue iteration and modification. + */ +static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); +#define V_twq_2msl VNET(twq_2msl) + +static void tcp_tw_2msl_reset(struct tcptw *, int); +static void tcp_tw_2msl_stop(struct tcptw *); + +static int +tcptw_auto_size(void) +{ + int halfrange; + + /* + * Max out at half the ephemeral port range so that TIME_WAIT + * sockets don't tie up too many ephemeral ports. + */ + if (V_ipport_lastauto > V_ipport_firstauto) + halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; + else + halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; + /* Protect against goofy port ranges smaller than 32. */ + return (imin(imax(halfrange, 32), maxsockets / 5)); +} + +static int +sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) +{ + int error, new; + + if (maxtcptw == 0) + new = tcptw_auto_size(); + else + new = maxtcptw; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) + if (new >= 32) { + maxtcptw = new; + uma_zone_set_max(V_tcptw_zone, maxtcptw); + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, CTLTYPE_INT|CTLFLAG_RW, + &maxtcptw, 0, sysctl_maxtcptw, "IU", + "Maximum number of compressed TCP TIME_WAIT entries"); + +VNET_DEFINE(int, nolocaltimewait) = 0; +#define V_nolocaltimewait VNET(nolocaltimewait) +SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_RW, + &VNET_NAME(nolocaltimewait), 0, + "Do not create compressed TCP TIME_WAIT entries for local connections"); + +void +tcp_tw_zone_change(void) +{ + + if (maxtcptw == 0) + uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); +} + +void +tcp_tw_init(void) +{ + + V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); + if (maxtcptw == 0) + uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); + else + uma_zone_set_max(V_tcptw_zone, maxtcptw); + TAILQ_INIT(&V_twq_2msl); +} + +#ifdef VIMAGE +void +tcp_tw_destroy(void) +{ + struct tcptw *tw; + + INP_INFO_WLOCK(&V_tcbinfo); + while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + tcp_twclose(tw, 0); + INP_INFO_WUNLOCK(&V_tcbinfo); + + uma_zdestroy(V_tcptw_zone); +} +#endif + +/* + * Move a TCP connection into TIME_WAIT state. + * tcbinfo is locked. + * inp is locked, and is unlocked before returning. + */ +void +tcp_twstart(struct tcpcb *tp) +{ + struct tcptw *tw; + struct inpcb *inp = tp->t_inpcb; + int acknow; + struct socket *so; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). 
*/ + INP_WLOCK_ASSERT(inp); + + if (V_nolocaltimewait && in_localip(inp->inp_faddr)) { + tp = tcp_close(tp); + if (tp != NULL) + INP_WUNLOCK(inp); + return; + } + + tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); + if (tw == NULL) { + tw = tcp_tw_2msl_scan(1); + if (tw == NULL) { + tp = tcp_close(tp); + if (tp != NULL) + INP_WUNLOCK(inp); + return; + } + } + tw->tw_inpcb = inp; + + /* + * Recover last window size sent. + */ + tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; + + /* + * Set t_recent if timestamps are used on the connection. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == + (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { + tw->t_recent = tp->ts_recent; + tw->ts_offset = tp->ts_offset; + } else { + tw->t_recent = 0; + tw->ts_offset = 0; + } + + tw->snd_nxt = tp->snd_nxt; + tw->rcv_nxt = tp->rcv_nxt; + tw->iss = tp->iss; + tw->irs = tp->irs; + tw->t_starttime = tp->t_starttime; + tw->tw_time = 0; + +/* XXX + * If this code will + * be used for fin-wait-2 state also, then we may need + * a ts_recent from the last segment. + */ + acknow = tp->t_flags & TF_ACKNOW; + + /* + * First, discard tcpcb state, which includes stopping its timers and + * freeing it. tcp_discardcb() used to also release the inpcb, but + * that work is now done in the caller. + * + * Note: soisdisconnected() call used to be made in tcp_discardcb(), + * and might not be needed here any longer. + */ + tcp_discardcb(tp); + so = inp->inp_socket; + soisdisconnected(so); + tw->tw_cred = crhold(so->so_cred); + SOCK_LOCK(so); + tw->tw_so_options = so->so_options; + SOCK_UNLOCK(so); + if (acknow) + tcp_twrespond(tw, TH_ACK); + inp->inp_ppcb = tw; + inp->inp_flags |= INP_TIMEWAIT; + tcp_tw_2msl_reset(tw, 0); + + /* + * If the inpcb owns the sole reference to the socket, then we can + * detach and free the socket as it is not needed in time wait. + */ + if (inp->inp_flags & INP_SOCKREF) { + KASSERT(so->so_state & SS_PROTOREF, + ("tcp_twstart: !SS_PROTOREF")); + inp->inp_flags &= ~INP_SOCKREF; + INP_WUNLOCK(inp); + ACCEPT_LOCK(); + SOCK_LOCK(so); + so->so_state &= ~SS_PROTOREF; + sofree(so); + } else + INP_WUNLOCK(inp); +} + +#if 0 +/* + * The approximate rate of ISN increase of Microsoft TCP stacks; + * the actual rate is slightly higher due to the addition of + * random positive increments. + * + * Most other new OSes use semi-randomized ISN values, so we + * do not need to worry about them. + */ +#define MS_ISN_BYTES_PER_SECOND 250000 + +/* + * Determine if the ISN we will generate has advanced beyond the last + * sequence number used by the previous connection. If so, indicate + * that it is safe to recycle this tw socket by returning 1. + */ +int +tcp_twrecycleable(struct tcptw *tw) +{ + tcp_seq new_iss = tw->iss; + tcp_seq new_irs = tw->irs; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + new_iss += (ticks - tw->t_starttime) * (ISN_BYTES_PER_SECOND / hz); + new_irs += (ticks - tw->t_starttime) * (MS_ISN_BYTES_PER_SECOND / hz); + + if (SEQ_GT(new_iss, tw->snd_nxt) && SEQ_GT(new_irs, tw->rcv_nxt)) + return (1); + else + return (0); +} +#endif + +/* + * Returns 1 if the TIME_WAIT state was killed and we should start over, + * looking for a pcb in the listen state. Returns 0 otherwise. + */ +int +tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, + struct mbuf *m, int tlen) +{ + struct tcptw *tw; + int thflags; + tcp_seq seq; + + /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset().
*/ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + /* + * XXXRW: Time wait state for inpcb has been recycled, but inpcb is + * still present. This is undesirable, but temporarily necessary + * until we work out how to handle inpcbs whose timewait state has + * been removed. + */ + tw = intotw(inp); + if (tw == NULL) + goto drop; + + thflags = th->th_flags; + + /* + * NOTE: for FIN_WAIT_2 (to be added later), + * must validate sequence number before accepting RST + */ + + /* + * If the segment contains RST: + * Drop the segment - see Stevens, vol. 2, p. 964 and + * RFC 1337. + */ + if (thflags & TH_RST) + goto drop; + +#if 0 +/* PAWS not needed at the moment */ + /* + * RFC 1323 PAWS: If we have a timestamp reply on this segment + * and it's less than ts_recent, drop it. + */ + if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && + TSTMP_LT(to.to_tsval, tp->ts_recent)) { + if ((thflags & TH_ACK) == 0) + goto drop; + goto ack; + } + /* + * ts_recent is never updated because we never accept new segments. + */ +#endif + + /* + * If a new connection request is received + * while in TIME_WAIT, drop the old connection + * and start over if the sequence numbers + * are above the previous ones. + */ + if ((thflags & TH_SYN) && SEQ_GT(th->th_seq, tw->rcv_nxt)) { + tcp_twclose(tw, 0); + return (1); + } + + /* + * Drop the segment if it does not contain an ACK. + */ + if ((thflags & TH_ACK) == 0) + goto drop; + + /* + * Reset the 2MSL timer if this is a duplicate FIN. + */ + if (thflags & TH_FIN) { + seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); + if (seq + 1 == tw->rcv_nxt) + tcp_tw_2msl_reset(tw, 1); + } + + /* + * Acknowledge the segment if it has data or is not a duplicate ACK. + */ + if (thflags != TH_ACK || tlen != 0 || + th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) + tcp_twrespond(tw, TH_ACK); +drop: + INP_WUNLOCK(inp); + m_freem(m); + return (0); +} + +void +tcp_twclose(struct tcptw *tw, int reuse) +{ + struct socket *so; + struct inpcb *inp; + + /* + * At this point, we are in one of two situations: + * + * (1) We have no socket, just an inpcb<->twtcp pair. We can free + * all state. + * + * (2) We have a socket -- if we own a reference, release it and + * notify the socket layer. + */ + inp = tw->tw_inpcb; + KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); + KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ + INP_WLOCK_ASSERT(inp); + + tw->tw_inpcb = NULL; + tcp_tw_2msl_stop(tw); + inp->inp_ppcb = NULL; + in_pcbdrop(inp); + + so = inp->inp_socket; + if (so != NULL) { + /* + * If there's a socket, handle two cases: first, we own a + * strong reference, which we will now release, or we don't, + * in which case another reference exists (XXXRW: think + * about this more), and we don't need to take action. + */ + if (inp->inp_flags & INP_SOCKREF) { + inp->inp_flags &= ~INP_SOCKREF; + INP_WUNLOCK(inp); + ACCEPT_LOCK(); + SOCK_LOCK(so); + KASSERT(so->so_state & SS_PROTOREF, + ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); + so->so_state &= ~SS_PROTOREF; + sofree(so); + } else { + /* + * If we don't own the only reference, the socket and + * inpcb need to be left around to be handled by + * tcp_usr_detach() later.
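+ * (By this point in_pcbdrop() has set INP_DROPPED, so a later + * tcp_usr_detach() sees INP_TIMEWAIT | INP_DROPPED and frees the + * inpcb.)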
+ */ + INP_WUNLOCK(inp); + } + } else + in_pcbfree(inp); + TCPSTAT_INC(tcps_closed); + crfree(tw->tw_cred); + tw->tw_cred = NULL; + if (reuse) + return; + uma_zfree(V_tcptw_zone, tw); +} + +int +tcp_twrespond(struct tcptw *tw, int flags) +{ + struct inpcb *inp = tw->tw_inpcb; + struct tcphdr *th; + struct mbuf *m; + struct ip *ip = NULL; + u_int hdrlen, optlen; + int error; + struct tcpopt to; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; +#endif + + INP_WLOCK_ASSERT(inp); + + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); + m->m_data += max_linkhdr; + +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif + +#ifdef INET6 + if (isipv6) { + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(inp, ip6, th); + } else +#endif + { + hdrlen = sizeof(struct tcpiphdr); + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, ip, th); + } + to.to_flags = 0; + + /* + * Send a timestamp and echo-reply if both our side and our peer + * have sent timestamps in our SYN's and this is not a RST. + */ + if (tw->t_recent && flags == TH_ACK) { + to.to_flags |= TOF_TS; + to.to_tsval = ticks + tw->ts_offset; + to.to_tsecr = tw->t_recent; + } + optlen = tcp_addoptions(&to, (u_char *)(th + 1)); + + m->m_len = hdrlen + optlen; + m->m_pkthdr.len = m->m_len; + + KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); + + th->th_seq = htonl(tw->snd_nxt); + th->th_ack = htonl(tw->rcv_nxt); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + th->th_flags = flags; + th->th_win = htons(tw->last_win); + +#ifdef INET6 + if (isipv6) { + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen); + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + error = ip6_output(m, inp->in6p_outputopts, NULL, + (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); + } else +#endif + { + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + ip->ip_len = m->m_pkthdr.len; + if (V_path_mtu_discovery) + ip->ip_off |= IP_DF; + error = ip_output(m, inp->inp_options, NULL, + ((tw->tw_so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), + NULL, inp); + } + if (flags & TH_ACK) + TCPSTAT_INC(tcps_sndacks); + else + TCPSTAT_INC(tcps_sndctrl); + TCPSTAT_INC(tcps_sndtotal); + return (error); +} + +static void +tcp_tw_2msl_reset(struct tcptw *tw, int rearm) +{ + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tw->tw_inpcb); + if (rearm) + TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); + tw->tw_time = ticks + 2 * tcp_msl; + TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); +} + +static void +tcp_tw_2msl_stop(struct tcptw *tw) +{ + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); +} + +struct tcptw * +tcp_tw_2msl_scan(int reuse) +{ + struct tcptw *tw; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + for (;;) { + tw = TAILQ_FIRST(&V_twq_2msl); + if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) + break; + INP_WLOCK(tw->tw_inpcb); + tcp_twclose(tw, reuse); + if (reuse) + return (tw); + } + return (NULL); +} diff --git a/freebsd/sys/netinet/tcp_usrreq.c b/freebsd/sys/netinet/tcp_usrreq.c new file mode 100644 index 00000000..fc083e05 --- /dev/null +++ b/freebsd/sys/netinet/tcp_usrreq.c @@ -0,0 +1,1886 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. + * Copyright (c) 2006-2007 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif /* INET6 */ +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include +#include + +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef INET6 +#include +#endif +#include +#include +#ifdef INET6 +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif +#include + +/* + * TCP protocol interface to socket abstraction. 
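+ * + * Each pru_*() handler below is reached through the pr_usrreqs switch, + * so an ordinary userland sequence such as (a sketch): + * + * int s = socket(AF_INET, SOCK_STREAM, 0); + * connect(s, (struct sockaddr *)&sin, sizeof(sin)); + * + * lands first in tcp_usr_attach() and then in tcp_usr_connect().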
+ */ +static int tcp_attach(struct socket *); +static int tcp_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#ifdef INET6 +static int tcp6_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#endif /* INET6 */ +static void tcp_disconnect(struct tcpcb *); +static void tcp_usrclosed(struct tcpcb *); +static void tcp_fill_info(struct tcpcb *, struct tcp_info *); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate = 0 +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); + TCPDEBUG1(); + + error = tcp_attach(so); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + + inp = sotoinpcb(so); + tp = intotcpcb(inp); +out: + TCPDEBUG2(PRU_ATTACH); + return error; +} + +/* + * tcp_detach is called when the socket layer loses its final reference + * to the socket, be it a file descriptor reference, a reference from TCP, + * etc. At this point, there is only one case in which we will keep around + * inpcb state: time wait. + * + * This function can probably be re-absorbed back into tcp_usr_detach() now + * that there is a single detach path. + */ +static void +tcp_detach(struct socket *so, struct inpcb *inp) +{ + struct tcpcb *tp; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); + KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); + + tp = intotcpcb(inp); + + if (inp->inp_flags & INP_TIMEWAIT) { + /* + * There are two cases to handle: one in which the time wait + * state is being discarded (INP_DROPPED), and one in which + * this connection will remain in timewait. In the former, + * it is time to discard all state (except tcptw, which has + * already been discarded by the timewait close code, which + * should be further up the call stack somewhere). In the + * latter case, we detach from the socket, but leave the pcb + * present until timewait ends. + * + * XXXRW: Would it be cleaner to free the tcptw here? + */ + if (inp->inp_flags & INP_DROPPED) { + KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " + "INP_DROPPED && tp != NULL")); + in_pcbdetach(inp); + in_pcbfree(inp); + } else { + in_pcbdetach(inp); + INP_WUNLOCK(inp); + } + } else { + /* + * If the connection is not in timewait, we consider two + * conditions: one in which no further processing is + * necessary (dropped || embryonic), and one in which TCP is + * not yet done, but no longer requires the socket, so the + * pcb will persist for the time being. + * + * XXXRW: Does the second case still occur? + */ + if (inp->inp_flags & INP_DROPPED || + tp->t_state < TCPS_SYN_SENT) { + tcp_discardcb(tp); + in_pcbdetach(inp); + in_pcbfree(inp); + } else + in_pcbdetach(inp); + } +} + +/* + * pru_detach() detaches the TCP protocol from the socket. + * If the protocol state is non-embryonic, then can't + * do this directly: have to initiate a pru_disconnect(), + * which may finish later; embryonic TCB's can just + * be discarded here.
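+ * (``Embryonic'' here corresponds to the t_state < TCPS_SYN_SENT test + * in tcp_detach() above.)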
+ */ +static void +tcp_usr_detach(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + KASSERT(inp->inp_socket != NULL, + ("tcp_usr_detach: inp_socket == NULL")); + tcp_detach(so, inp); + INP_INFO_WUNLOCK(&V_tcbinfo); +} + +/* + * Give the socket an address. + */ +static int +tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in *sinp; + + sinp = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sinp)) + return (EINVAL); + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) + return (EAFNOSUPPORT); + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + error = in_pcbbind(inp, nam, td->td_ucred); +out: + TCPDEBUG2(PRU_BIND); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in6 *sin6p; + + sin6p = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6p)) + return (EINVAL); + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + if (sin6p->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + return (EAFNOSUPPORT); + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + inp->inp_vflag |= INP_IPV4; + else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + error = in_pcbbind(inp, (struct sockaddr *)&sin, + td->td_ucred); + goto out; + } + } + error = in6_pcbbind(inp, nam, td->td_ucred); +out: + TCPDEBUG2(PRU_BIND); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Prepare to accept connections. 
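+ * This is the protocol half of listen(2); the caller has normally + * bound the socket already (a sketch): + * + * bind(s, (struct sockaddr *)&sin, sizeof(sin)); + * listen(s, 128); + * + * where the backlog argument is handed to solisten_proto() below.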
+ */ +static int +tcp_usr_listen(struct socket *so, int backlog, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0 && inp->inp_lport == 0) + error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error == 0) { + tp->t_state = TCPS_LISTEN; + solisten_proto(so, backlog); + tcp_offload_listen_open(tp); + } + SOCK_UNLOCK(so); + +out: + TCPDEBUG2(PRU_LISTEN); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0 && inp->inp_lport == 0) { + inp->inp_vflag &= ~INP_IPV4; + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) + inp->inp_vflag |= INP_IPV4; + error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + } + if (error == 0) { + tp->t_state = TCPS_LISTEN; + solisten_proto(so, backlog); + } + SOCK_UNLOCK(so); + +out: + TCPDEBUG2(PRU_LISTEN); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in *sinp; + + sinp = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sinp)) + return (EINVAL); + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) + return (EAFNOSUPPORT); + if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) + return (error); + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if ((error = tcp_connect(tp, nam, td)) != 0) + goto out; + error = tcp_output_connect(so, nam); +out: + TCPDEBUG2(PRU_CONNECT); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in6 *sin6p; + + TCPDEBUG0; + + sin6p = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6p)) + return (EINVAL); + /* + * Must disallow TCP ``connections'' to multicast addresses. 
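+ * A connect(2) aimed at a group address (for example an AF_INET6 + * sockaddr holding ff02::1) therefore fails with EAFNOSUPPORT before + * any connection state is created.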
+ */ + if (sin6p->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + return (EAFNOSUPPORT); + + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { + error = EINVAL; + goto out; + } + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + if ((error = prison_remote_ip4(td->td_ucred, + &sin.sin_addr)) != 0) + goto out; + if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) + goto out; + error = tcp_output_connect(so, nam); + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + inp->inp_inc.inc_flags |= INC_ISIPV6; + if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) + goto out; + if ((error = tcp6_connect(tp, nam, td)) != 0) + goto out; + error = tcp_output_connect(so, nam); + +out: + TCPDEBUG2(PRU_CONNECT); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static int +tcp_usr_disconnect(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error = 0; + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_disconnect(tp); +out: + TCPDEBUG2(PRU_DISCONNECT); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} + +/* + * Accept a connection. Essentially all the work is done at higher levels; + * just return the address of the peer, storing through addr. + * + * The rationale for acquiring the tcbinfo lock here is somewhat complicated, + * and is described in detail in the commit log entry for r175612. Acquiring + * it delays an accept(2) racing with sonewconn(), which inserts the socket + * before the inpcb address/port fields are initialized. A better fix would + * prevent the socket from being placed in the listen queue until all fields + * are fully initialized. + */ +static int +tcp_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int error = 0; + struct inpcb *inp = NULL; + struct tcpcb *tp = NULL; + struct in_addr addr; + in_port_t port = 0; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) + return (ECONNABORTED); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNABORTED; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + + /* + * We inline in_getpeeraddr and COMMON_END here, so that we can + * copy the data of interest and defer the malloc until after we + * release the lock. 
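+ * The pattern is: copy inp_fport and inp_faddr into locals while the + * inpcb lock is held, unlock, and only then call in_sockaddr(), which + * allocates and may sleep.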
+ */ + port = inp->inp_fport; + addr = inp->inp_faddr; + +out: + TCPDEBUG2(PRU_ACCEPT); + INP_WUNLOCK(inp); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (error == 0) + *nam = in_sockaddr(port, &addr); + return error; +} + +#ifdef INET6 +static int +tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +{ + struct inpcb *inp = NULL; + int error = 0; + struct tcpcb *tp = NULL; + struct in_addr addr; + struct in6_addr addr6; + in_port_t port = 0; + int v4 = 0; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) + return (ECONNABORTED); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNABORTED; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + + /* + * We inline in6_mapped_peeraddr and COMMON_END here, so that we can + * copy the data of interest and defer the malloc until after we + * release the lock. + */ + if (inp->inp_vflag & INP_IPV4) { + v4 = 1; + port = inp->inp_fport; + addr = inp->inp_faddr; + } else { + port = inp->inp_fport; + addr6 = inp->in6p_faddr; + } + +out: + TCPDEBUG2(PRU_ACCEPT); + INP_WUNLOCK(inp); + if (error == 0) { + if (v4) + *nam = in6_v4mapsin6_sockaddr(port, &addr); + else + *nam = in6_sockaddr(port, &addr6); + } + return error; +} +#endif /* INET6 */ + +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&V_tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + socantsendmore(so); + tcp_usrclosed(tp); + if (!(inp->inp_flags & INP_DROPPED)) + error = tcp_output_disconnect(tp); + +out: + TCPDEBUG2(PRU_SHUTDOWN); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + + return (error); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, int flags) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error = 0; + + TCPDEBUG0; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_output_rcvd(tp); + +out: + TCPDEBUG2(PRU_RCVD); + INP_WUNLOCK(inp); + return (error); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pru_* routines + * generally are caller-frees. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + int headlocked = 0; +#ifdef INET6 + int isipv6; +#endif + TCPDEBUG0; + + /* + * We require the pcbinfo lock in two cases: + * + * (1) An implied connect is taking place, which can result in + * binding IPs and ports and hence modification of the pcb hash + * chains. + * + * (2) PRUS_EOF is set, resulting in explicit close on the send. 
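+ * + * Case (1) is a userland sendto(2)/sendmsg(2) on a not-yet-connected + * TCP socket; case (2) is send(2) with MSG_EOF, e.g. (a sketch): + * + * sendto(s, buf, len, 0, (struct sockaddr *)&sin, sizeof(sin)); + * send(s, buf, len, MSG_EOF);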
+ */ + if ((nam != NULL) || (flags & PRUS_EOF)) { + INP_INFO_WLOCK(&V_tcbinfo); + headlocked = 1; + } + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + if (control) + m_freem(control); + if (m) + m_freem(m); + error = ECONNRESET; + goto out; + } +#ifdef INET6 + isipv6 = nam && nam->sa_family == AF_INET6; +#endif /* INET6 */ + tp = intotcpcb(inp); + TCPDEBUG1(); + if (control) { + /* TCP doesn't do control messages (rights, creds, etc) */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if (!(flags & PRUS_OOB)) { + sbappendstream(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + socantsendmore(so); + tcp_usrclosed(tp); + } + if (headlocked) { + INP_INFO_WUNLOCK(&V_tcbinfo); + headlocked = 0; + } + if (!(inp->inp_flags & INP_DROPPED)) { + if (flags & PRUS_MORETOCOME) + tp->t_flags |= TF_MORETOCOME; + error = tcp_output_send(tp); + if (flags & PRUS_MORETOCOME) + tp->t_flags &= ~TF_MORETOCOME; + } + } else { + /* + * XXXRW: PRUS_EOF not implemented with PRUS_OOB? + */ + SOCKBUF_LOCK(&so->so_snd); + if (sbspace(&so->so_snd) < -512) { + SOCKBUF_UNLOCK(&so->so_snd); + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappendstream_locked(&so->so_snd, m); + SOCKBUF_UNLOCK(&so->so_snd); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + INP_INFO_WUNLOCK(&V_tcbinfo); + headlocked = 0; + } else if (nam) { + INP_INFO_WUNLOCK(&V_tcbinfo); + headlocked = 0; + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_flags |= TF_FORCEDATA; + error = tcp_output_send(tp); + tp->t_flags &= ~TF_FORCEDATA; + } +out: + TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); + INP_WUNLOCK(inp); + if (headlocked) + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); +} + +/* + * Abort the TCP. Drop the connection abruptly. 
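+ * Reached via soabort(), for example when a listening socket is closed + * and the connections still on its queues are torn down.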
+ */ +static void +tcp_usr_abort(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + KASSERT(inp->inp_socket != NULL, + ("tcp_usr_abort: inp_socket == NULL")); + + /* + * If we still have full TCP state, and we're not dropped, drop. + */ + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED)) { + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_drop(tp, ECONNABORTED); + TCPDEBUG2(PRU_ABORT); + } + if (!(inp->inp_flags & INP_DROPPED)) { + SOCK_LOCK(so); + so->so_state |= SS_PROTOREF; + SOCK_UNLOCK(so); + inp->inp_flags |= INP_SOCKREF; + } + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); +} + +/* + * TCP socket is closed. Start friendly disconnect. + */ +static void +tcp_usr_close(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); + + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + KASSERT(inp->inp_socket != NULL, + ("tcp_usr_close: inp_socket == NULL")); + + /* + * If we still have full TCP state, and we're not dropped, initiate + * a disconnect. + */ + if (!(inp->inp_flags & INP_TIMEWAIT) && + !(inp->inp_flags & INP_DROPPED)) { + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_disconnect(tp); + TCPDEBUG2(PRU_CLOSE); + } + if (!(inp->inp_flags & INP_DROPPED)) { + SOCK_LOCK(so); + so->so_state |= SS_PROTOREF; + SOCK_UNLOCK(so); + inp->inp_flags |= INP_SOCKREF; + } + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); +} + +/* + * Receive out-of-band data. + */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if ((so->so_oobmark == 0 && + (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + +out: + TCPDEBUG2(PRU_RCVOOB); + INP_WUNLOCK(inp); + return (error); +} + +struct pr_usrreqs tcp_usrreqs = { + .pru_abort = tcp_usr_abort, + .pru_accept = tcp_usr_accept, + .pru_attach = tcp_usr_attach, + .pru_bind = tcp_usr_bind, + .pru_connect = tcp_usr_connect, + .pru_control = in_control, + .pru_detach = tcp_usr_detach, + .pru_disconnect = tcp_usr_disconnect, + .pru_listen = tcp_usr_listen, + .pru_peeraddr = in_getpeeraddr, + .pru_rcvd = tcp_usr_rcvd, + .pru_rcvoob = tcp_usr_rcvoob, + .pru_send = tcp_usr_send, + .pru_shutdown = tcp_usr_shutdown, + .pru_sockaddr = in_getsockaddr, +#if 0 + .pru_soreceive = soreceive_stream, +#endif + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = tcp_usr_close, +}; + +#ifdef INET6 +struct pr_usrreqs tcp6_usrreqs = { + .pru_abort = tcp_usr_abort, + .pru_accept = tcp6_usr_accept, + .pru_attach = tcp_usr_attach, + .pru_bind = tcp6_usr_bind, + .pru_connect = tcp6_usr_connect, + .pru_control = in6_control, + .pru_detach = tcp_usr_detach, + .pru_disconnect = tcp_usr_disconnect, + .pru_listen = tcp6_usr_listen, + .pru_peeraddr = 
in6_mapped_peeraddr, + .pru_rcvd = tcp_usr_rcvd, + .pru_rcvoob = tcp_usr_rcvoob, + .pru_send = tcp_usr_send, + .pru_shutdown = tcp_usr_shutdown, + .pru_sockaddr = in6_mapped_sockaddr, +#if 0 + .pru_soreceive = soreceive_stream, +#endif + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = tcp_usr_close, +}; +#endif /* INET6 */ + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbconnect_setup to do the routing and + * to choose a local host address (interface). If there is an existing + * incarnation of the same connection in TIME-WAIT state and if the remote + * host was sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct in_addr laddr; + u_short lport; + int error; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + laddr = inp->inp_laddr; + lport = inp->inp_lport; + error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, + &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); + if (error && oinp == NULL) + return error; + if (oinp) + return EADDRINUSE; + inp->inp_laddr = laddr; + in_pcbrehash(inp); + + /* + * Compute window scaling to request: + * Scale to fit into sweet spot. See tcp_syncache.c. + * XXX: This should move to tcp_output(). + */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < sb_max) + tp->request_r_scale++; + + soisconnecting(so); + TCPSTAT_INC(tcps_connattempt); + tp->t_state = TCPS_SYN_SENT; + tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); + tp->iss = tcp_new_isn(tp); + tp->t_bw_rtseq = tp->iss; + tcp_sendseqinit(tp); + + return 0; +} + +#ifdef INET6 +static int +tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr addr6; + int error; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + if (inp->inp_lport == 0) { + error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + * in6_pcbladdr() also handles scope zone IDs. + */ + error = in6_pcbladdr(inp, nam, &addr6); + if (error) + return error; + oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + &sin6->sin6_addr, sin6->sin6_port, + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) + ? 
&addr6 + : &inp->in6p_laddr, + inp->inp_lport, 0, NULL); + if (oinp) + return EADDRINUSE; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + inp->in6p_laddr = addr6; + inp->in6p_faddr = sin6->sin6_addr; + inp->inp_fport = sin6->sin6_port; + /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ + inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; + if (inp->inp_flags & IN6P_AUTOFLOWLABEL) + inp->inp_flow |= + (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); + in_pcbrehash(inp); + + /* Compute window scaling to request. */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < sb_max) + tp->request_r_scale++; + + soisconnecting(so); + TCPSTAT_INC(tcps_connattempt); + tp->t_state = TCPS_SYN_SENT; + tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); + tp->iss = tcp_new_isn(tp); + tp->t_bw_rtseq = tp->iss; + tcp_sendseqinit(tp); + + return 0; +} +#endif /* INET6 */ + +/* + * Export TCP internal state information via a struct tcp_info, based on the + * Linux 2.6 API. Not ABI compatible as our constants are mapped differently + * (TCP state machine, etc). We export all information using FreeBSD-native + * constants -- for example, the numeric values for tcpi_state will differ + * from Linux. + */ +static void +tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = tp->t_state; + if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) + ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->t_flags & TF_SACK_PERMIT) + ti->tcpi_options |= TCPI_OPT_SACK; + if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { + ti->tcpi_options |= TCPI_OPT_WSCALE; + ti->tcpi_snd_wscale = tp->snd_scale; + ti->tcpi_rcv_wscale = tp->rcv_scale; + } + + ti->tcpi_rto = tp->t_rxtcur * tick; + ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick; + ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; + ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; + + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; + ti->tcpi_snd_cwnd = tp->snd_cwnd; + + /* + * FreeBSD-specific extension fields for tcp_info. + */ + ti->tcpi_rcv_space = tp->rcv_wnd; + ti->tcpi_rcv_nxt = tp->rcv_nxt; + ti->tcpi_snd_wnd = tp->snd_wnd; + ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_nxt = tp->snd_nxt; + ti->tcpi_snd_mss = tp->t_maxseg; + ti->tcpi_rcv_mss = tp->t_maxseg; + if (tp->t_flags & TF_TOE) + ti->tcpi_options |= TCPI_OPT_TOE; +} + +/* + * tcp_ctloutput() must drop the inpcb lock before performing copyin on + * socket option arguments. When it re-acquires the lock after the copy, it + * has to revalidate that the connection is still valid for the socket + * option. 
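+ * + * Each option handler therefore follows the pattern (a sketch): + * + * INP_WUNLOCK(inp); + * error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); + * if (error) + * return (error); + * INP_WLOCK_RECHECK(inp); + * + * where INP_WLOCK_RECHECK() bails out with ECONNRESET if the pcb was + * dropped in the meantime.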
+ */ +#define INP_WLOCK_RECHECK(inp) do { \ + INP_WLOCK(inp); \ + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ + INP_WUNLOCK(inp); \ + return (ECONNRESET); \ + } \ + tp = intotcpcb(inp); \ +} while(0) + +int +tcp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int error, opt, optval; + struct inpcb *inp; + struct tcpcb *tp; + struct tcp_info ti; + + error = 0; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); + INP_WLOCK(inp); + if (sopt->sopt_level != IPPROTO_TCP) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + INP_WUNLOCK(inp); + error = ip6_ctloutput(so, sopt); + } else { +#endif /* INET6 */ + INP_WUNLOCK(inp); + error = ip_ctloutput(so, sopt); +#ifdef INET6 + } +#endif + return (error); + } + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { +#ifdef TCP_SIGNATURE + case TCP_MD5SIG: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval > 0) + tp->t_flags |= TF_SIGNATURE; + else + tp->t_flags &= ~TF_SIGNATURE; + INP_WUNLOCK(inp); + break; +#endif /* TCP_SIGNATURE */ + case TCP_NODELAY: + case TCP_NOOPT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + switch (sopt->sopt_name) { + case TCP_NODELAY: + opt = TF_NODELAY; + break; + case TCP_NOOPT: + opt = TF_NOOPT; + break; + default: + opt = 0; /* dead code to fool gcc */ + break; + } + + if (optval) + tp->t_flags |= opt; + else + tp->t_flags &= ~opt; + INP_WUNLOCK(inp); + break; + + case TCP_NOPUSH: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval) + tp->t_flags |= TF_NOPUSH; + else if (tp->t_flags & TF_NOPUSH) { + tp->t_flags &= ~TF_NOPUSH; + if (TCPS_HAVEESTABLISHED(tp->t_state)) + error = tcp_output(tp); + } + INP_WUNLOCK(inp); + break; + + case TCP_MAXSEG: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval > 0 && optval <= tp->t_maxseg && + optval + 40 >= V_tcp_minmss) + tp->t_maxseg = optval; + else + error = EINVAL; + INP_WUNLOCK(inp); + break; + + case TCP_INFO: + INP_WUNLOCK(inp); + error = EINVAL; + break; + + default: + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + break; + + case SOPT_GET: + tp = intotcpcb(inp); + switch (sopt->sopt_name) { +#ifdef TCP_SIGNATURE + case TCP_MD5SIG: + optval = (tp->t_flags & TF_SIGNATURE) ? 
1 : 0; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; +#endif + + case TCP_NODELAY: + optval = tp->t_flags & TF_NODELAY; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + case TCP_MAXSEG: + optval = tp->t_maxseg; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + case TCP_NOOPT: + optval = tp->t_flags & TF_NOOPT; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + case TCP_NOPUSH: + optval = tp->t_flags & TF_NOPUSH; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + case TCP_INFO: + tcp_fill_info(tp, &ti); + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &ti, sizeof ti); + break; + default: + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} +#undef INP_WLOCK_RECHECK + +/* + * tcp_sendspace and tcp_recvspace are the default send and receive window + * sizes, respectively. These are obsolescent (this information should + * be set by the route). + */ +u_long tcp_sendspace = 1024*32; +SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, + &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); +u_long tcp_recvspace = 1024*64; +SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); + +/* + * Attach TCP protocol to socket, allocating + * internet protocol control block, tcp control block, + * buffer space, and entering LISTEN state if it is to accept connections. + */ +static int +tcp_attach(struct socket *so) +{ + struct tcpcb *tp; + struct inpcb *inp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + error = soreserve(so, tcp_sendspace, tcp_recvspace); + if (error) + return (error); + } + so->so_rcv.sb_flags |= SB_AUTOSIZE; + so->so_snd.sb_flags |= SB_AUTOSIZE; + INP_INFO_WLOCK(&V_tcbinfo); + error = in_pcballoc(so, &V_tcbinfo); + if (error) { + INP_INFO_WUNLOCK(&V_tcbinfo); + return (error); + } + inp = sotoinpcb(so); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + inp->inp_vflag |= INP_IPV6; + inp->in6p_hops = -1; /* use kernel default */ + } + else +#endif + inp->inp_vflag |= INP_IPV4; + tp = tcp_newtcpcb(inp); + if (tp == NULL) { + in_pcbdetach(inp); + in_pcbfree(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (ENOBUFS); + } + tp->t_state = TCPS_CLOSED; + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + return (0); +} + +/* + * Initiate (or continue) disconnect. + * If embryonic state, just send reset (once). + * If in ``let data drain'' option and linger null, just drop. + * Otherwise (hard), mark socket disconnecting and drop + * current input data; switch states based on user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static void +tcp_disconnect(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(inp); + + /* + * Neither tcp_close() nor tcp_drop() should return NULL, as the + * socket is still open.
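+ * + * The hard-drop branch below is what a userland close(2) with a zero + * linger timeout produces (a sketch): + * + * struct linger l = { 1, 0 }; + * setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof(l)); + * close(s); + * + * which ends in tcp_drop(): an RST instead of the orderly FIN exchange.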
+ */ + if (tp->t_state < TCPS_ESTABLISHED) { + tp = tcp_close(tp); + KASSERT(tp != NULL, + ("tcp_disconnect: tcp_close() returned NULL")); + } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { + tp = tcp_drop(tp, 0); + KASSERT(tp != NULL, + ("tcp_disconnect: tcp_drop() returned NULL")); + } else { + soisdisconnecting(so); + sbflush(&so->so_rcv); + tcp_usrclosed(tp); + if (!(inp->inp_flags & INP_DROPPED)) + tcp_output_disconnect(tp); + } +} + +/* + * User issued close, and wish to trail through shutdown states: + * if never received SYN, just forget it. If got a SYN from peer, + * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. + * If already got a FIN from peer, then almost done; go to LAST_ACK + * state. In all other cases, have already sent FIN to peer (e.g. + * after PRU_SHUTDOWN), and just have to play tedious game waiting + * for peer to send FIN or not respond to keep-alives, etc. + * We can let the user exit from the close as soon as the FIN is acked. + */ +static void +tcp_usrclosed(struct tcpcb *tp) +{ + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_WLOCK_ASSERT(tp->t_inpcb); + + switch (tp->t_state) { + case TCPS_LISTEN: + tcp_offload_listen_close(tp); + /* FALLTHROUGH */ + case TCPS_CLOSED: + tp->t_state = TCPS_CLOSED; + tp = tcp_close(tp); + /* + * tcp_close() should never return NULL here as the socket is + * still open. + */ + KASSERT(tp != NULL, + ("tcp_usrclosed: tcp_close() returned NULL")); + break; + + case TCPS_SYN_SENT: + case TCPS_SYN_RECEIVED: + tp->t_flags |= TF_NEEDFIN; + break; + + case TCPS_ESTABLISHED: + tp->t_state = TCPS_FIN_WAIT_1; + break; + + case TCPS_CLOSE_WAIT: + tp->t_state = TCPS_LAST_ACK; + break; + } + if (tp->t_state >= TCPS_FIN_WAIT_2) { + soisdisconnected(tp->t_inpcb->inp_socket); + /* Prevent the connection hanging in FIN_WAIT_2 forever. */ + if (tp->t_state == TCPS_FIN_WAIT_2) { + int timeout; + + timeout = (tcp_fast_finwait2_recycle) ? + tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + } +} + +#ifdef DDB +static void +db_print_indent(int indent) +{ + int i; + + for (i = 0; i < indent; i++) + db_printf(" "); +} + +static void +db_print_tstate(int t_state) +{ + + switch (t_state) { + case TCPS_CLOSED: + db_printf("TCPS_CLOSED"); + return; + + case TCPS_LISTEN: + db_printf("TCPS_LISTEN"); + return; + + case TCPS_SYN_SENT: + db_printf("TCPS_SYN_SENT"); + return; + + case TCPS_SYN_RECEIVED: + db_printf("TCPS_SYN_RECEIVED"); + return; + + case TCPS_ESTABLISHED: + db_printf("TCPS_ESTABLISHED"); + return; + + case TCPS_CLOSE_WAIT: + db_printf("TCPS_CLOSE_WAIT"); + return; + + case TCPS_FIN_WAIT_1: + db_printf("TCPS_FIN_WAIT_1"); + return; + + case TCPS_CLOSING: + db_printf("TCPS_CLOSING"); + return; + + case TCPS_LAST_ACK: + db_printf("TCPS_LAST_ACK"); + return; + + case TCPS_FIN_WAIT_2: + db_printf("TCPS_FIN_WAIT_2"); + return; + + case TCPS_TIME_WAIT: + db_printf("TCPS_TIME_WAIT"); + return; + + default: + db_printf("unknown"); + return; + } +} + +static void +db_print_tflags(u_int t_flags) +{ + int comma; + + comma = 0; + if (t_flags & TF_ACKNOW) { + db_printf("%sTF_ACKNOW", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_DELACK) { + db_printf("%sTF_DELACK", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NODELAY) { + db_printf("%sTF_NODELAY", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NOOPT) { + db_printf("%sTF_NOOPT", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_SENTFIN) { + db_printf("%sTF_SENTFIN", comma ? 
", " : ""); + comma = 1; + } + if (t_flags & TF_REQ_SCALE) { + db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_RCVD_SCALE) { + db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_REQ_TSTMP) { + db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_RCVD_TSTMP) { + db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_SACK_PERMIT) { + db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NEEDSYN) { + db_printf("%sTF_NEEDSYN", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NEEDFIN) { + db_printf("%sTF_NEEDFIN", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NOPUSH) { + db_printf("%sTF_NOPUSH", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_NOPUSH) { + db_printf("%sTF_NOPUSH", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_MORETOCOME) { + db_printf("%sTF_MORETOCOME", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_LQ_OVERFLOW) { + db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_LASTIDLE) { + db_printf("%sTF_LASTIDLE", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_RXWIN0SENT) { + db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_FASTRECOVERY) { + db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_WASFRECOVERY) { + db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_SIGNATURE) { + db_printf("%sTF_SIGNATURE", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_FORCEDATA) { + db_printf("%sTF_FORCEDATA", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_TSO) { + db_printf("%sTF_TSO", comma ? ", " : ""); + comma = 1; + } + if (t_flags & TF_ECN_PERMIT) { + db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); + comma = 1; + } +} + +static void +db_print_toobflags(char t_oobflags) +{ + int comma; + + comma = 0; + if (t_oobflags & TCPOOB_HAVEDATA) { + db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); + comma = 1; + } + if (t_oobflags & TCPOOB_HADDATA) { + db_printf("%sTCPOOB_HADDATA", comma ? 
", " : ""); + comma = 1; + } +} + +static void +db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) +{ + + db_print_indent(indent); + db_printf("%s at %p\n", name, tp); + + indent += 2; + + db_print_indent(indent); + db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", + LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); + + db_print_indent(indent); + db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", + &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); + + db_print_indent(indent); + db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, + &tp->t_timers->tt_delack, tp->t_inpcb); + + db_print_indent(indent); + db_printf("t_state: %d (", tp->t_state); + db_print_tstate(tp->t_state); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("t_flags: 0x%x (", tp->t_flags); + db_print_tflags(tp->t_flags); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", + tp->snd_una, tp->snd_max, tp->snd_nxt); + + db_print_indent(indent); + db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", + tp->snd_up, tp->snd_wl1, tp->snd_wl2); + + db_print_indent(indent); + db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", + tp->iss, tp->irs, tp->rcv_nxt); + + db_print_indent(indent); + db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n", + tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); + + db_print_indent(indent); + db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", + tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); + + db_print_indent(indent); + db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " + "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, + tp->snd_recover); + + db_print_indent(indent); + db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", + tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); + + db_print_indent(indent); + db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n", + tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); + + db_print_indent(indent); + db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u " + "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, + tp->t_srtt); + + db_print_indent(indent); + db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " + "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, + tp->t_rttbest); + + db_print_indent(indent); + db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n", + tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); + + db_print_indent(indent); + db_printf("t_oobflags: 0x%x (", tp->t_oobflags); + db_print_toobflags(tp->t_oobflags); + db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); + + db_print_indent(indent); + db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", + tp->snd_scale, tp->rcv_scale, tp->request_r_scale); + + db_print_indent(indent); + db_printf("ts_recent: %u ts_recent_age: %u\n", + tp->ts_recent, tp->ts_recent_age); + + db_print_indent(indent); + db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " + "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); + + db_print_indent(indent); + db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x " + "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, + tp->snd_recover_prev, tp->t_badrxtwin); + + db_print_indent(indent); + db_printf("snd_numholes: %d snd_holes first: %p\n", + tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); + + db_print_indent(indent); + db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " + "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); + + /* 
Skip sackblks, sackhint. */ + + db_print_indent(indent); + db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", + tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); +} + +DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) +{ + struct tcpcb *tp; + + if (!have_addr) { + db_printf("usage: show tcpcb \n"); + return; + } + tp = (struct tcpcb *)addr; + + db_print_tcpcb(tp, "tcpcb", 0); +} +#endif diff --git a/freebsd/sys/netinet/tcp_var.h b/freebsd/sys/netinet/tcp_var.h new file mode 100644 index 00000000..77586144 --- /dev/null +++ b/freebsd/sys/netinet/tcp_var.h @@ -0,0 +1,687 @@ +/*- + * Copyright (c) 1982, 1986, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCP_VAR_HH_ +#define _NETINET_TCP_VAR_HH_ + +#include + +#ifdef _KERNEL +#include + +/* + * Kernel variables for tcp. + */ +VNET_DECLARE(int, tcp_do_rfc1323); +#define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) + +#endif /* _KERNEL */ + +/* TCP segment queue entry */ +struct tseg_qent { + LIST_ENTRY(tseg_qent) tqe_q; + int tqe_len; /* TCP segment data length */ + struct tcphdr *tqe_th; /* a pointer to tcp header */ + struct mbuf *tqe_m; /* mbuf contains packet */ +}; +LIST_HEAD(tsegqe_head, tseg_qent); + +struct sackblk { + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. */ +}; + +struct sackhole { + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ + TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ +}; + +struct sackhint { + struct sackhole *nexthole; + int sack_bytes_rexmit; + + int ispare; /* explicit pad for 64bit alignment */ + uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ +}; + +struct tcptemp { + u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ + struct tcphdr tt_t; +}; + +#define tcp6cb tcpcb /* for KAME src sync over BSD*'s */ + +/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 
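
The sackblk/sackhole structures above make up the sender's SACK scoreboard; every range comparison on them has to be wrap-safe, since TCP sequence numbers are modulo 2^32. A toy illustration of that arithmetic, using the same idiom as the kernel's SEQ_LT()/SEQ_GEQ() macros; sack_covers() is an invented helper, and struct sackblk is restated so the sketch compiles on its own:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t tcp_seq;

    /* Wrap-safe comparisons, same idiom as the kernel's SEQ_LT()/SEQ_GEQ(). */
    #define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
    #define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

    struct sackblk {
        tcp_seq start;          /* first SACKed sequence number */
        tcp_seq end;            /* one past the last SACKed byte */
    };

    /* Invented helper: does any block in the array cover seq? */
    static int
    sack_covers(const struct sackblk *blk, int n, tcp_seq seq)
    {
        int i;

        for (i = 0; i < n; i++)
            if (SEQ_GEQ(seq, blk[i].start) && SEQ_LT(seq, blk[i].end))
                return (1);
        return (0);
    }

    int
    main(void)
    {
        struct sackblk sb = { 0xfffffff0u, 0x10u };     /* spans the wrap */

        /* Prints "1 0": the first point is inside the block, the second not. */
        printf("%d %d\n", sack_covers(&sb, 1, 0xfffffff8u),
            sack_covers(&sb, 1, 0x20u));
        return (0);
    }
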
*/ +#ifdef INET6 +#define ND6_HINT(tp) \ +do { \ + if ((tp) && (tp)->t_inpcb && \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ + nd6_nud_hint(NULL, NULL, 0); \ +} while (0) +#else +#define ND6_HINT(tp) +#endif + +/* + * Tcp control block, one per tcp; fields: + * Organized for 16 byte cacheline efficiency. + */ +struct tcpcb { + struct tsegqe_head t_segq; /* segment reassembly queue */ + void *t_pspare[2]; /* new reassembly queue */ + int t_segqlen; /* segment reassembly queue length */ + int t_dupacks; /* consecutive dup acks recd */ + + struct tcp_timer *t_timers; /* All the TCP timers in one struct */ + + struct inpcb *t_inpcb; /* back pointer to internet pcb */ + int t_state; /* state of this connection */ + u_int t_flags; + + struct vnet *t_vnet; /* back pointer to parent vnet */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_long rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_long snd_wnd; /* send window */ + u_long snd_cwnd; /* congestion-controlled window */ + u_long snd_bwnd; /* bandwidth-controlled window */ + u_long snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + u_long snd_bandwidth; /* calculated bandwidth or 0 */ + tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ + + u_int t_maxopd; /* mss plus options */ + + u_int t_rcvtime; /* inactivity time */ + u_int t_starttime; /* time connection was established */ + u_int t_rtttime; /* RTT measurement start time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + u_int t_bw_rtttime; /* used for bandwidth calculation */ + tcp_seq t_bw_rtseq; /* used for bandwidth calculation */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_int t_rttbest; /* best rtt we've seen */ + u_long t_rttupdated; /* number of times rtt sampled */ + u_long max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ +/* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ +/* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_int32_t ts_recent; /* timestamp echo data */ + u_int ts_recent_age; /* when last updated */ + u_int32_t ts_offset; /* our timestamp offset */ + + tcp_seq last_ack_sent; +/* experimental */ + u_long snd_cwnd_prev; /* cwnd prior to retransmit */ + u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ + tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ + u_int t_badrxtwin; /* window for retransmit recovery */ + u_char snd_limited; /* segments limited transmitted */ +/* SACK related state */ + int snd_numholes; /* number of holes seen by sender */ + TAILQ_HEAD(sackhole_head, sackhole) snd_holes; + /* SACK scoreboard (sorted) */ + tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ + tcp_seq sack_newdata; /* New data xmitted in this recovery + episode starts at this seq number */ + struct sackhint sackhint; /* SACK scoreboard hint */ + int t_rttlow; /* smallest observerved RTT */ + u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ + int rfbuf_cnt; /* recv buffer autoscaling byte count */ + struct toe_usrreqs *t_tu; /* offload operations vector */ + void *t_toe; /* TOE pcb pointer */ + int t_bytes_acked; /* # bytes acked during current RTT */ + + int t_ispare; /* explicit pad for 64bit alignment */ + void *t_pspare2[6]; /* 2 CC / 4 TBD */ + uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */ +}; + +/* + * Flags and utility macros for the t_flags field. 
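
The TF_* constants defined just below are independent bits of t_flags, so connection state is tested and updated with ordinary mask operations (the IN_FASTRECOVERY()/ENTER_FASTRECOVERY() macros later in this header are exactly such masks). A stand-alone sketch of the pattern, reusing two of the real bit values; the delayed-ACK promotion shown is only illustrative:

    #include <stdio.h>

    /* Two of the t_flags bits from the list below, restated for the demo. */
    #define TF_ACKNOW   0x000001    /* ack peer immediately */
    #define TF_DELACK   0x000002    /* ack, but try to delay it */

    int
    main(void)
    {
        unsigned int t_flags = 0;

        t_flags |= TF_DELACK;           /* a delayed ACK is pending */
        if (t_flags & TF_DELACK) {      /* promote it to an immediate ACK */
            t_flags &= ~TF_DELACK;
            t_flags |= TF_ACKNOW;
        }
        printf("t_flags = 0x%x\n", t_flags);    /* prints 0x1 */
        return (0);
    }
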
+ */ +#define TF_ACKNOW 0x000001 /* ack peer immediately */ +#define TF_DELACK 0x000002 /* ack, but try to delay it */ +#define TF_NODELAY 0x000004 /* don't delay packets to coalesce */ +#define TF_NOOPT 0x000008 /* don't use tcp options */ +#define TF_SENTFIN 0x000010 /* have sent FIN */ +#define TF_REQ_SCALE 0x000020 /* have/will request window scaling */ +#define TF_RCVD_SCALE 0x000040 /* other side has requested scaling */ +#define TF_REQ_TSTMP 0x000080 /* have/will request timestamps */ +#define TF_RCVD_TSTMP 0x000100 /* a timestamp was received in SYN */ +#define TF_SACK_PERMIT 0x000200 /* other side said I could SACK */ +#define TF_NEEDSYN 0x000400 /* send SYN (implicit state) */ +#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ +#define TF_NOPUSH 0x001000 /* don't push */ +#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ +#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ +#define TF_LASTIDLE 0x040000 /* connection was previously idle */ +#define TF_RXWIN0SENT 0x080000 /* sent a receiver win 0 in response */ +#define TF_FASTRECOVERY 0x100000 /* in NewReno Fast Recovery */ +#define TF_WASFRECOVERY 0x200000 /* was in NewReno Fast Recovery */ +#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ +#define TF_FORCEDATA 0x800000 /* force out a byte */ +#define TF_TSO 0x1000000 /* TSO enabled on this connection */ +#define TF_TOE 0x2000000 /* this connection is offloaded */ +#define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */ +#define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */ +#define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */ + +#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) +#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY +#define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY + +/* + * Flags for the t_oobflags field. + */ +#define TCPOOB_HAVEDATA 0x01 +#define TCPOOB_HADDATA 0x02 + +#ifdef TCP_SIGNATURE +/* + * Defines which are needed by the xform_tcp module and tcp_[in|out]put + * for SADB verification and lookup. + */ +#define TCP_SIGLEN 16 /* length of computed digest in bytes */ +#define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */ +#define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */ +/* + * Only a single SA per host may be specified at this time. An SPI is + * needed in order for the KEY_ALLOCSA() lookup to work. + */ +#define TCP_SIG_SPI 0x1000 +#endif /* TCP_SIGNATURE */ + +/* + * Structure to hold TCP options that are only used during segment + * processing (in tcp_input), but not held in the tcpcb. + * It's basically used to reduce the number of parameters + * to tcp_dooptions and tcp_addoptions. + * The binary order of the to_flags is relevant for packing of the + * options in tcp_addoptions. 
+ */ +struct tcpopt { + u_int64_t to_flags; /* which options are present */ +#define TOF_MSS 0x0001 /* maximum segment size */ +#define TOF_SCALE 0x0002 /* window scaling */ +#define TOF_SACKPERM 0x0004 /* SACK permitted */ +#define TOF_TS 0x0010 /* timestamp */ +#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ +#define TOF_SACK 0x0080 /* Peer sent SACK option */ +#define TOF_MAXOPT 0x0100 + u_int32_t to_tsval; /* new timestamp */ + u_int32_t to_tsecr; /* reflected timestamp */ + u_char *to_sacks; /* pointer to the first SACK blocks */ + u_char *to_signature; /* pointer to the TCP-MD5 signature */ + u_int16_t to_mss; /* maximum segment size */ + u_int8_t to_wscale; /* window scaling */ + u_int8_t to_nsacks; /* number of SACK blocks */ +}; + +/* + * Flags for tcp_dooptions. + */ +#define TO_SYN 0x01 /* parse SYN-only options */ + +struct hc_metrics_lite { /* must stay in sync with hc_metrics */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ +}; + +#ifndef _NETINET_IN_PCB_HH_ +struct in_conninfo; +#endif /* _NETINET_IN_PCB_HH_ */ + +struct tcptw { + struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ + tcp_seq snd_nxt; + tcp_seq rcv_nxt; + tcp_seq iss; + tcp_seq irs; + u_short last_win; /* cached window value */ + u_short tw_so_options; /* copy of so_options */ + struct ucred *tw_cred; /* user credentials */ + u_int32_t t_recent; + u_int32_t ts_offset; /* our timestamp offset */ + u_int t_starttime; + int tw_time; + TAILQ_ENTRY(tcptw) tw_2msl; +}; + +#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) +#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) +#define sototcpcb(so) (intotcpcb(sotoinpcb(so))) + +/* + * The smoothed round-trip time and estimated variance + * are stored as fixed point numbers scaled by the values below. + * For convenience, these scales are also used in smoothing the average + * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). + * With these scales, srtt has 3 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * binary point, and is smoothed with an ALPHA of 0.75. + */ +#define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ + +/* + * The initial retransmission should happen at rtt + 4 * rttvar. + * Because of the way we do the smoothing, srtt and rttvar + * will each average +1/2 tick of bias. When we compute + * the retransmit timer, we want 1/2 tick of rounding and + * 1 extra tick because of +-1/2 tick uncertainty in the + * firing of the timer. The bias will give us exactly the + * 1.5 tick we need. But, because the bias is + * statistical, we have to test that we don't drop below + * the minimum feasible timer (which is 2 ticks). 
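
Once the scale factors cancel, the TCP_REXMTVAL() macro that closes this comment reduces to max(t_rttmin, srtt + 4 * rttvar) in unscaled ticks. A stand-alone check of the arithmetic on plain ints (rexmtval() is an illustrative rendering, not the macro itself):

    #include <stdio.h>

    #define TCP_RTT_SHIFT       5   /* t_srtt is srtt scaled by 32 */
    #define TCP_RTTVAR_SHIFT    4   /* t_rttvar is rttvar scaled by 16 */
    #define TCP_DELTA_SHIFT     2

    #define max(a, b)   ((a) > (b) ? (a) : (b))

    /* Same arithmetic as TCP_REXMTVAL(), on plain ints instead of a tcpcb. */
    static int
    rexmtval(int t_srtt, int t_rttvar, int t_rttmin)
    {
        return (max(t_rttmin,
            ((t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) + t_rttvar)
            >> TCP_DELTA_SHIFT));
    }

    int
    main(void)
    {
        /* A 10-tick RTT with 2 ticks of variance, in scaled form... */
        int t_srtt = 10 << TCP_RTT_SHIFT;
        int t_rttvar = 2 << TCP_RTTVAR_SHIFT;

        /* ...yields srtt + 4*rttvar = 18 ticks (floor of 2). */
        printf("RTO = %d ticks\n", rexmtval(t_srtt, t_rttvar, 2));
        return (0);
    }
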
+ * This version of the macro adapted from a paper by Lawrence + * Brakmo and Larry Peterson which outlines a problem caused + * by insufficient precision in the original implementation, + * which results in inappropriately large RTO values for very + * fast networks. + */ +#define TCP_REXMTVAL(tp) \ + max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) + +/* + * TCP statistics. + * Many of these should be kept per connection, + * but that's inconvenient at the moment. + */ +struct tcpstat { + u_long tcps_connattempt; /* connections initiated */ + u_long tcps_accepts; /* connections accepted */ + u_long tcps_connects; /* connections established */ + u_long tcps_drops; /* connections dropped */ + u_long tcps_conndrops; /* embryonic connections dropped */ + u_long tcps_minmssdrops; /* average minmss too low drops */ + u_long tcps_closed; /* conn. closed (includes drops) */ + u_long tcps_segstimed; /* segs where we tried to get rtt */ + u_long tcps_rttupdated; /* times we succeeded */ + u_long tcps_delack; /* delayed acks sent */ + u_long tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ + u_long tcps_rexmttimeo; /* retransmit timeouts */ + u_long tcps_persisttimeo; /* persist timeouts */ + u_long tcps_keeptimeo; /* keepalive timeouts */ + u_long tcps_keepprobe; /* keepalive probes sent */ + u_long tcps_keepdrops; /* connections dropped in keepalive */ + + u_long tcps_sndtotal; /* total packets sent */ + u_long tcps_sndpack; /* data packets sent */ + u_long tcps_sndbyte; /* data bytes sent */ + u_long tcps_sndrexmitpack; /* data packets retransmitted */ + u_long tcps_sndrexmitbyte; /* data bytes retransmitted */ + u_long tcps_sndrexmitbad; /* unnecessary packet retransmissions */ + u_long tcps_sndacks; /* ack-only packets sent */ + u_long tcps_sndprobe; /* window probes sent */ + u_long tcps_sndurg; /* packets sent with URG only */ + u_long tcps_sndwinup; /* window update-only packets sent */ + u_long tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ + + u_long tcps_rcvtotal; /* total packets received */ + u_long tcps_rcvpack; /* packets received in sequence */ + u_long tcps_rcvbyte; /* bytes received in sequence */ + u_long tcps_rcvbadsum; /* packets received with ccksum errs */ + u_long tcps_rcvbadoff; /* packets received with bad offset */ + u_long tcps_rcvmemdrop; /* packets dropped for lack of memory */ + u_long tcps_rcvshort; /* packets received too short */ + u_long tcps_rcvduppack; /* duplicate-only packets received */ + u_long tcps_rcvdupbyte; /* duplicate-only bytes received */ + u_long tcps_rcvpartduppack; /* packets with some duplicate data */ + u_long tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ + u_long tcps_rcvoopack; /* out-of-order packets received */ + u_long tcps_rcvoobyte; /* out-of-order bytes received */ + u_long tcps_rcvpackafterwin; /* packets with data after window */ + u_long tcps_rcvbyteafterwin; /* bytes rcvd after window */ + u_long tcps_rcvafterclose; /* packets rcvd after "close" */ + u_long tcps_rcvwinprobe; /* rcvd window probe packets */ + u_long tcps_rcvdupack; /* rcvd duplicate acks */ + u_long tcps_rcvacktoomuch; /* rcvd acks for unsent data */ + u_long tcps_rcvackpack; /* rcvd ack packets */ + u_long tcps_rcvackbyte; /* bytes acked by rcvd acks */ + u_long tcps_rcvwinupd; /* rcvd window update packets */ + u_long tcps_pawsdrop; /* segments dropped due to PAWS */ + u_long tcps_predack; /* times hdr predict ok for acks */ + u_long tcps_preddat; /* times hdr predict ok for data pkts */ + u_long tcps_pcbcachemiss; + u_long tcps_cachedrtt; /* times cached RTT in route updated */ + u_long tcps_cachedrttvar; /* times cached rttvar updated */ + u_long tcps_cachedssthresh; /* times cached ssthresh updated */ + u_long tcps_usedrtt; /* times RTT initialized from route */ + u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ + u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + u_long tcps_persistdrop; /* timeout in persist state */ + u_long tcps_badsyn; /* bogus SYN, e.g. premature ACK */ + u_long tcps_mturesent; /* resends due to MTU discovery */ + u_long tcps_listendrop; /* listen queue overflows */ + u_long tcps_badrst; /* ignored RSTs in the window */ + + u_long tcps_sc_added; /* entry added to syncache */ + u_long tcps_sc_retransmitted; /* syncache entry was retransmitted */ + u_long tcps_sc_dupsyn; /* duplicate SYN packet */ + u_long tcps_sc_dropped; /* could not reply to packet */ + u_long tcps_sc_completed; /* successful extraction of entry */ + u_long tcps_sc_bucketoverflow; /* syncache per-bucket limit hit */ + u_long tcps_sc_cacheoverflow; /* syncache cache limit hit */ + u_long tcps_sc_reset; /* RST removed entry from syncache */ + u_long tcps_sc_stale; /* timed out or listen socket gone */ + u_long tcps_sc_aborted; /* syncache entry aborted */ + u_long tcps_sc_badack; /* removed due to bad ACK */ + u_long tcps_sc_unreach; /* ICMP unreachable received */ + u_long tcps_sc_zonefail; /* zalloc() failed */ + u_long tcps_sc_sendcookie; /* SYN cookie sent */ + u_long tcps_sc_recvcookie; /* SYN cookie received */ + + u_long tcps_hc_added; /* entry added to hostcache */ + u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ + + u_long tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ + + /* SACK related stats */ + u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ + u_long tcps_sack_rexmits; /* SACK rexmit segments */ + u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ + u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ + + /* ECN related stats */ + u_long tcps_ecn_ce; /* ECN Congestion Experienced */ + u_long tcps_ecn_ect0; /* ECN Capable Transport */ + u_long tcps_ecn_ect1; /* ECN Capable Transport */ + u_long tcps_ecn_shs; /* ECN successful handshakes */ + u_long tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ + + u_long _pad[12]; /* 6 UTO, 6 TBD */ +}; + +#ifdef _KERNEL +/* + * In-kernel consumers can use these accessor macros directly to update + * stats. 
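
The TCPSTAT_ADD()/TCPSTAT_INC() macros defined next simply bump one u_long field of the per-vnet struct tcpstat. A stand-alone rendering of the pattern with the VNET indirection dropped; the two counters shown are real fields, the rest of the struct is elided:

    #include <stdio.h>

    typedef unsigned long u_long;

    struct tcpstat {
        u_long tcps_connattempt;    /* connections initiated */
        u_long tcps_accepts;        /* connections accepted */
        /* ...the real struct carries many more counters... */
    };

    static struct tcpstat tcpstat;  /* stands in for the per-vnet V_tcpstat */

    #define TCPSTAT_ADD(name, val)  tcpstat.name += (val)
    #define TCPSTAT_INC(name)       TCPSTAT_ADD(name, 1)

    int
    main(void)
    {
        TCPSTAT_INC(tcps_connattempt);
        TCPSTAT_ADD(tcps_accepts, 2);
        printf("%lu %lu\n", tcpstat.tcps_connattempt, tcpstat.tcps_accepts);
        return (0);
    }
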
+ */ +#define TCPSTAT_ADD(name, val) V_tcpstat.name += (val) +#define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) + +/* + * Kernel module consumers must use this accessor macro. + */ +void kmod_tcpstat_inc(int statnum); +#define KMOD_TCPSTAT_INC(name) \ + kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(u_long)) +#endif + +/* + * TCB structure exported to user-land via sysctl(3). + * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been + * included. Not all of our clients do. + */ +#if defined(_NETINET_IN_PCB_HH_) && defined(_SYS_SOCKETVAR_HH_) +struct xtcpcb { + size_t xt_len; + struct inpcb xt_inp; + struct tcpcb xt_tp; + struct xsocket xt_socket; + u_quad_t xt_alignment_hack; +}; +#endif + +/* + * Names for TCP sysctl objects + */ +#define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ +#define TCPCTL_MSSDFLT 3 /* MSS default */ +#define TCPCTL_STATS 4 /* statistics (read-only) */ +#define TCPCTL_RTTDFLT 5 /* default RTT estimate */ +#define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ +#define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ +#define TCPCTL_SENDSPACE 8 /* send buffer space */ +#define TCPCTL_RECVSPACE 9 /* receive buffer space */ +#define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ +#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ +#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ +#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ +#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ +#define TCPCTL_DROP 15 /* drop tcp connection */ +#define TCPCTL_MAXID 16 +#define TCPCTL_FINWAIT2_TIMEOUT 17 + +#define TCPCTL_NAMES { \ + { 0, 0 }, \ + { "rfc1323", CTLTYPE_INT }, \ + { "mssdflt", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "rttdflt", CTLTYPE_INT }, \ + { "keepidle", CTLTYPE_INT }, \ + { "keepintvl", CTLTYPE_INT }, \ + { "sendspace", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "keepinit", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ + { "delacktime", CTLTYPE_INT }, \ + { "v6mssdflt", CTLTYPE_INT }, \ + { "maxid", CTLTYPE_INT }, \ +} + + +#ifdef _KERNEL +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet_tcp); +SYSCTL_DECL(_net_inet_tcp_sack); +MALLOC_DECLARE(M_TCPLOG); +#endif + +VNET_DECLARE(struct inpcbhead, tcb); /* queue of active tcpcb's */ +VNET_DECLARE(struct inpcbinfo, tcbinfo); +VNET_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ +extern int tcp_log_in_vain; +VNET_DECLARE(int, tcp_mssdflt); /* XXX */ +VNET_DECLARE(int, tcp_minmss); +VNET_DECLARE(int, tcp_delack_enabled); +VNET_DECLARE(int, tcp_do_rfc3390); +VNET_DECLARE(int, tcp_do_newreno); +VNET_DECLARE(int, path_mtu_discovery); +VNET_DECLARE(int, ss_fltsz); +VNET_DECLARE(int, ss_fltsz_local); +#define V_tcb VNET(tcb) +#define V_tcbinfo VNET(tcbinfo) +#define V_tcpstat VNET(tcpstat) +#define V_tcp_mssdflt VNET(tcp_mssdflt) +#define V_tcp_minmss VNET(tcp_minmss) +#define V_tcp_delack_enabled VNET(tcp_delack_enabled) +#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) +#define V_tcp_do_newreno VNET(tcp_do_newreno) +#define V_path_mtu_discovery VNET(path_mtu_discovery) +#define V_ss_fltsz VNET(ss_fltsz) +#define V_ss_fltsz_local VNET(ss_fltsz_local) + +VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ +VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */ +#define V_tcp_do_sack VNET(tcp_do_sack) +#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) + +VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */ +VNET_DECLARE(int, tcp_ecn_maxretries); +#define V_tcp_do_ecn 
VNET(tcp_do_ecn) +#define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) + +int tcp_addoptions(struct tcpopt *, u_char *); +struct tcpcb * + tcp_close(struct tcpcb *); +void tcp_discardcb(struct tcpcb *); +void tcp_twstart(struct tcpcb *); +#if 0 +int tcp_twrecycleable(struct tcptw *tw); +#endif +void tcp_twclose(struct tcptw *_tw, int _reuse); +void tcp_ctlinput(int, struct sockaddr *, void *); +int tcp_ctloutput(struct socket *, struct sockopt *); +#ifndef __rtems__ +struct tcpcb * + tcp_drop(struct tcpcb *, int); +#else +struct tcpcb * +tcp_drop(struct tcpcb *tp, int errno); +#endif +void tcp_drain(void); +void tcp_init(void); +#ifdef VIMAGE +void tcp_destroy(void); +#endif +void tcp_fini(void *); +char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, + const void *); +char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, + const void *); +int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); +void tcp_reass_init(void); +void tcp_reass_flush(struct tcpcb *); +#ifdef VIMAGE +void tcp_reass_destroy(void); +#endif +void tcp_input(struct mbuf *, int); +u_long tcp_maxmtu(struct in_conninfo *, int *); +u_long tcp_maxmtu6(struct in_conninfo *, int *); +void tcp_mss_update(struct tcpcb *, int, struct hc_metrics_lite *, int *); +void tcp_mss(struct tcpcb *, int); +int tcp_mssopt(struct in_conninfo *); +#ifndef __rtems__ +struct inpcb * + tcp_drop_syn_sent(struct inpcb *, int); +struct inpcb * + tcp_mtudisc(struct inpcb *, int); +#else +struct inpcb * +tcp_drop_syn_sent(struct inpcb *inp, int errno); +struct inpcb * +tcp_mtudisc(struct inpcb *inp, int errno); +#endif +struct tcpcb * + tcp_newtcpcb(struct inpcb *); +int tcp_output(struct tcpcb *); +void tcp_respond(struct tcpcb *, void *, + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); +void tcp_tw_init(void); +#ifdef VIMAGE +void tcp_tw_destroy(void); +#endif +void tcp_tw_zone_change(void); +int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, + struct mbuf *, int); +int tcp_twrespond(struct tcptw *, int); +void tcp_setpersist(struct tcpcb *); +#ifdef TCP_SIGNATURE +int tcp_signature_compute(struct mbuf *, int, int, int, u_char *, u_int); +#endif +void tcp_slowtimo(void); +struct tcptemp * + tcpip_maketemplate(struct inpcb *); +void tcpip_fillheaders(struct inpcb *, void *, void *); +void tcp_timer_activate(struct tcpcb *, int, u_int); +int tcp_timer_active(struct tcpcb *, int); +void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); +void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); +/* + * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) + */ +void tcp_hc_init(void); +#ifdef VIMAGE +void tcp_hc_destroy(void); +#endif +void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); +u_long tcp_hc_getmtu(struct in_conninfo *); +void tcp_hc_updatemtu(struct in_conninfo *, u_long); +void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); + +extern struct pr_usrreqs tcp_usrreqs; +extern u_long tcp_sendspace; +extern u_long tcp_recvspace; +tcp_seq tcp_new_isn(struct tcpcb *); + +void tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); +void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); +void tcp_clean_sackreport(struct tcpcb *tp); +void tcp_sack_adjust(struct tcpcb *tp); +struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); +void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); +void tcp_free_sackholes(struct tcpcb *tp); +int tcp_newreno(struct tcpcb *, struct 
tcphdr *); +u_long tcp_seq_subtract(u_long, u_long ); + +#endif /* _KERNEL */ + +#endif /* _NETINET_TCP_VAR_HH_ */ diff --git a/freebsd/sys/netinet/tcpip.h b/freebsd/sys/netinet/tcpip.h new file mode 100644 index 00000000..337c07a6 --- /dev/null +++ b/freebsd/sys/netinet/tcpip.h @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcpip.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_TCPIP_HH_ +#define _NETINET_TCPIP_HH_ + +/* + * Tcp+ip header, after ip options removed. + */ +struct tcpiphdr { + struct ipovly ti_i; /* overlaid ip structure */ + struct tcphdr ti_t; /* tcp header */ +}; +#define ti_x1 ti_i.ih_x1 +#define ti_pr ti_i.ih_pr +#define ti_len ti_i.ih_len +#define ti_src ti_i.ih_src +#define ti_dst ti_i.ih_dst +#define ti_sport ti_t.th_sport +#define ti_dport ti_t.th_dport +#define ti_seq ti_t.th_seq +#define ti_ack ti_t.th_ack +#define ti_x2 ti_t.th_x2 +#define ti_off ti_t.th_off +#define ti_flags ti_t.th_flags +#define ti_win ti_t.th_win +#define ti_sum ti_t.th_sum +#define ti_urp ti_t.th_urp + +#endif diff --git a/freebsd/sys/netinet/toedev.h b/freebsd/sys/netinet/toedev.h new file mode 100644 index 00000000..4623845c --- /dev/null +++ b/freebsd/sys/netinet/toedev.h @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
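
The ti_* defines in tcpip.h above give the overlaid ip (ti_i) and tcp (ti_t) sub-structures one flat namespace: each macro is just a member rename. The same aliasing technique with toy stand-ins (struct toy_ip/struct toy_tcp are simplified inventions, not the real ipovly/tcphdr):

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins; the real ipovly/tcphdr carry more fields. */
    struct toy_ip  { uint16_t ih_len; uint32_t ih_src, ih_dst; };
    struct toy_tcp { uint16_t th_sport, th_dport; uint32_t th_seq; };

    struct toy_tcpiphdr {
        struct toy_ip  ti_i;    /* overlaid ip structure */
        struct toy_tcp ti_t;    /* tcp header */
    };
    #define ti_len      ti_i.ih_len     /* flat aliases, as in tcpip.h */
    #define ti_sport    ti_t.th_sport

    int
    main(void)
    {
        struct toy_tcpiphdr ti = { { 40, 0, 0 }, { 1234, 80, 0 } };

        /* Reads as one header even though two structs are overlaid. */
        printf("len=%u sport=%u\n", (unsigned)ti.ti_len,
            (unsigned)ti.ti_sport);
        return (0);
    }
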
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_TOEDEV_HH_ +#define _NETINET_TOEDEV_HH_ + +#ifndef _KERNEL +#error "no user-serviceable parts inside" +#endif + +extern uint32_t toedev_registration_count; + +/* Parameter values for offload_get_phys_egress(). */ +enum { + TOE_OPEN, + TOE_FAILOVER, +}; + +/* Parameter values for toe_failover(). */ +enum { + TOE_ACTIVE_SLAVE, + TOE_LINK_DOWN, + TOE_LINK_UP, + TOE_RELEASE, + TOE_RELEASE_ALL, +}; + +#define TOENAMSIZ 16 + +/* Get the toedev associated with a ifnet. */ +#define TOEDEV(ifp) ((ifp)->if_llsoftc) + +struct offload_id { + unsigned int id; + unsigned long data; +}; + +struct ifnet; +struct rt_entry; +struct tom_info; +struct sysctl_oid; +struct socket; +struct mbuf; + +struct toedev { + TAILQ_ENTRY(toedev) entry; + char tod_name[TOENAMSIZ]; /* TOE device name */ + unsigned int tod_ttid; /* TOE type id */ + unsigned long tod_flags; /* device flags */ + unsigned int tod_mtu; /* max TX offloaded data */ + unsigned int tod_nconn; /* max # of offloaded + * connections + */ + struct ifnet *tod_lldev; /* first interface */ + const struct tom_info *tod_offload_mod; /* TCP offload module */ + + /* + * This TOE device is capable of offloading the connection for socket so + */ + int (*tod_can_offload)(struct toedev *dev, struct socket *so); + + /* + * Establish a connection to nam using the TOE device dev + */ + int (*tod_connect)(struct toedev *dev, struct socket *so, + struct rtentry *rt, struct sockaddr *nam); + /* + * Send an mbuf down to the toe device + */ + int (*tod_send)(struct toedev *dev, struct mbuf *m); + /* + * Receive an array of mbufs from the TOE device dev + */ + int (*tod_recv)(struct toedev *dev, struct mbuf **m, int n); + /* + * Device specific ioctl interface + */ + int (*tod_ctl)(struct toedev *dev, unsigned int req, void *data); + /* + * Update L2 entry in toedev + */ + void (*tod_arp_update)(struct toedev *dev, struct rtentry *neigh); + /* + * Failover from one toe device to another + */ + void (*tod_failover)(struct toedev *dev, struct ifnet *bond_ifp, + struct ifnet *ndev, int event); + void *tod_priv; /* driver private data */ + void *tod_l2opt; /* optional layer 2 data */ + void *tod_l3opt; /* optional layer 3 data */ + void *tod_l4opt; /* optional layer 4 data */ + void *tod_ulp; /* upper lever protocol */ +}; + +struct tom_info { + TAILQ_ENTRY(tom_info) entry; + int (*ti_attach)(struct toedev *dev, + const struct offload_id *entry); + int (*ti_detach)(struct toedev *dev); + const char *ti_name; + const struct offload_id *ti_id_table; +}; + +static __inline void +init_offload_dev(struct toedev *dev) +{ +} + +int register_tom(struct tom_info *t); +int unregister_tom(struct tom_info *t); +int register_toedev(struct toedev *dev, const 
char *name); +int unregister_toedev(struct toedev *dev); +int activate_offload(struct toedev *dev); +int toe_send(struct toedev *dev, struct mbuf *m); +void toe_arp_update(struct rtentry *rt); +struct ifnet *offload_get_phys_egress(struct ifnet *ifp, + struct socket *so, int context); +int toe_receive_mbuf(struct toedev *dev, struct mbuf **m, int n); + +static __inline void +toe_neigh_update(struct ifnet *ifp) +{ +} + +static __inline void +toe_failover(struct ifnet *bond_ifp, struct ifnet *fail_ifp, int event) +{ +} + +static __inline int +toe_enslave(struct ifnet *bond_ifp, struct ifnet *slave_ifp) +{ + return (0); +} + +#endif /* _NETINET_TOEDEV_HH_ */ diff --git a/freebsd/sys/netinet/udp.h b/freebsd/sys/netinet/udp.h new file mode 100644 index 00000000..e7010ac5 --- /dev/null +++ b/freebsd/sys/netinet/udp.h @@ -0,0 +1,2 @@ +#include +#include diff --git a/freebsd/sys/netinet/udp_usrreq.c b/freebsd/sys/netinet/udp_usrreq.c new file mode 100644 index 00000000..f992f5f6 --- /dev/null +++ b/freebsd/sys/netinet/udp_usrreq.c @@ -0,0 +1,1633 @@ +#include + +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. + * Copyright (c) 2008 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)udp_usrreq.c 8.6 (Berkeley) 5/23/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include + +#ifdef IPSEC +#include +#include +#endif + +#include + +#include + +/* + * UDP protocol implementation. + * Per RFC 768, August, 1980. + */ + +/* + * BSD 4.2 defaulted the udp checksum to be off. 
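
The udp_cksum knob this comment introduces is exported just below as the net.inet.udp.checksum sysctl (UDPCTL_CHECKSUM). A user-space sketch that inspects it with sysctlbyname(3) on a FreeBSD-derived target:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        int on;
        size_t len = sizeof(on);

        if (sysctlbyname("net.inet.udp.checksum", &on, &len, NULL, 0) == -1) {
            perror("sysctlbyname");
            return (1);
        }
        printf("udp checksums %s\n", on ? "enabled" : "disabled");
        return (0);
    }
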
Turning off udp checksums + * removes the only data integrity mechanism for packets and malformed + * packets that would otherwise be discarded due to bad checksums, and may + * cause problems (especially for NFS data blocks). + */ +static int udp_cksum = 1; +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, &udp_cksum, + 0, "compute udp checksum"); + +int udp_log_in_vain = 0; +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, + &udp_log_in_vain, 0, "Log all incoming UDP packets"); + +VNET_DEFINE(int, udp_blackhole) = 0; +SYSCTL_VNET_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, + &VNET_NAME(udp_blackhole), 0, + "Do not send port unreachables for refused connects"); + +u_long udp_sendspace = 9216; /* really max datagram size */ + /* 40 1K datagrams */ +SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, + &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); + +u_long udp_recvspace = 40 * (1024 + +#ifdef INET6 + sizeof(struct sockaddr_in6) +#else + sizeof(struct sockaddr_in) +#endif + ); + +SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, + &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); + +VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ +VNET_DEFINE(struct inpcbinfo, udbinfo); +static VNET_DEFINE(uma_zone_t, udpcb_zone); +#define V_udpcb_zone VNET(udpcb_zone) + +#ifndef UDBHASHSIZE +#define UDBHASHSIZE 128 +#endif + +VNET_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ +SYSCTL_VNET_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RW, + &VNET_NAME(udpstat), udpstat, + "UDP statistics (struct udpstat, netinet/udp_var.h)"); + +static void udp_detach(struct socket *so); +static int udp_output(struct inpcb *, struct mbuf *, struct sockaddr *, + struct mbuf *, struct thread *); +#ifdef IPSEC +#ifdef IPSEC_NAT_T +#define UF_ESPINUDP_ALL (UF_ESPINUDP_NON_IKE|UF_ESPINUDP) +#ifdef INET +static struct mbuf *udp4_espdecap(struct inpcb *, struct mbuf *, int); +#endif +#endif /* IPSEC_NAT_T */ +#endif /* IPSEC */ + +static void +udp_zone_change(void *tag) +{ + + uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); + uma_zone_set_max(V_udpcb_zone, maxsockets); +} + +static int +udp_inpcb_init(void *mem, int size, int flags) +{ + struct inpcb *inp; + + inp = mem; + INP_LOCK_INIT(inp, "inp", "udpinp"); + return (0); +} + +void +udp_init(void) +{ + + + INP_INFO_LOCK_INIT(&V_udbinfo, "udp"); + LIST_INIT(&V_udb); +#ifdef VIMAGE + V_udbinfo.ipi_vnet = curvnet; +#endif + V_udbinfo.ipi_listhead = &V_udb; + V_udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB, + &V_udbinfo.ipi_hashmask); + V_udbinfo.ipi_porthashbase = hashinit(UDBHASHSIZE, M_PCB, + &V_udbinfo.ipi_porthashmask); + V_udbinfo.ipi_zone = uma_zcreate("udp_inpcb", sizeof(struct inpcb), + NULL, NULL, udp_inpcb_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_udbinfo.ipi_zone, maxsockets); + + V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(V_udpcb_zone, maxsockets); + + EVENTHANDLER_REGISTER(maxsockets_change, udp_zone_change, NULL, + EVENTHANDLER_PRI_ANY); +} + +/* + * Kernel module interface for updating udpstat. The argument is an index + * into udpstat treated as an array of u_long. While this encodes the + * general layout of udpstat into the caller, it doesn't encode its location, + * so that future changes to add, for example, per-CPU stats support won't + * cause binary compatibility problems for kernel modules. 
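
Concretely, the module-facing updater defined next takes a slot index rather than a field address, and the corresponding KMOD_*STAT_INC() macros (see KMOD_TCPSTAT_INC() in tcp_var.h above) derive that index with offsetof()/sizeof(u_long). A stand-alone rendering with a trimmed struct udpstat:

    #include <stddef.h>
    #include <stdio.h>

    typedef unsigned long u_long;

    struct udpstat {                /* trimmed; the real struct is larger */
        u_long udps_ipackets;
        u_long udps_hdrops;
        u_long udps_badsum;
    };

    static struct udpstat udpstat;

    /* Callers pass a field's slot number, never its address. */
    static void
    kmod_udpstat_inc(int statnum)
    {
        (*((u_long *)&udpstat + statnum))++;
    }

    #define KMOD_UDPSTAT_INC(name) \
        kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long))

    int
    main(void)
    {
        KMOD_UDPSTAT_INC(udps_badsum);
        printf("badsum=%lu\n", udpstat.udps_badsum);    /* prints 1 */
        return (0);
    }
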
+ */ +void +kmod_udpstat_inc(int statnum) +{ + + (*((u_long *)&V_udpstat + statnum))++; +} + +int +udp_newudpcb(struct inpcb *inp) +{ + struct udpcb *up; + + up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO); + if (up == NULL) + return (ENOBUFS); + inp->inp_ppcb = up; + return (0); +} + +void +udp_discardcb(struct udpcb *up) +{ + + uma_zfree(V_udpcb_zone, up); +} + +#ifdef VIMAGE +void +udp_destroy(void) +{ + + hashdestroy(V_udbinfo.ipi_hashbase, M_PCB, + V_udbinfo.ipi_hashmask); + hashdestroy(V_udbinfo.ipi_porthashbase, M_PCB, + V_udbinfo.ipi_porthashmask); + + uma_zdestroy(V_udpcb_zone); + uma_zdestroy(V_udbinfo.ipi_zone); + INP_INFO_LOCK_DESTROY(&V_udbinfo); +} +#endif + +/* + * Subroutine of udp_input(), which appends the provided mbuf chain to the + * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that + * contains the source address. If the socket ends up being an IPv6 socket, + * udp_append() will convert to a sockaddr_in6 before passing the address + * into the socket code. + */ +static void +udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, + struct sockaddr_in *udp_in) +{ + struct sockaddr *append_sa; + struct socket *so; + struct mbuf *opts = 0; +#ifdef INET6 + struct sockaddr_in6 udp_in6; +#endif +#ifdef IPSEC +#ifdef IPSEC_NAT_T +#ifdef INET + struct udpcb *up; +#endif +#endif +#endif + + INP_RLOCK_ASSERT(inp); + +#ifdef IPSEC + /* Check AH/ESP integrity. */ + if (ipsec4_in_reject(n, inp)) { + m_freem(n); + V_ipsec4stat.in_polvio++; + return; + } +#ifdef IPSEC_NAT_T +#ifdef INET + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); + if (up->u_flags & UF_ESPINUDP_ALL) { /* IPSec UDP encaps. */ + n = udp4_espdecap(inp, n, off); + if (n == NULL) /* Consumed. */ + return; + } +#endif /* INET */ +#endif /* IPSEC_NAT_T */ +#endif /* IPSEC */ +#ifdef MAC + if (mac_inpcb_check_deliver(inp, n) != 0) { + m_freem(n); + return; + } +#endif + if (inp->inp_flags & INP_CONTROLOPTS || + inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) + (void)ip6_savecontrol_v4(inp, n, &opts, NULL); + else +#endif + ip_savecontrol(inp, &opts, ip, n); + } +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + bzero(&udp_in6, sizeof(udp_in6)); + udp_in6.sin6_len = sizeof(udp_in6); + udp_in6.sin6_family = AF_INET6; + in6_sin_2_v4mapsin6(udp_in, &udp_in6); + append_sa = (struct sockaddr *)&udp_in6; + } else +#endif + append_sa = (struct sockaddr *)udp_in; + m_adj(n, off); + + so = inp->inp_socket; + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { + SOCKBUF_UNLOCK(&so->so_rcv); + m_freem(n); + if (opts) + m_freem(opts); + UDPSTAT_INC(udps_fullsock); + } else + sorwakeup_locked(so); +} + +void +udp_input(struct mbuf *m, int off) +{ + int iphlen = off; + struct ip *ip; + struct udphdr *uh; + struct ifnet *ifp; + struct inpcb *inp; + struct udpcb *up; + int len; + struct ip save_ip; + struct sockaddr_in udp_in; +#ifdef IPFIREWALL_FORWARD + struct m_tag *fwd_tag; +#endif + + ifp = m->m_pkthdr.rcvif; + UDPSTAT_INC(udps_ipackets); + + /* + * Strip IP options, if any; should skip this, make available to + * user, and use on returned packets, but we don't yet have a way to + * check the checksum with options still present. + */ + if (iphlen > sizeof (struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + + /* + * Get IP and UDP header together in first mbuf. 
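
The m_pullup() check that follows guarantees iphlen + sizeof(struct udphdr) contiguous bytes before any header field is dereferenced. The user-space analog is a plain length check before reading; a sketch with simplified stand-in headers (parse_udp() and the toy_* structs are inventions for illustration):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Invented stand-ins: 20 bytes, like a minimal IPv4 header. */
    struct toy_ip  { uint8_t vhl, tos; uint16_t len; uint8_t rest[16]; };
    struct toy_udp { uint16_t sport, dport, ulen, sum; };

    /* Refuse to touch the UDP header unless the buffer really holds it. */
    static int
    parse_udp(const uint8_t *pkt, size_t pktlen, struct toy_udp *uh)
    {
        size_t iphlen = sizeof(struct toy_ip);

        if (pktlen < iphlen + sizeof(*uh))
            return (-1);    /* the analog of an m_pullup() failure */
        memcpy(uh, pkt + iphlen, sizeof(*uh));  /* alignment-safe copy */
        return (0);
    }

    int
    main(void)
    {
        uint8_t pkt[sizeof(struct toy_ip) + sizeof(struct toy_udp)] = { 0 };
        struct toy_udp uh;

        printf("%d\n", parse_udp(pkt, sizeof(pkt), &uh));   /* 0, ok */
        printf("%d\n", parse_udp(pkt, 4, &uh));             /* -1, short */
        return (0);
    }
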
+ */ + ip = mtod(m, struct ip *); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == 0) { + UDPSTAT_INC(udps_hdrops); + return; + } + ip = mtod(m, struct ip *); + } + uh = (struct udphdr *)((caddr_t)ip + iphlen); + + /* + * Destination port of 0 is illegal, based on RFC768. + */ + if (uh->uh_dport == 0) + goto badunlocked; + + /* + * Construct sockaddr format source address. Stuff source address + * and datagram in user buffer. + */ + bzero(&udp_in, sizeof(udp_in)); + udp_in.sin_len = sizeof(udp_in); + udp_in.sin_family = AF_INET; + udp_in.sin_port = uh->uh_sport; + udp_in.sin_addr = ip->ip_src; + + /* + * Make mbuf data length reflect UDP length. If not enough data to + * reflect UDP length, drop. + */ + len = ntohs((u_short)uh->uh_ulen); + if (ip->ip_len != len) { + if (len > ip->ip_len || len < sizeof(struct udphdr)) { + UDPSTAT_INC(udps_badlen); + goto badunlocked; + } + m_adj(m, len - ip->ip_len); + /* ip->ip_len = len; */ + } + + /* + * Save a copy of the IP header in case we want restore it for + * sending an ICMP error message in response. + */ + if (!V_udp_blackhole) + save_ip = *ip; + else + memset(&save_ip, 0, sizeof(save_ip)); + + /* + * Checksum extended UDP header and data. + */ + if (uh->uh_sum) { + u_short uh_sum; + + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + uh_sum = m->m_pkthdr.csum_data; + else + uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htonl((u_short)len + + m->m_pkthdr.csum_data + IPPROTO_UDP)); + uh_sum ^= 0xffff; + } else { + char b[9]; + + bcopy(((struct ipovly *)ip)->ih_x1, b, 9); + bzero(((struct ipovly *)ip)->ih_x1, 9); + ((struct ipovly *)ip)->ih_len = uh->uh_ulen; + uh_sum = in_cksum(m, len + sizeof (struct ip)); + bcopy(b, ((struct ipovly *)ip)->ih_x1, 9); + } + if (uh_sum) { + UDPSTAT_INC(udps_badsum); + m_freem(m); + return; + } + } else + UDPSTAT_INC(udps_nosum); + +#ifdef IPFIREWALL_FORWARD + /* + * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. + */ + fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); + if (fwd_tag != NULL) { + struct sockaddr_in *next_hop; + + /* + * Do the hack. + */ + next_hop = (struct sockaddr_in *)(fwd_tag + 1); + ip->ip_dst = next_hop->sin_addr; + uh->uh_dport = ntohs(next_hop->sin_port); + + /* + * Remove the tag from the packet. We don't need it anymore. + */ + m_tag_delete(m, fwd_tag); + } +#endif + + INP_INFO_RLOCK(&V_udbinfo); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || + in_broadcast(ip->ip_dst, ifp)) { + struct inpcb *last; + struct ip_moptions *imo; + + last = NULL; + LIST_FOREACH(inp, &V_udb, inp_list) { + if (inp->inp_lport != uh->uh_dport) + continue; +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr != INADDR_ANY && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + continue; + if (inp->inp_faddr.s_addr != INADDR_ANY && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + continue; + if (inp->inp_fport != 0 && + inp->inp_fport != uh->uh_sport) + continue; + + INP_RLOCK(inp); + + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. 
[RFC3678] + */ + imo = inp->inp_moptions; + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && + imo != NULL) { + struct sockaddr_in group; + int blocked; + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, ifp, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + IPSTAT_INC(ips_notmember); + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) + UDPSTAT_INC(udps_filtermcast); + INP_RUNLOCK(inp); + continue; + } + } + if (last != NULL) { + struct mbuf *n; + + n = m_copy(m, 0, M_COPYALL); + up = intoudpcb(last); + if (up->u_tun_func == NULL) { + if (n != NULL) + udp_append(last, + ip, n, + iphlen + + sizeof(struct udphdr), + &udp_in); + } else { + /* + * Engage the tunneling protocol we + * will have to leave the info_lock + * up, since we are hunting through + * multiple UDP's. + */ + + (*up->u_tun_func)(n, iphlen, last); + } + INP_RUNLOCK(last); + } + last = inp; + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids + * searching through all pcbs in the common case of a + * non-shared port. It assumes that an application + * will never clear these options after setting them. + */ + if ((last->inp_socket->so_options & + (SO_REUSEPORT|SO_REUSEADDR)) == 0) + break; + } + + if (last == NULL) { + /* + * No matching pcb found; discard datagram. (No need + * to send an ICMP Port Unreachable for a broadcast + * or multicast datgram.) + */ + UDPSTAT_INC(udps_noportbcast); + goto badheadlocked; + } + up = intoudpcb(last); + if (up->u_tun_func == NULL) { + udp_append(last, ip, m, iphlen + sizeof(struct udphdr), + &udp_in); + } else { + /* + * Engage the tunneling protocol. + */ + (*up->u_tun_func)(m, iphlen, last); + } + INP_RUNLOCK(last); + INP_INFO_RUNLOCK(&V_udbinfo); + return; + } + + /* + * Locate pcb for datagram. + */ + inp = in_pcblookup_hash(&V_udbinfo, ip->ip_src, uh->uh_sport, + ip->ip_dst, uh->uh_dport, 1, ifp); + if (inp == NULL) { + if (udp_log_in_vain) { + char buf[4*sizeof "123"]; + + strcpy(buf, inet_ntoa(ip->ip_dst)); + log(LOG_INFO, + "Connection attempt to UDP %s:%d from %s:%d\n", + buf, ntohs(uh->uh_dport), inet_ntoa(ip->ip_src), + ntohs(uh->uh_sport)); + } + UDPSTAT_INC(udps_noport); + if (m->m_flags & (M_BCAST | M_MCAST)) { + UDPSTAT_INC(udps_noportbcast); + goto badheadlocked; + } + if (V_udp_blackhole) + goto badheadlocked; + if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) + goto badheadlocked; + *ip = save_ip; + ip->ip_len += iphlen; + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); + INP_INFO_RUNLOCK(&V_udbinfo); + return; + } + + /* + * Check the minimum TTL for socket. + */ + INP_RLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); + if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { + INP_RUNLOCK(inp); + goto badunlocked; + } + up = intoudpcb(inp); + if (up->u_tun_func == NULL) { + udp_append(inp, ip, m, iphlen + sizeof(struct udphdr), &udp_in); + } else { + /* + * Engage the tunneling protocol. + */ + + (*up->u_tun_func)(m, iphlen, inp); + } + INP_RUNLOCK(inp); + return; + +badheadlocked: + if (inp) + INP_RUNLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); +badunlocked: + m_freem(m); +} + +/* + * Notify a udp user of an asynchronous error; just wake up so that they can + * collect error status. 
+ */ +struct inpcb * +udp_notify(struct inpcb *inp, int errno) +{ + + /* + * While udp_ctlinput() always calls udp_notify() with a read lock + * when invoking it directly, in_pcbnotifyall() currently uses write + * locks due to sharing code with TCP. For now, accept either a read + * or a write lock, but a read lock is sufficient. + */ + INP_LOCK_ASSERT(inp); + + inp->inp_socket->so_error = errno; + sorwakeup(inp->inp_socket); + sowwakeup(inp->inp_socket); + return (inp); +} + +void +udp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct ip *ip = vip; + struct udphdr *uh; + struct in_addr faddr; + struct inpcb *inp; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + /* + * Redirects don't need to be handled up here. + */ + if (PRC_IS_REDIRECT(cmd)) + return; + + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * + * XXX: We never get this from ICMP, otherwise it makes an excellent + * DoS attack on machines with many connections. + */ + if (cmd == PRC_HOSTDEAD) + ip = NULL; + else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip != NULL) { + uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); + INP_INFO_RLOCK(&V_udbinfo); + inp = in_pcblookup_hash(&V_udbinfo, faddr, uh->uh_dport, + ip->ip_src, uh->uh_sport, 0, NULL); + if (inp != NULL) { + INP_RLOCK(inp); + if (inp->inp_socket != NULL) { + udp_notify(inp, inetctlerrmap[cmd]); + } + INP_RUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_udbinfo); + } else + in_pcbnotifyall(&V_udbinfo, faddr, inetctlerrmap[cmd], + udp_notify); +} + +static int +udp_pcblist(SYSCTL_HANDLER_ARGS) +{ + int error, i, n; + struct inpcb *inp, **inp_list; + inp_gen_t gencnt; + struct xinpgen xig; + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = V_udbinfo.ipi_count; + n += imax(n / 8, 10); + req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); + return (0); + } + + if (req->newptr != 0) + return (EPERM); + + /* + * OK, now we're committed to doing something. 
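
udp_pcblist(), begun above, exports a sequence of xinpcb records framed by a leading and a trailing struct xinpgen. A userland consumer in the style of netstat(1) might walk it as sketched below; the structure layouts must match the running kernel, so treat this as illustrative only:

    #include <sys/param.h>
    #include <sys/socket.h>
    #include <sys/socketvar.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <netinet/in_pcb.h>
    #include <arpa/inet.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        struct xinpgen *xig, *oxig;
        size_t len = 0;
        char *buf;

        /* Sizing call: this is the req->oldptr == 0 path above. */
        if (sysctlbyname("net.inet.udp.pcblist", NULL, &len, NULL, 0) < 0)
            return (1);
        if ((buf = malloc(len)) == NULL)
            return (1);
        if (sysctlbyname("net.inet.udp.pcblist", buf, &len, NULL, 0) < 0)
            return (1);

        oxig = (struct xinpgen *)buf;
        for (xig = (struct xinpgen *)((char *)oxig + oxig->xig_len);
            xig->xig_len > sizeof(*xig);
            xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
            struct inpcb *inp = &((struct xinpcb *)xig)->xi_inp;

            printf("udp local port %u\n", ntohs(inp->inp_lport));
        }
        /* Trailing xinpgen: a changed generation count means the set
         * of PCBs moved underneath us and a retry may be in order. */
        if (xig->xig_gen != oxig->xig_gen)
            fprintf(stderr, "pcb list changed while reading\n");
        free(buf);
        return (0);
    }
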
+ */ + INP_INFO_RLOCK(&V_udbinfo); + gencnt = V_udbinfo.ipi_gencnt; + n = V_udbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_udbinfo); + + error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + + n * sizeof(struct xinpcb)); + if (error != 0) + return (error); + + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) + return (error); + + inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) + return (ENOMEM); + + INP_INFO_RLOCK(&V_udbinfo); + for (inp = LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; + inp = LIST_NEXT(inp, inp_list)) { + INP_WLOCK(inp); + if (inp->inp_gencnt <= gencnt && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { + in_pcbref(inp); + inp_list[i++] = inp; + } + INP_WUNLOCK(inp); + } + INP_INFO_RUNLOCK(&V_udbinfo); + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_RLOCK(inp); + if (inp->inp_gencnt <= gencnt) { + struct xinpcb xi; + + bzero(&xi, sizeof(xi)); + xi.xi_len = sizeof xi; + /* XXX should avoid extra copy */ + bcopy(inp, &xi.xi_inp, sizeof *inp); + if (inp->inp_socket) + sotoxsocket(inp->inp_socket, &xi.xi_socket); + xi.xi_inp.inp_gencnt = inp->inp_gencnt; + INP_RUNLOCK(inp); + error = SYSCTL_OUT(req, &xi, sizeof xi); + } else + INP_RUNLOCK(inp); + } + INP_INFO_WLOCK(&V_udbinfo); + for (i = 0; i < n; i++) { + inp = inp_list[i]; + INP_WLOCK(inp); + if (!in_pcbrele(inp)) + INP_WUNLOCK(inp); + } + INP_INFO_WUNLOCK(&V_udbinfo); + + if (!error) { + /* + * Give the user an updated idea of our state. If the + * generation differs from what we told her before, she knows + * that something happened while we were processing this + * request, and it might be necessary to retry. 
+ */ + INP_INFO_RLOCK(&V_udbinfo); + xig.xig_gen = V_udbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = V_udbinfo.ipi_count; + INP_INFO_RUNLOCK(&V_udbinfo); + error = SYSCTL_OUT(req, &xig, sizeof xig); + } + free(inp_list, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, + udp_pcblist, "S,xinpcb", "List of active UDP sockets"); + +static int +udp_getcred(SYSCTL_HANDLER_ARGS) +{ + struct xucred xuc; + struct sockaddr_in addrs[2]; + struct inpcb *inp; + int error; + + error = priv_check(req->td, PRIV_NETINET_GETCRED); + if (error) + return (error); + error = SYSCTL_IN(req, addrs, sizeof(addrs)); + if (error) + return (error); + INP_INFO_RLOCK(&V_udbinfo); + inp = in_pcblookup_hash(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, + addrs[0].sin_addr, addrs[0].sin_port, 1, NULL); + if (inp != NULL) { + INP_RLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); + if (inp->inp_socket == NULL) + error = ENOENT; + if (error == 0) + error = cr_canseeinpcb(req->td->td_ucred, inp); + if (error == 0) + cru2x(inp->inp_cred, &xuc); + INP_RUNLOCK(inp); + } else { + INP_INFO_RUNLOCK(&V_udbinfo); + error = ENOENT; + } + if (error == 0) + error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); + return (error); +} + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, + CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, + udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); + +int +udp_ctloutput(struct socket *so, struct sockopt *sopt) +{ + int error = 0, optval; + struct inpcb *inp; +#ifdef IPSEC_NAT_T + struct udpcb *up; +#endif + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + INP_WLOCK(inp); + if (sopt->sopt_level != IPPROTO_UDP) { +#ifdef INET6 + if (INP_CHECK_SOCKAF(so, AF_INET6)) { + INP_WUNLOCK(inp); + error = ip6_ctloutput(so, sopt); + } else { +#endif + INP_WUNLOCK(inp); + error = ip_ctloutput(so, sopt); +#ifdef INET6 + } +#endif + return (error); + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + switch (sopt->sopt_name) { + case UDP_ENCAP: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); + INP_WLOCK(inp); +#ifdef IPSEC_NAT_T + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); +#endif + switch (optval) { + case 0: + /* Clear all UDP encap. 
*/ +#ifdef IPSEC_NAT_T + up->u_flags &= ~UF_ESPINUDP_ALL; +#endif + break; +#ifdef IPSEC_NAT_T + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + up->u_flags &= ~UF_ESPINUDP_ALL; + if (optval == UDP_ENCAP_ESPINUDP) + up->u_flags |= UF_ESPINUDP; + else if (optval == UDP_ENCAP_ESPINUDP_NON_IKE) + up->u_flags |= UF_ESPINUDP_NON_IKE; + break; +#endif + default: + error = EINVAL; + break; + } + INP_WUNLOCK(inp); + break; + default: + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + break; + case SOPT_GET: + switch (sopt->sopt_name) { +#ifdef IPSEC_NAT_T + case UDP_ENCAP: + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + optval = up->u_flags & UF_ESPINUDP_ALL; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; +#endif + default: + INP_WUNLOCK(inp); + error = ENOPROTOOPT; + break; + } + break; + } + return (error); +} + +static int +udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct thread *td) +{ + struct udpiphdr *ui; + int len = m->m_pkthdr.len; + struct in_addr faddr, laddr; + struct cmsghdr *cm; + struct sockaddr_in *sin, src; + int error = 0; + int ipflags; + u_short fport, lport; + int unlock_udbinfo; + + /* + * udp_output() may need to temporarily bind or connect the current + * inpcb. As such, we don't know up front whether we will need the + * pcbinfo lock or not. Do any work to decide what is needed up + * front before acquiring any locks. + */ + if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { + if (control) + m_freem(control); + m_freem(m); + return (EMSGSIZE); + } + + src.sin_family = 0; + if (control != NULL) { + /* + * XXX: Currently, we assume all the optional information is + * stored in a single mbuf. + */ + if (control->m_next) { + m_freem(control); + m_freem(m); + return (EINVAL); + } + for (; control->m_len > 0; + control->m_data += CMSG_ALIGN(cm->cmsg_len), + control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { + cm = mtod(control, struct cmsghdr *); + if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 + || cm->cmsg_len > control->m_len) { + error = EINVAL; + break; + } + if (cm->cmsg_level != IPPROTO_IP) + continue; + + switch (cm->cmsg_type) { + case IP_SENDSRCADDR: + if (cm->cmsg_len != + CMSG_LEN(sizeof(struct in_addr))) { + error = EINVAL; + break; + } + bzero(&src, sizeof(src)); + src.sin_family = AF_INET; + src.sin_len = sizeof(src); + src.sin_port = inp->inp_lport; + src.sin_addr = + *(struct in_addr *)CMSG_DATA(cm); + break; + + default: + error = ENOPROTOOPT; + break; + } + if (error) + break; + } + m_freem(control); + } + if (error) { + m_freem(m); + return (error); + } + + /* + * Depending on whether or not the application has bound or connected + * the socket, we may have to do varying levels of work. The optimal + * case is for a connected UDP socket, as a global lock isn't + * required at all. + * + * In order to decide which we need, we require stability of the + * inpcb binding, which we ensure by acquiring a read lock on the + * inpcb. This doesn't strictly follow the lock order, so we play + * the trylock and retry game; note that we may end up with more + * conservative locks than required the second time around, so later + * assertions have to accept that. Further analysis of the number of + * misses under contention is required. 
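
The IP_SENDSRCADDR control message parsed above is supplied from userland via sendmsg(2). A sketch follows; send_from() is an invented helper, and note that udp_output() rejects the option unless the socket already has a local port bound:

    /* Send one datagram from an explicit source address on a socket
     * bound to INADDR_ANY (but with a bound port). */
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <netinet/in.h>
    #include <string.h>

    static ssize_t
    send_from(int s, struct sockaddr_in *dst, struct in_addr src,
        void *data, size_t len)
    {
        char cbuf[CMSG_SPACE(sizeof(struct in_addr))];
        struct cmsghdr *cm;
        struct msghdr msg;
        struct iovec iov;

        memset(cbuf, 0, sizeof(cbuf));
        memset(&msg, 0, sizeof(msg));
        iov.iov_base = data;
        iov.iov_len = len;
        msg.msg_name = dst;
        msg.msg_namelen = sizeof(*dst);
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);

        cm = CMSG_FIRSTHDR(&msg);
        cm->cmsg_level = IPPROTO_IP;
        cm->cmsg_type = IP_SENDSRCADDR;
        cm->cmsg_len = CMSG_LEN(sizeof(struct in_addr));
        memcpy(CMSG_DATA(cm), &src, sizeof(src));

        return (sendmsg(s, &msg, 0));
    }
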
+ */ + sin = (struct sockaddr_in *)addr; + INP_RLOCK(inp); + if (sin != NULL && + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + INP_RUNLOCK(inp); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + unlock_udbinfo = 2; + } else if ((sin != NULL && ( + (sin->sin_addr.s_addr == INADDR_ANY) || + (sin->sin_addr.s_addr == INADDR_BROADCAST) || + (inp->inp_laddr.s_addr == INADDR_ANY) || + (inp->inp_lport == 0))) || + (src.sin_family == AF_INET)) { + if (!INP_INFO_TRY_RLOCK(&V_udbinfo)) { + INP_RUNLOCK(inp); + INP_INFO_RLOCK(&V_udbinfo); + INP_RLOCK(inp); + } + unlock_udbinfo = 1; + } else + unlock_udbinfo = 0; + + /* + * If the IP_SENDSRCADDR control message was specified, override the + * source address for this datagram. Its use is invalidated if the + * address thus specified is incomplete or clobbers other inpcbs. + */ + laddr = inp->inp_laddr; + lport = inp->inp_lport; + if (src.sin_family == AF_INET) { + INP_INFO_LOCK_ASSERT(&V_udbinfo); + if ((lport == 0) || + (laddr.s_addr == INADDR_ANY && + src.sin_addr.s_addr == INADDR_ANY)) { + error = EINVAL; + goto release; + } + error = in_pcbbind_setup(inp, (struct sockaddr *)&src, + &laddr.s_addr, &lport, td->td_ucred); + if (error) + goto release; + } + + /* + * If a UDP socket has been connected, then a local address/port will + * have been selected and bound. + * + * If a UDP socket has not been connected to, then an explicit + * destination address must be used, in which case a local + * address/port may not have been selected and bound. + */ + if (sin != NULL) { + INP_LOCK_ASSERT(inp); + if (inp->inp_faddr.s_addr != INADDR_ANY) { + error = EISCONN; + goto release; + } + + /* + * Jail may rewrite the destination address, so let it do + * that before we use it. + */ + error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); + if (error) + goto release; + + /* + * If a local address or port hasn't yet been selected, or if + * the destination address needs to be rewritten due to using + * a special INADDR_ constant, invoke in_pcbconnect_setup() + * to do the heavy lifting. Once a port is selected, we + * commit the binding back to the socket; we also commit the + * binding of the address if in jail. + * + * If we already have a valid binding and we're not + * requesting a destination address rewrite, use a fast path. + */ + if (inp->inp_laddr.s_addr == INADDR_ANY || + inp->inp_lport == 0 || + sin->sin_addr.s_addr == INADDR_ANY || + sin->sin_addr.s_addr == INADDR_BROADCAST) { + INP_INFO_LOCK_ASSERT(&V_udbinfo); + error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, + &lport, &faddr.s_addr, &fport, NULL, + td->td_ucred); + if (error) + goto release; + + /* + * XXXRW: Why not commit the port if the address is + * !INADDR_ANY? + */ + /* Commit the local port if newly assigned. */ + if (inp->inp_laddr.s_addr == INADDR_ANY && + inp->inp_lport == 0) { + INP_INFO_WLOCK_ASSERT(&V_udbinfo); + INP_WLOCK_ASSERT(inp); + /* + * Remember addr if jailed, to prevent + * rebinding. + */ + if (prison_flag(td->td_ucred, PR_IP4)) + inp->inp_laddr = laddr; + inp->inp_lport = lport; + if (in_pcbinshash(inp) != 0) { + inp->inp_lport = 0; + error = EAGAIN; + goto release; + } + inp->inp_flags |= INP_ANONPORT; + } + } else { + faddr = sin->sin_addr; + fport = sin->sin_port; + } + } else { + INP_LOCK_ASSERT(inp); + faddr = inp->inp_faddr; + fport = inp->inp_fport; + if (faddr.s_addr == INADDR_ANY) { + error = ENOTCONN; + goto release; + } + } + + /* + * Calculate data length and get a mbuf for UDP, IP, and possible + * link-layer headers. 
Immediately slide the data pointer forward again, + * since we won't use that space at this layer. + */ + M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_DONTWAIT); + if (m == NULL) { + error = ENOBUFS; + goto release; + } + m->m_data += max_linkhdr; + m->m_len -= max_linkhdr; + m->m_pkthdr.len -= max_linkhdr; + + /* + * Fill in mbuf with extended UDP header and addresses and length put + * into network format. + */ + ui = mtod(m, struct udpiphdr *); + bzero(ui->ui_x1, sizeof(ui->ui_x1)); /* XXX still needed? */ + ui->ui_pr = IPPROTO_UDP; + ui->ui_src = laddr; + ui->ui_dst = faddr; + ui->ui_sport = lport; + ui->ui_dport = fport; + ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); + + /* + * Set the Don't Fragment bit in the IP header. + */ + if (inp->inp_flags & INP_DONTFRAG) { + struct ip *ip; + + ip = (struct ip *)&ui->ui_i; + ip->ip_off |= IP_DF; + } + + ipflags = 0; + if (inp->inp_socket->so_options & SO_DONTROUTE) + ipflags |= IP_ROUTETOIF; + if (inp->inp_socket->so_options & SO_BROADCAST) + ipflags |= IP_ALLOWBROADCAST; + if (inp->inp_flags & INP_ONESBCAST) + ipflags |= IP_SENDONES; + +#ifdef MAC + mac_inpcb_create_mbuf(inp, m); +#endif + + /* + * Set up checksum and output datagram. + */ + if (udp_cksum) { + if (inp->inp_flags & INP_ONESBCAST) + faddr.s_addr = INADDR_BROADCAST; + ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, + htons((u_short)len + sizeof(struct udphdr) + IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + } else + ui->ui_sum = 0; + ((struct ip *)ui)->ip_len = sizeof (struct udpiphdr) + len; + ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; /* XXX */ + ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ + UDPSTAT_INC(udps_opackets); + + if (unlock_udbinfo == 2) + INP_INFO_WUNLOCK(&V_udbinfo); + else if (unlock_udbinfo == 1) + INP_INFO_RUNLOCK(&V_udbinfo); + error = ip_output(m, inp->inp_options, NULL, ipflags, + inp->inp_moptions, inp); + if (unlock_udbinfo == 2) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + return (error); + +release: + if (unlock_udbinfo == 2) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + } else if (unlock_udbinfo == 1) { + INP_RUNLOCK(inp); + INP_INFO_RUNLOCK(&V_udbinfo); + } else + INP_RUNLOCK(inp); + m_freem(m); + return (error); +} + + +#if defined(IPSEC) && defined(IPSEC_NAT_T) +#ifdef INET +/* + * Potentially decap ESP in UDP frame. Check for an ESP header + * and optional marker; if present, strip the UDP header and + * push the result through IPSec. + * + * Returns mbuf to be processed (potentially re-allocated) or + * NULL if consumed and/or processed. + */ +static struct mbuf * +udp4_espdecap(struct inpcb *inp, struct mbuf *m, int off) +{ + size_t minlen, payload, skip, iphlen; + caddr_t data; + struct udpcb *up; + struct m_tag *tag; + struct udphdr *udphdr; + struct ip *ip; + + INP_RLOCK_ASSERT(inp); + + /* + * Pull up data so the longest case is contiguous: + * IP/UDP hdr + non ESP marker + ESP hdr. + */ + minlen = off + sizeof(uint64_t) + sizeof(struct esp); + if (minlen > m->m_pkthdr.len) + minlen = m->m_pkthdr.len; + if ((m = m_pullup(m, minlen)) == NULL) { + V_ipsec4stat.in_inval++; + return (NULL); /* Bypass caller processing. */ + } + data = mtod(m, caddr_t); /* Points to ip header. */ + payload = m->m_len - off; /* Size of payload. */ + + if (payload == 1 && data[off] == '\xff') + return (m); /* NB: keepalive packet, no decap. */
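
The single-byte 0xff datagram accepted just above is the NAT-T keepalive. A userland peer would enable decapsulation and emit such keepalives roughly as sketched below, assuming a kernel built with IPSEC_NAT_T and a socket already connected to the peer's NAT-T port (conventionally 4500); the function name is invented:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/udp.h>    /* UDP_ENCAP, UDP_ENCAP_ESPINUDP */

    static int
    natt_enable_and_keepalive(int s)
    {
        int encap = UDP_ENCAP_ESPINUDP;
        unsigned char ka = 0xff;    /* the payload == 1 case above */

        if (setsockopt(s, IPPROTO_UDP, UDP_ENCAP, &encap,
            sizeof(encap)) < 0)
            return (-1);
        return (send(s, &ka, sizeof(ka), 0) == 1 ? 0 : -1);
    }
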
+ up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: udpcb NULL", __func__)); + KASSERT((up->u_flags & UF_ESPINUDP_ALL) != 0, + ("u_flags 0x%x", up->u_flags)); + + /* + * Check that the payload is large enough to hold an + * ESP header and compute the amount of data to remove. + * + * NB: the caller has already done a pullup for us. + * XXX can we assume alignment and eliminate bcopys? + */ + if (up->u_flags & UF_ESPINUDP_NON_IKE) { + /* + * draft-ietf-ipsec-nat-t-ike-0[01].txt and + * draft-ietf-ipsec-udp-encaps-(00/)01.txt, ignoring + * possible AH mode non-IKE marker+non-ESP marker + * from draft-ietf-ipsec-udp-encaps-00.txt. + */ + uint64_t marker; + + if (payload <= sizeof(uint64_t) + sizeof(struct esp)) + return (m); /* NB: no decap. */ + bcopy(data + off, &marker, sizeof(uint64_t)); + if (marker != 0) /* Non-IKE marker. */ + return (m); /* NB: no decap. */ + skip = sizeof(uint64_t) + sizeof(struct udphdr); + } else { + uint32_t spi; + + if (payload <= sizeof(struct esp)) { + V_ipsec4stat.in_inval++; + m_freem(m); + return (NULL); /* Discard. */ + } + bcopy(data + off, &spi, sizeof(uint32_t)); + if (spi == 0) /* Non-ESP marker. */ + return (m); /* NB: no decap. */ + skip = sizeof(struct udphdr); + } + + /* + * Set up a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember + * the UDP ports. This is required if we want to select + * the right SPD for multiple hosts behind the same NAT. + * + * NB: ports are maintained in network byte order everywhere + * in the NAT-T code. + */ + tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS, + 2 * sizeof(uint16_t), M_NOWAIT); + if (tag == NULL) { + V_ipsec4stat.in_nomem++; + m_freem(m); + return (NULL); /* Discard. */ + } + iphlen = off - sizeof(struct udphdr); + udphdr = (struct udphdr *)(data + iphlen); + ((uint16_t *)(tag + 1))[0] = udphdr->uh_sport; + ((uint16_t *)(tag + 1))[1] = udphdr->uh_dport; + m_tag_prepend(m, tag); + + /* + * Remove the UDP header (and possibly the non-ESP marker); + * the IP header length is iphlen. + * Before: + * <--- off ---> + * +----+------+-----+ + * | IP | UDP | ESP | + * +----+------+-----+ + * <-skip-> + * After: + * +----+-----+ + * | IP | ESP | + * +----+-----+ + * <-skip-> + */ + ovbcopy(data, data + skip, iphlen); + m_adj(m, skip); + + ip = mtod(m, struct ip *); + ip->ip_len -= skip; + ip->ip_p = IPPROTO_ESP; + + /* + * We cannot yet update the cksums so clear any + * h/w cksum flags as they are no longer valid. + */ + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) + m->m_pkthdr.csum_flags &= ~(CSUM_DATA_VALID|CSUM_PSEUDO_HDR); + + (void) ipsec4_common_input(m, iphlen, ip->ip_p); + return (NULL); /* NB: consumed, bypass processing.
*/ +} +#endif /* INET */ +#endif /* defined(IPSEC) && defined(IPSEC_NAT_T) */ + +static void +udp_abort(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_abort: inp == NULL")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + if (inp->inp_faddr.s_addr != INADDR_ANY) { + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + soisdisconnected(so); + } + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); +} + +static int +udp_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp == NULL, ("udp_attach: inp != NULL")); + error = soreserve(so, udp_sendspace, udp_recvspace); + if (error) + return (error); + INP_INFO_WLOCK(&V_udbinfo); + error = in_pcballoc(so, &V_udbinfo); + if (error) { + INP_INFO_WUNLOCK(&V_udbinfo); + return (error); + } + + inp = sotoinpcb(so); + inp->inp_vflag |= INP_IPV4; + inp->inp_ip_ttl = V_ip_defttl; + + error = udp_newudpcb(inp); + if (error) { + in_pcbdetach(inp); + in_pcbfree(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (error); + } + + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (0); +} + +int +udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f) +{ + struct inpcb *inp; + struct udpcb *up; + + KASSERT(so->so_type == SOCK_DGRAM, + ("udp_set_kernel_tunneling: !dgram")); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); + INP_WLOCK(inp); + up = intoudpcb(inp); + if (up->u_tun_func != NULL) { + INP_WUNLOCK(inp); + return (EBUSY); + } + up->u_tun_func = f; + INP_WUNLOCK(inp); + return (0); +} + +static int +udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int error; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_bind: inp == NULL")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + error = in_pcbbind(inp, nam, td->td_ucred); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (error); +} + +static void +udp_close(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_close: inp == NULL")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + if (inp->inp_faddr.s_addr != INADDR_ANY) { + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + soisdisconnected(so); + } + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); +} + +static int +udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp; + int error; + struct sockaddr_in *sin; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_connect: inp == NULL")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + if (inp->inp_faddr.s_addr != INADDR_ANY) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (EISCONN); + } + sin = (struct sockaddr_in *)nam; + error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); + if (error != 0) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (error); + } + error = in_pcbconnect(inp, nam, td->td_ucred); + if (error == 0) + soisconnected(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (error); +} + +static void +udp_detach(struct socket *so) +{ + struct inpcb *inp; + struct udpcb *up; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_detach: inp == NULL")); + KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, + ("udp_detach: not disconnected")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + up = intoudpcb(inp); + KASSERT(up != NULL, ("%s: up == NULL", __func__)); + inp->inp_ppcb = NULL; + in_pcbdetach(inp); + 
in_pcbfree(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + udp_discardcb(up); +} + +static int +udp_disconnect(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); + INP_INFO_WLOCK(&V_udbinfo); + INP_WLOCK(inp); + if (inp->inp_faddr.s_addr == INADDR_ANY) { + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (ENOTCONN); + } + + in_pcbdisconnect(inp); + inp->inp_laddr.s_addr = INADDR_ANY; + SOCK_LOCK(so); + so->so_state &= ~SS_ISCONNECTED; /* XXX */ + SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_udbinfo); + return (0); +} + +static int +udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, + struct mbuf *control, struct thread *td) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_send: inp == NULL")); + return (udp_output(inp, m, addr, control, td)); +} + +int +udp_shutdown(struct socket *so) +{ + struct inpcb *inp; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("udp_shutdown: inp == NULL")); + INP_WLOCK(inp); + socantsendmore(so); + INP_WUNLOCK(inp); + return (0); +} + +struct pr_usrreqs udp_usrreqs = { + .pru_abort = udp_abort, + .pru_attach = udp_attach, + .pru_bind = udp_bind, + .pru_connect = udp_connect, + .pru_control = in_control, + .pru_detach = udp_detach, + .pru_disconnect = udp_disconnect, + .pru_peeraddr = in_getpeeraddr, + .pru_send = udp_send, + .pru_soreceive = soreceive_dgram, + .pru_sosend = sosend_dgram, + .pru_shutdown = udp_shutdown, + .pru_sockaddr = in_getsockaddr, + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = udp_close, +}; diff --git a/freebsd/sys/netinet/udp_var.h b/freebsd/sys/netinet/udp_var.h new file mode 100644 index 00000000..0bff6ea9 --- /dev/null +++ b/freebsd/sys/netinet/udp_var.h @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)udp_var.h 8.1 (Berkeley) 6/10/93 + * $FreeBSD$ + */ + +#ifndef _NETINET_UDP_VAR_HH_ +#define _NETINET_UDP_VAR_HH_ + +/* + * UDP kernel structures and variables. + */ +struct udpiphdr { + struct ipovly ui_i; /* overlaid ip structure */ + struct udphdr ui_u; /* udp header */ +}; +#define ui_x1 ui_i.ih_x1 +#define ui_pr ui_i.ih_pr +#define ui_len ui_i.ih_len +#define ui_src ui_i.ih_src +#define ui_dst ui_i.ih_dst +#define ui_sport ui_u.uh_sport +#define ui_dport ui_u.uh_dport +#define ui_ulen ui_u.uh_ulen +#define ui_sum ui_u.uh_sum + +typedef void(*udp_tun_func_t)(struct mbuf *, int off, struct inpcb *); + +/* + * UDP control block; one per udp. + */ +struct udpcb { + udp_tun_func_t u_tun_func; /* UDP kernel tunneling callback. */ + u_int u_flags; /* Generic UDP flags. */ +}; + +#define intoudpcb(ip) ((struct udpcb *)(ip)->inp_ppcb) +#define sotoudpcb(so) (intoudpcb(sotoinpcb(so))) + + /* IPsec: ESP in UDP tunneling: */ +#define UF_ESPINUDP_NON_IKE 0x00000001 /* w/ non-IKE marker .. */ + /* .. per draft-ietf-ipsec-nat-t-ike-0[01], + * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */ +#define UF_ESPINUDP 0x00000002 /* w/ non-ESP marker. */ + +struct udpstat { + /* input statistics: */ + u_long udps_ipackets; /* total input packets */ + u_long udps_hdrops; /* packet shorter than header */ + u_long udps_badsum; /* checksum error */ + u_long udps_nosum; /* no checksum */ + u_long udps_badlen; /* data length larger than packet */ + u_long udps_noport; /* no socket on port */ + u_long udps_noportbcast; /* of above, arrived as broadcast */ + u_long udps_fullsock; /* not delivered, input socket full */ + u_long udpps_pcbcachemiss; /* input packets missing pcb cache */ + u_long udpps_pcbhashmiss; /* input packets not for hashed pcb */ + /* output statistics: */ + u_long udps_opackets; /* total output packets */ + u_long udps_fastout; /* output packets on fast path */ + /* of no socket on port, arrived as multicast */ + u_long udps_noportmcast; + u_long udps_filtermcast; /* blocked by multicast filter */ +}; + +#ifdef _KERNEL +/* + * In-kernel consumers can use these accessor macros directly to update + * stats. + */ +#define UDPSTAT_ADD(name, val) V_udpstat.name += (val) +#define UDPSTAT_INC(name) UDPSTAT_ADD(name, 1) + +/* + * Kernel module consumers must use this accessor macro. + */ +void kmod_udpstat_inc(int statnum); +#define KMOD_UDPSTAT_INC(name) \ + kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(u_long)) +#endif + +/* + * Names for UDP sysctl objects. 
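
The udpstat counters above are exported as a single opaque structure through the "stats" sysctl, so userland can fetch them all in one call. A sketch follows; pulling udp_var.h into userland needs a few prerequisite headers, which may vary between versions of the tree:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <netinet/ip_var.h>     /* struct ipovly for udpiphdr */
    #include <netinet/udp.h>
    #include <netinet/udp_var.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct udpstat st;
        size_t len = sizeof(st);

        if (sysctlbyname("net.inet.udp.stats", &st, &len, NULL, 0) < 0)
            return (1);
        printf("in %lu, bad csum %lu, no port %lu, out %lu\n",
            st.udps_ipackets, st.udps_badsum, st.udps_noport,
            st.udps_opackets);
        return (0);
    }
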
+ */ +#define UDPCTL_CHECKSUM 1 /* checksum UDP packets */ +#define UDPCTL_STATS 2 /* statistics (read-only) */ +#define UDPCTL_MAXDGRAM 3 /* max datagram size */ +#define UDPCTL_RECVSPACE 4 /* default receive buffer space */ +#define UDPCTL_PCBLIST 5 /* list of PCBs for UDP sockets */ +#define UDPCTL_MAXID 6 + +#define UDPCTL_NAMES { \ + { 0, 0 }, \ + { "checksum", CTLTYPE_INT }, \ + { "stats", CTLTYPE_STRUCT }, \ + { "maxdgram", CTLTYPE_INT }, \ + { "recvspace", CTLTYPE_INT }, \ + { "pcblist", CTLTYPE_STRUCT }, \ +} + +#ifdef _KERNEL +SYSCTL_DECL(_net_inet_udp); + +extern struct pr_usrreqs udp_usrreqs; +VNET_DECLARE(struct inpcbhead, udb); +VNET_DECLARE(struct inpcbinfo, udbinfo); +#define V_udb VNET(udb) +#define V_udbinfo VNET(udbinfo) + +extern u_long udp_sendspace; +extern u_long udp_recvspace; +VNET_DECLARE(struct udpstat, udpstat); +VNET_DECLARE(int, udp_blackhole); +#define V_udpstat VNET(udpstat) +#define V_udp_blackhole VNET(udp_blackhole) +extern int udp_log_in_vain; + +int udp_newudpcb(struct inpcb *); +void udp_discardcb(struct udpcb *); + +void udp_ctlinput(int, struct sockaddr *, void *); +int udp_ctloutput(struct socket *, struct sockopt *); +void udp_init(void); +#ifdef VIMAGE +void udp_destroy(void); +#endif +void udp_input(struct mbuf *, int); +struct inpcb *udp_notify(struct inpcb *inp, int errno); +int udp_shutdown(struct socket *so); + +int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f); +#endif + +#endif -- cgit v1.2.3
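
Finally, the kernel-tunneling hook declared above (udp_tun_func_t and udp_set_kernel_tunneling()) lets an in-kernel consumer intercept every datagram that reaches a UDP socket before normal socket delivery. A skeletal consumer, with invented names, might look like this:

    /* Sketch of an in-kernel consumer.  Once installed, the callback
     * replaces udp_append() for datagrams reaching the socket; it is
     * handed the offset of the UDP header and owns the mbuf. */
    #include <sys/param.h>
    #include <sys/mbuf.h>
    #include <sys/socket.h>
    #include <sys/socketvar.h>
    #include <netinet/in.h>
    #include <netinet/in_pcb.h>
    #include <netinet/ip_var.h>
    #include <netinet/udp.h>
    #include <netinet/udp_var.h>

    static void
    my_tun_input(struct mbuf *m, int off, struct inpcb *inp)
    {

        /* Payload starts at off + sizeof(struct udphdr); consume it. */
        m_freem(m);
    }

    static int
    my_tun_attach(struct socket *so)
    {

        /* Fails with EBUSY if a tunnel function is already set. */
        return (udp_set_kernel_tunneling(so, my_tun_input));
    }
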